In [1]:
import glob
import gzip
from bs4 import BeautifulSoup
import lxml
from collections import defaultdict
from tqdm import tqdm
import requests
import random
import sys
import pathlib
import csv
import pandas as pd
import json
import argparse

# import multiprocessing
from fuzzywuzzy import fuzz
from statistics import mean
import numpy as np
import itertools
import re
import io
# set the system path
sys.path.insert(1, '/nfs/gns/literature/machine-learning/Santosh/Gitlab/biobertepmc/')

# BioBERT NER models
import logging
import torch
from torch.utils.data import DataLoader
import pickle
from biobert.model.bert_crf_model import BertCRF
from biobert.data_loader.epmc_loader import NERDatasetBatch
from biobert.utils.utils import my_collate

from collections import namedtuple


# Relations and associations model
import en_ner_europepmc_md
import en_relationv01

import unicodedata
# import datetime

from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()

In [2]:
# An extracted entity: (start, end) character span, tag, surface text, and the
# text immediately before (`pre`) and after (`post`) it.
Entity = namedtuple(
    'Entity',
    ('span', 'tag', 'text', 'pre', 'post'),
)
# One labelled token inside an entity; note the generated class is named 'Label'.
Entity_Label = namedtuple(
    'Label',
    ('index', 'pos', 'tag', 'span'),
)
# Lower-cased names that are always accepted / tagged as DS further down.
missing_list = [
    'covid-19',
    'coronavirus disease 2019',
    '2019-ncov',
    'covid 19',
]


In [3]:
# Define all functions here

# NOTE(review): `batch_size` is not read anywhere in the visible code;
# MLModel.post uses its own local BATCH_SIZE = 16. Confirm whether this is still needed.
batch_size = 8
class MLModel:
    """BERT-CRF tagger wrapper: loads the weights once and tags sentences on request.

    Relies on the module-level names `MODEL_PATH`, `device` and `params`.
    """

    def __init__(self):
        # Build the network, then load the saved state dict and move BERT to `device`.
        self.bertCrf_model = load_model()

        weights_file = MODEL_PATH + 'bert_crf_model.states'
        saved_state = torch.load(weights_file, map_location=device)
        self.bertCrf_model.load_state_dict(saved_state)
        self.bertCrf_model.bert_model.bert_model.to(device)

    def post(self, sentences):
        """Tag `sentences` and return ``{'annotations': [...]}``.

        The list holds one entry per predicted sentence; each entry is a list of
        ``[start, end, tag, text]`` items built from `extract_entity`.
        """
        BATCH_SIZE = 16
        annotations = []

        with torch.no_grad():
            processor, _tokens, spans = load_data_processor(sentences)
            loader = DataLoader(dataset=processor,
                                batch_size=BATCH_SIZE,
                                collate_fn=my_collate,
                                num_workers=2)

            idx2label = params['idx2label']
            self.bertCrf_model.eval()

            for batch_no, batch in enumerate(loader):
                (bert_inputs, bert_attention_mask, bert_token_mask,
                 wordpiece_alignment, split_alignments, lengths,
                 token_mask) = processor.tokens_totensor(batch['input'])

                _, preds = self.bertCrf_model.predict(
                    input_ids=bert_inputs.to(device),
                    bert_attention_mask=bert_attention_mask.to(device),
                    bert_token_mask=bert_token_mask,
                    alignment=wordpiece_alignment,
                    splits=(split_alignments, lengths),
                    token_mask=token_mask,
                )

                if not idx2label:
                    continue

                for i, (path, _score) in enumerate(preds):
                    labels = [idx2label[p] for p in path]
                    # Position of this prediction within the full sentence list.
                    row = batch_no * BATCH_SIZE + i
                    found = extract_entity(labels, spans[row], sentences[row])
                    annotations.append(
                        [[e.span[0], e.span[1], e.tag, e.text] for e in found]
                    )

        return {'annotations': annotations}


def load_data_processor(inputs):
    """Tokenise every line of `inputs` and wrap the tokens in an NERDatasetBatch.

    :return: ``(processor, tokens, token_spans)`` where `tokens[i]` and
             `token_spans[i]` are the token strings and their (start, end)
             character offsets for `inputs[i]`.
    """
    token_spans, tokens = [], []
    for line in inputs:
        line_spans = list(tokenizer.span_tokenize(line))
        token_spans.append(line_spans)
        tokens.append([line[begin:finish] for begin, finish in line_spans])

    processor = NERDatasetBatch.from_params(params=params, inputs=tokens)
    return processor, tokens, token_spans


def load_model():
    """Build a BertCRF from the module-level `params` (no weights are loaded here)."""
    return BertCRF(
        num_tags=params['num_tags'],
        model_name=params['model_name'],
        stride=params['stride'],
        include_start_end_transitions=True,
        constraints=None,  # no transition constraints are passed
    )


def extract_entity(preds, spans, text, length=20):
    """
    Extract entities from a sequence of token labels.

    Labels are either ``'O'`` or ``'<pos>-<tag>'`` (e.g. ``'B-GP'``). Only the
    first hyphen separates position from tag, so tag names may themselves
    contain hyphens. A ``B`` or ``O`` label closes the entity being built; the
    entity's tag is taken from its first token.

    :param preds: labels of one sentence, one per token
    :type preds: List[str]
    :param spans: (start, end) character offsets of each token in `text`
    :type spans: List[Tuple[int, int]]
    :param text: the sentence the offsets refer to
    :type text: str
    :param length: number of context characters kept in `pre` / `post`
    :type length: int
    :return: A list of entity object
    :rtype: List[Entity]
    """

    def _build(labels):
        # Span runs from the first token's start to the last token's end.
        start_span = labels[0].span[0]
        end_span = labels[-1].span[1]
        return Entity(span=(start_span, end_span),
                      tag=labels[0].tag,
                      text=text[start_span:end_span],
                      pre=text[max(0, start_span - length):start_span],
                      post=text[end_span:end_span + length])

    entities = []
    current = []

    for i, token in enumerate(preds):
        if token == 'O':
            pos, label = 'O', None
        else:
            # maxsplit=1 keeps hyphenated tag names intact.
            pos, tag = token.split('-', 1)
            label = Entity_Label(index=i, pos=pos, tag=tag, span=spans[i])

        if pos in ('B', 'O') and current:
            entities.append(_build(current))
            current = []
        if pos in ('B', 'I'):
            current.append(label)

    # Flush an entity that runs up to the end of the sentence.
    if current:
        entities.append(_build(current))
    return entities


# Create a function called "chunks" with two arguments, l and n:
def chunks(l, n):
    """Yield consecutive slices of `l`, each containing at most `n` items."""
    yield from (l[start:start + n] for start in range(0, len(l), n))


def clean_Nones(ner_tags_):
    """Normalise CD tags, sort by the length of field 3 (longest first), drop one 'None'.

    Items whose field 2 is 'CD' get fields 2 and 3 exchanged (the tag and the
    entity text arrive in swapped positions for CD). A literal 'None' item is
    removed only when other items are present; a lone 'None' is returned as is.
    """
    normalised = [
        [tag[0], tag[1], tag[3], tag[2]] if tag[2] == 'CD' else tag
        for tag in ner_tags_
    ]
    normalised.sort(key=lambda tag: len(tag[3]), reverse=True)

    if len(normalised) > 1 and 'None' in normalised:
        normalised.remove('None')
    return normalised


In [4]:
# This function will compare ml tags and ztags. The agreed tags are then returned back
def compare_ml_annotations_with_dictionary_tagged(ml_tags_, z_tags_, missing_list_):
    """Return the dictionary (z) tags that are confirmed.

    A z tag is kept when its lower-cased text is in `missing_list_`, or when
    its fuzzy partial-ratio score against at least one ML tag is above 80.

    The missing-list check is done before looking at the ML tags, so
    such tags are kept even when `ml_tags_` is empty.

    :param ml_tags_: entity texts produced by the ML tagger
    :param z_tags_: entity texts produced by the dictionary tagger
    :param missing_list_: lower-cased names that are always accepted
    :return: set of agreed z tags
    """
    agreed_z_tags = set()
    for each_z_tag in z_tags_:
        if each_z_tag.lower() in missing_list_:
            agreed_z_tags.add(each_z_tag)
            continue
        # any() stops at the first ML annotation that is similar enough.
        if any(fuzz.partial_ratio(each_ml_annotation, each_z_tag) > 80
               for each_ml_annotation in ml_tags_):
            agreed_z_tags.add(each_z_tag)
    return agreed_z_tags

In [5]:
# Evidence score per section. assign_scores_to_sections tries the keys in this
# order as substrings of the lower-cased section title; 'other' is the fallback.
fulltext_scores = {
    'title': 10,
    'intro': 1,
    'result': 5,
    'discus': 2,
    'conclus': 2,
    'case': 1,
    'fig': 5,
    'table': 5,
    'appendix': 1,
    'other': 1,
}
# fulltext_scores


def assign_scores_to_sections(fulltext_scores_, section_tagged_):
    """Return the score of the first key found inside the lower-cased section title.

    Keys are tried in the dictionary's own order; when none occurs in the
    title, the score stored under 'other' is returned.
    """
    hits = [
        score
        for fragment, score in fulltext_scores_.items()
        if fragment in section_tagged_.lower()
    ]
    if hits:
        return hits[0]
    return fulltext_scores_['other']

In [6]:
# read xmls files
def getfileblocks(file_path):
    """Split a chunk file into blocks, one per line starting with '<!DOCTYPE'.

    Each such line opens a new block and all following lines are appended to
    it. Any lines that come before the first '<!DOCTYPE' line (for example a
    preamble, or a first line carrying a byte-order mark) are kept as a block
    of their own instead of raising IndexError.

    :param file_path: path of a UTF-8 encoded file
    :return: list of block strings; empty for an empty file
    """
    subFileBlocks = []

    with io.open(file_path, 'r', encoding='utf8') as fh:
        for line in fh:
            if line.startswith('<!DOCTYPE') or not subFileBlocks:
                subFileBlocks.append([line])
            else:
                subFileBlocks[-1].append(line)

    # Join once per block rather than growing a string line by line.
    return [''.join(parts) for parts in subFileBlocks]


In [7]:
# this function will generate the tag spans given the missing spans of entities
def get_new_missing_tags(each_sentence, missing_list_, tag_type):
    """Find every literal occurrence of the given strings in a sentence.

    Each string is escaped before searching, so characters such as
    '(', '+', '.' are matched as plain text rather than interpreted as regular
    expression syntax. Empty strings are skipped, since they would match at
    every position. Matching is case-sensitive.

    :param each_sentence: sentence to search
    :param missing_list_: iterable of strings to look for
    :param tag_type: tag stored with every hit (e.g. 'DS')
    :return: list of ``[start, end, tag_type, matched_text]``
    """
    new_entities = []
    for missing_string in missing_list_:
        if not missing_string:
            continue
        for hit in re.finditer(re.escape(missing_string), each_sentence):
            new_entities.append([hit.start(), hit.end(), tag_type, hit.group()])
    return new_entities

In [8]:
# this will get matches
def get_sentences_matches_tags(sentences_tags, abs_full):
    """Turn every non-OG tag of every sentence into a 'match' dictionary.

    Each dictionary carries label, type and the in-sentence offsets; when the
    sentence occurs inside `abs_full`, the offsets of its first occurrence
    there are added as sectionStart / sectionEnd.

    :return: defaultdict mapping sentence -> list of match dictionaries
    """
    matches = defaultdict(list)
    for sentence, ml_tags in sentences_tags.items():
        # -1 means the sentence is not part of abs_full.
        offset = abs_full.find(sentence)
        for tag in ml_tags:
            if tag[2] == 'OG':
                continue
            entry = {
                'label': tag[3],
                'type': tag[2],
                'startInSentence': tag[0],
                'endInSentence': tag[1],
            }
            if offset != -1:
                entry['sectionStart'] = offset
                entry['sectionEnd'] = offset + len(sentence)
            matches[sentence].append(entry)

    return matches

# map annotations in sets of pairs
def get_mapped_list_from_annotations(annotation_list):
    """Return all 2-combinations of annotations whose tags differ and are not 'OG'.

    Pairs keep the order in which the annotations appear in `annotation_list`.
    """
    return [
        (first, second)
        for first, second in itertools.combinations(annotation_list, 2)
        if first[2] != second[2] and 'OG' not in (first[2], second[2])
    ]

# get only those sentences with relevant pairs
def get_sentences_offset_per_cooccurance(sentences_tags):
    """Group sentences by which pairs of tag types they contain.

    For every sentence with more than one tag, the tag types are read from
    field 2 of each tag only. Entity text that happens to equal 'GP', 'DS' or
    'CD' is therefore not mistaken for a tag type, and no numpy conversion of
    the mixed int/str rows is needed.

    :param sentences_tags: mapping sentence -> list of ``[start, end, tag, text]``
    :return: ``(dict_gp_ds, dict_gp_cd, dict_ds_cd)``; each maps a sentence to
             its own list of annotation pairs from
             get_mapped_list_from_annotations
    """
    dict_gp_ds = defaultdict(list)
    dict_gp_cd = defaultdict(list)
    dict_ds_cd = defaultdict(list)
    wanted = (
        ('GP', 'DS', dict_gp_ds),
        ('GP', 'CD', dict_gp_cd),
        ('DS', 'CD', dict_ds_cd),
    )

    for sentence, tags in sentences_tags.items():
        if len(tags) < 2:  # a pair needs at least two tags
            continue
        tag_types = {tag[2] for tag in tags}
        for type_a, type_b, bucket in wanted:
            if type_a in tag_types and type_b in tag_types:
                # A separate list per bucket: callers may modify these lists in place.
                bucket[sentence] = get_mapped_list_from_annotations(tags)

    return dict_gp_ds, dict_gp_cd, dict_ds_cd


# if not in the right position if the pair and swap them such that always GP is followed by either CD or DS and DS is followed by CD
def swap_positions(cooccurance_list, pos1, pos2):
    """Exchange the items at `pos1` and `pos2` in place and return the same list."""
    held = cooccurance_list[pos1]
    cooccurance_list[pos1] = cooccurance_list[pos2]
    cooccurance_list[pos2] = held
    return cooccurance_list
    


In [9]:
# this is for getting relationship text
def get_relations(gp_ds_text_sentence):
    """Run `relation_model2` on the sentence and describe every entity that is not GP or DS.

    :return: list of dictionaries with the keys startr, endr, labelr, typer
    """
    parsed = relation_model2(gp_ds_text_sentence)
    return [
        {
            'startr': ent.start_char,
            'endr': ent.end_char,
            'labelr': ent.text,
            'typer': ent.label_,
        }
        for ent in parsed.ents
        if ent.label_ not in ('GP', 'DS')
    ]

# roundoff the association model scores
def roundoff(dict_y):
    """Round every value of `dict_y` to two decimals in place and return the same dict."""
    for key in dict_y:
        dict_y[key] = round(dict_y[key], 2)
    return dict_y

In [41]:
# get the occurances
def get_cooccurance_evidence(average_evidence_scores, dict_tags, tag_type_1, tag_type_2):
    """Build the co-occurrence records for one pair of tag types.

    Every annotation pair of a sentence is considered in either orientation:
    a (tag_type_2, tag_type_1) pair is turned around so that the record
    always lists the tag_type_1 entity first. Pairs of any other type are
    ignored. `dict_tags` is not modified, and a sentence with an empty pair
    list simply produces no records.

    For GP-DS the association scores (`relation_model1(...).cats`, rounded)
    and the relations (`get_relations`) depend only on the sentence, so they
    are computed once per sentence and attached to each of its records.

    :param average_evidence_scores: mapping sentence -> score; a missing or
                                    falsy score is recorded as 1
    :param dict_tags: mapping sentence -> list of annotation pairs
    :param tag_type_1: tag expected first in the record (e.g. 'GP')
    :param tag_type_2: tag expected second in the record (e.g. 'DS')
    :return: defaultdict mapping sentence -> list of record dictionaries
    """
    co_occurance_sentences = defaultdict(list)
    wants_relations = tag_type_1 == 'GP' and tag_type_2 == 'DS'

    for each_sent_map, mappedtags in dict_tags.items():
        association = None
        rels = None

        for eachtag in mappedtags:
            first, second = eachtag[0], eachtag[1]
            # always see that GP-DS, GP-CD and DS-CD order is followed
            if first[2] == tag_type_2 and second[2] == tag_type_1:
                first, second = second, first
            if first[2] != tag_type_1 or second[2] != tag_type_2:
                continue

            mini_dict = {
                'start1': first[0],
                'end1': first[1],
                'label1': first[3],
                'start2': second[0],
                'end2': second[1],
                'label2': second[3],
                'type': tag_type_1 + '-' + tag_type_2,
            }

            # .get avoids inserting a key when the scores are a defaultdict.
            score = average_evidence_scores.get(each_sent_map)
            mini_dict['evidence_score'] = score if score else 1

            if wants_relations:
                if association is None:
                    # get associations scores and relations, once per sentence
                    association = roundoff(relation_model1(each_sent_map).cats)
                    rels = get_relations(each_sent_map)
                mini_dict['association'] = dict(association)
                if rels:
                    mini_dict['relation'] = list(rels)

            co_occurance_sentences[each_sent_map].append(mini_dict)
    return co_occurance_sentences

In [11]:
# generate dictionary for matches and co-occurances, section and other scores
def generate_interested_sentences_in_json_format(final_sentences, section_tags, match_gp_ds_cd, co_occurance_gp_ds,co_occurance_gp_cd,co_occurance_ds_cd):
    """Assemble the per-sentence records of the output JSON.

    A sentence is included only when it has at least one match or one
    co-occurrence. Lookups use ``.get`` so plain dicts are accepted and
    defaultdict inputs do not gain empty entries as a side effect. When a
    sentence carries several section names, the alphabetically first one is
    reported so the output does not depend on set iteration order.

    :param final_sentences: mapping whose keys are the sentences to report
    :param section_tags: mapping sentence -> set of section names
    :param match_gp_ds_cd: mapping sentence -> list of match dictionaries
    :param co_occurance_gp_ds: mapping sentence -> list of GP-DS records
    :param co_occurance_gp_cd: mapping sentence -> list of GP-CD records
    :param co_occurance_ds_cd: mapping sentence -> list of DS-CD records
    :return: list of dictionaries with the keys text, section and, when
             present, matches and co-occurrence
    """
    interested_sentences = []
    for each_sentence in final_sentences:
        minidict = {'text': each_sentence}

        sections = section_tags.get(each_sentence)
        minidict['section'] = min(sections) if sections else 'Other'

        all_matches = match_gp_ds_cd.get(each_sentence, [])
        if all_matches:
            minidict['matches'] = all_matches

        all_co_occurances = (list(co_occurance_gp_ds.get(each_sentence, []))
                             + list(co_occurance_gp_cd.get(each_sentence, []))
                             + list(co_occurance_ds_cd.get(each_sentence, [])))
        if all_co_occurances:
            minidict['co-occurrence'] = all_co_occurances

        if all_co_occurances or all_matches:
            interested_sentences.append(minidict)

    return interested_sentences

In [12]:
def get_ml_tags(all_sentences, missing_list_):
    """Tag the sentences with the ML model and correct known mislabels.

    The model labels the names in `missing_list_` (e.g. COVID-19) as GP; until
    it is retrained they are relabelled as DS here. Every entity of every
    sentence is checked, not just the first entity of each sentence.

    :param all_sentences: list of sentence strings
    :param missing_list_: lower-cased names to relabel from GP to DS
    :return: one list of ``[start, end, tag, text]`` entities per sentence
             (an empty list for sentences without entities)
    """
    ML_annotations = ml_model.post(all_sentences)

    final_annotations = []
    for each_annotation in ML_annotations['annotations']:
        for entity in each_annotation:
            if entity[2] == 'GP' and entity[3].lower() in missing_list_:
                entity[2] = 'DS'
        # NOTE(review): CD entities whose text is 'and' are passed through unchanged,
        # as before; confirm whether they were meant to be dropped here.
        final_annotations.append(each_annotation)

    return final_annotations

In [13]:
def _section_title_of(plain_tag):
    """Return the stripped title text of the enclosing 'sec' element, or '' if there is none."""
    section = plain_tag.findParent('sec')
    title = section.title if section is not None else None
    if title is None:
        return ''
    return title.text.strip()


def extract_sentence_level_details(soup):
    """Collect sentences, their sections, evidence scores and dictionary tags.

    For every 'sent' element that has a 'plain' child the NFKD-normalised,
    stripped text is recorded, together with:

    * section 'title' and score 10 when the sentence sits inside 'article-title';
    * section 'Abstract' when it is one of the abstract's 'plain' elements,
      otherwise the title of the enclosing 'sec';
    * for sections whose name contains 'abstract': score 2 for the first two
      such sentences, 5 for the one whose running count equals the number of
      abstract sentences, 3 otherwise;
    * for other sections the score from assign_scores_to_sections;
    * score 1 and no section when no section title can be found.

    A document without an abstract uses an empty sentence list for the
    abstract, so section titles are still looked up for its sentences.

    :param soup: parsed document (BeautifulSoup-like object)
    :return: ``(section_tags, average_evidence_scores, plain_sentences,
             uniprot_set, efo_set, abs_full)``
    """
    plain_sentences_ = []
    section_tags_ = defaultdict(set)
    evidence_scores_ = defaultdict(list)
    average_evidence_scores__ = defaultdict(list)

    # get all the sentences
    all_sentences = soup.find_all('sent')
    # dictionary (z) tags for proteins and diseases
    uniprot_set_ = {each_tag.text for each_tag in soup.find_all('z:uniprot')}
    efo_set_ = {each_tag.text for each_tag in soup.find_all('z:efo')}

    # get abstract details if found
    abstract = soup.find('abstract')
    if abstract is None:
        abs_full = ''
        abs_sentences = []
    else:
        abs_full = abstract.text
        abs_sentences = abstract.find_all('plain')
    total_abstract_length = len(abs_sentences)

    # Running count of sentences that fall in an 'abstract' section.
    line_count = 0

    for each_sentence in all_sentences:
        extracted_sentence = each_sentence.plain
        if not extracted_sentence:
            continue

        clean_text = unicodedata.normalize("NFKD", extracted_sentence.text).strip()
        plain_sentences_.append(clean_text)

        if extracted_sentence.findParent('article-title'):
            section_tags_[clean_text].add('title')
            evidence_scores_[clean_text].append(10)
            continue

        if extracted_sentence in abs_sentences:
            section_tagged = 'Abstract'
        else:
            section_tagged = _section_title_of(extracted_sentence)

        if not section_tagged:
            evidence_scores_[clean_text].append(1)
            continue

        section_tags_[clean_text].add(section_tagged)
        if 'abstract' in section_tagged.lower():
            line_count = line_count + 1
            if line_count in (1, 2):
                evidence_scores_[clean_text].append(2)
            elif line_count == total_abstract_length:
                evidence_scores_[clean_text].append(5)
            else:
                evidence_scores_[clean_text].append(3)
        else:
            evidence_scores_[clean_text].append(
                assign_scores_to_sections(fulltext_scores, section_tagged))

    # A sentence text seen several times gets the mean of its scores.
    for each_sentence, scores in evidence_scores_.items():
        average_evidence_scores__[each_sentence] = mean(scores)

    return section_tags_, average_evidence_scores__, plain_sentences_, uniprot_set_, efo_set_,abs_full

In [14]:
# load the model

In [15]:
# NOTE(review): absolute path on a shared file system; the notebook only runs where it is mounted.
MODEL_PATH = '/nfs/gns/literature/machine-learning/Santosh/Gitlab/biobertepmc/reproduce_GP_DS_OG_CD/1604049631/'

# path to the file that has model parameters
params_path = MODEL_PATH + "params.pickle"
# NOTE: pickle.load can execute arbitrary code from the file; only load params files from a trusted source.
with open(params_path, 'rb') as f:
    params = pickle.load(f)
# Override the two token-length settings from the pickle (presumably -1 means "no limit"; TODO confirm).
params['max_ner_token_len'] = -1
params['max_bert_token_len'] = -1

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# MLModel.__init__ reads MODEL_PATH, params and device, so it must be created after them.
ml_model = MLModel()

In [16]:
# load association and relation models
# relation_model1: its `.cats` scores are used in get_cooccurance_evidence.
# relation_model2: its `.ents` are read by get_relations.
relation_model1 = en_relationv01.load()
relation_model2 = en_ner_europepmc_md.load()

In [78]:
def get_publication_date(soup):
    """Return the publication date as 'year-month-day'.

    The parts are read from the 'pub-date' element whose pub-type is 'epub'.
    When the document has no such element, the first 'year', 'month' and
    'day' elements found anywhere in the document are used instead. A part
    that cannot be found is left empty, so the result may look like
    '2010-7-' or '--'.

    :param soup: parsed document (BeautifulSoup-like object)
    :return: date string with the three parts joined by '-'
    """

    def _part(node, name):
        # find() returns None (it does not raise) when the element is absent.
        found = node.find(name)
        return found.text if found is not None else ''

    source = soup.find('pub-date', {'pub-type':"epub"})
    if source is None:
        source = soup

    year = _part(source, 'year')
    month = _part(source, 'month')
    day = _part(source, 'day')

    pub_date = year+'-'+month+'-'+day

    return pub_date

In [79]:
def generate_final_json(soup,section_tags,og_set,tagged_sentences, match_gp_ds_cd, co_occurance_gp_ds, co_occurance_gp_cd, co_occurance_ds_cd):
    """Build the output record of one document.

    'pmid' holds 'PMC' + the pmcid text when a pmcid element is found,
    otherwise the pmid text, otherwise ''. The remaining keys are pubDate,
    organisms (list of `og_set`) and sentences.
    """
    identifier = ''
    for id_type, prefix in (("pmcid", 'PMC'), ("pmid", '')):
        try:
            identifier = prefix + soup.find(attrs={"pub-id-type" : id_type}).text
            break
        except:
            # element missing: try the next identifier type
            continue

    return {
        'pmid': identifier,
        'pubDate': get_publication_date(soup),
        'organisms': list(og_set),
        'sentences': generate_interested_sentences_in_json_format(
            tagged_sentences, section_tags, match_gp_ds_cd,
            co_occurance_gp_ds, co_occurance_gp_cd, co_occurance_ds_cd),
    }

In [18]:
def get_only_ml_tagged_sentences(sentences,ml_annots, missing_list_):
    """Combine literal missing-list hits (tagged DS) with the ML annotations per sentence.

    `ml_annots[i]` must hold the annotations of `sentences[i]`.

    :return: ``(ml_tagged_sentences, gp_set, cd_set, og_set)``: the mapping
             sentence -> tags for sentences with at least one tag, and the
             entity texts seen for GP, CD and OG (DS texts are not returned)
    """
    texts_by_tag = {'GP': set(), 'DS': set(), 'CD': set(), 'OG': set()}
    ml_tagged_sentences = {}

    for position, each_sentence in enumerate(sentences):
        new_entities = get_new_missing_tags(each_sentence, missing_list_, tag_type='DS')
        all_tags = new_entities + ml_annots[position]
        if not all_tags:
            continue

        ml_tagged_sentences[each_sentence] = all_tags
        for each_ml_tag in all_tags:
            bucket = texts_by_tag.get(each_ml_tag[2])
            if bucket is not None:
                bucket.add(each_ml_tag[3])

    return ml_tagged_sentences, texts_by_tag['GP'], texts_by_tag['CD'], texts_by_tag['OG']

In [19]:
def get_only_ztag_sentences(sentences,uniprot_fp_removed_set,z_efo_set,cd_set):
    """Tag sentences by literal lookup of the given GP, DS and CD names.

    CD names equal to 'and' are ignored; parentheses are removed from the
    remaining CD names and the result is stripped before searching.

    :return: mapping sentence -> list of tags, for sentences with at least one tag
    """
    new_cd_set = {
        each_cd_tag.replace(')','').replace('(','').strip()
        for each_cd_tag in cd_set
        if each_cd_tag != 'and'
    }

    ztag_sentences = {}
    for each_sentence in sentences:
        all_tags = get_new_missing_tags(each_sentence, uniprot_fp_removed_set, tag_type='GP')
        all_tags = all_tags + get_new_missing_tags(each_sentence, z_efo_set, tag_type='DS')
        try:
            all_tags = all_tags + get_new_missing_tags(each_sentence, new_cd_set, tag_type='CD')
        except:
            # NOTE(review): any failure of the CD lookup is silently treated as "no CD tags".
            pass

        if all_tags:
            ztag_sentences[each_sentence] = all_tags

    return ztag_sentences

In [83]:
# Input chunk file; getfileblocks splits it into one block per '<!DOCTYPE' line.
# NOTE(review): absolute paths on a shared file system; consider moving them to one config cell.
data_folder_path = '/nfs/production/literature/Santosh_Tirunagari/20.09_FT_Chunks/'
data_file_path = data_folder_path+ 'Annot_PMC2932713_PMC2959506_split_19.xml' #'Annot_PMC1851099_PMC1994013_split_19.xml', Annot_PMC6432232_PMC6447240_split_36.xml'#'Annot_PMC2111990_PMC2131188_split_99.xml'#
files_list = getfileblocks(data_file_path)

# Output folders: the 'NMP_' (ML tags) and 'NDP_' (dictionary tags) jsonl files are written here.
ml_result_path = '/nfs/production/literature/Santosh_Tirunagari/NMP_test/'
ztag_result_path ='/nfs/production/literature/Santosh_Tirunagari/NDP_test/' 

In [75]:
data_file_path

'/nfs/production/literature/Santosh_Tirunagari/20.09_FT_Chunks/Annot_PMC2932713_PMC2959506_split_19.xml'

In [84]:
# Process a single block of the chunk file (index 2 is hard-coded; presumably chosen for testing, confirm).
each_file =files_list[2]


In [85]:
# Parse one document block and pull out sentences, sections, scores and dictionary tags.
xml_soup = BeautifulSoup(each_file, 'lxml')
section_tag_sents, average_evidence_scores_sents, plain_sentences, uniprot_set, efo_set, absfull = extract_sentence_level_details(
    xml_soup)

# ML route: tag every sentence with the model, then add literal hits from missing_list.
ml_annotations = get_ml_tags(plain_sentences,missing_list)
mltag_sentences, ml_gp_set, ml_cd_set, ml_og_set = get_only_ml_tagged_sentences(plain_sentences, ml_annotations,
                                                                                missing_list)

# Dictionary route: keep only uniprot tags confirmed by the ML GP tags, then tag sentences by lookup.
uniprot_nofp_set = compare_ml_annotations_with_dictionary_tagged(ml_gp_set, uniprot_set, missing_list)
ztag_sentences = get_only_ztag_sentences(plain_sentences, uniprot_nofp_set, efo_set, ml_cd_set)

# Annotation pairs per sentence, grouped by tag-type combination.
ml_gp_ds, ml_gp_cd, ml_ds_cd = get_sentences_offset_per_cooccurance(mltag_sentences)

ztag_gp_ds, ztag_gp_cd, ztag_ds_cd = get_sentences_offset_per_cooccurance(ztag_sentences)

# Co-occurrence records for each combination (GP-DS also gets association scores and relations).
ml_co_occurance_gp_ds = get_cooccurance_evidence(average_evidence_scores_sents, ml_gp_ds, tag_type_1='GP', tag_type_2='DS')
ml_co_occurance_gp_cd = get_cooccurance_evidence(average_evidence_scores_sents, ml_gp_cd, tag_type_1='GP', tag_type_2='CD')
ml_co_occurance_ds_cd = get_cooccurance_evidence(average_evidence_scores_sents, ml_ds_cd, tag_type_1='DS', tag_type_2='CD')

ztag_co_occurance_gp_ds = get_cooccurance_evidence(average_evidence_scores_sents, ztag_gp_ds, tag_type_1='GP', tag_type_2='DS')
ztag_co_occurance_gp_cd = get_cooccurance_evidence(average_evidence_scores_sents, ztag_gp_cd, tag_type_1='GP', tag_type_2='CD')
ztag_co_occurance_ds_cd = get_cooccurance_evidence(average_evidence_scores_sents, ztag_ds_cd, tag_type_1='DS', tag_type_2='CD')

# Single-entity matches per sentence (OG tags are excluded inside the function).
ml_match_gp_ds_cd = get_sentences_matches_tags(mltag_sentences, absfull)
ztag_match_gp_ds_cd = get_sentences_matches_tags(ztag_sentences, absfull)

# One output record per route.
# NOTE(review): the dictionary-route record also reports ml_og_set as its organisms; confirm this is intended.
ml_json = generate_final_json(xml_soup, section_tag_sents, ml_og_set, mltag_sentences, ml_match_gp_ds_cd,
                              ml_co_occurance_gp_ds, ml_co_occurance_gp_cd, ml_co_occurance_ds_cd)
ztag_json = generate_final_json(xml_soup, section_tag_sents, ml_og_set, ztag_sentences, ztag_match_gp_ds_cd,
                                ztag_co_occurance_gp_ds, ztag_co_occurance_gp_cd, ztag_co_occurance_ds_cd)

# save ml json
# The file name is the input name with its last three characters replaced by 'jsonl';
# mode 'at' appends, so re-running this cell adds another line to the same file.
with open(ml_result_path + 'NMP_' + data_file_path.split('/')[-1][:-3] + 'jsonl', 'at',
          encoding='utf8') as json_file:
    json.dump(ml_json, json_file, ensure_ascii=False)
    json_file.write('\n')

# save ztag json
with open(ztag_result_path + 'NDP_' + data_file_path.split('/')[-1][:-3] + 'jsonl', 'at',
          encoding='utf8') as json_file:
    json.dump(ztag_json, json_file, ensure_ascii=False)
    json_file.write('\n')

In [52]:
# NOTE(review): parses the same `each_file` block again with the same parser; looks like a leftover cell.
xml_soup = BeautifulSoup(each_file, 'lxml')

In [None]:
# Command-line entry point: expects the path of one file via -f/--file.
# NOTE(review): process_each_file_in_job is not defined anywhere in the visible notebook.
# parse_args() reads the process arguments, so this cell is presumably meant for a
# script export rather than for running inside the notebook; confirm.
parser = argparse.ArgumentParser(description='This script will process patch files to extract GP DS CDs in job folders on OTAR FullTextLoadings')
parser.add_argument("-f", "--file", nargs=1, required=True, help="OTAR New Pipeline GP DS CD extractor to Jsonl format", metavar="PATH")
args = parser.parse_args()

process_each_file_in_job(args.file[0])