# TRY NER

In [1]:
from __future__ import division
import spacy, re , json
from spacy.matcher import Matcher
from spacy.tokens import Span
from collections import defaultdict
from glob import glob

class MacherBase(object):
    """Pipeline component that converts token-pattern matches into entities.

    A ``Matcher`` is built from ``patterns`` under ``label``.  Calling the
    component on a ``doc`` runs the matcher, discards matches that are
    superseded by a later match reaching further right, and writes the
    remaining matches into ``doc.ents`` (replacing any existing entity they
    overlap).
    """

    def __init__(self, nlp, patterns, label):
        """Create the matcher on ``nlp.vocab`` and register ``patterns``."""
        self.matcher = Matcher(nlp.vocab)
        self.matcher.add(label, None, *patterns)

    def __call__(self, doc):
        """Annotate ``doc`` with the matched spans and return it."""
        found = self.matcher(doc)
        bounds = [(begin, stop) for _, begin, stop in found]
        total = len(bounds)

        # A match is dropped when some later match in the list ends beyond it.
        # Scanning of later matches stops at the first one that starts after
        # the current match's end index.
        superseded = set()
        for idx in range(total - 1):
            stop_here = bounds[idx][1]
            for later in range(idx + 1, total):
                later_begin, later_stop = bounds[later]
                if later_begin > stop_here:
                    break
                if later_stop > stop_here:
                    superseded.add(idx)

        kept_entities = doc.ents
        added_entities = []
        claimed_tokens = set()
        for idx, (match_id, begin, stop) in enumerate(found):
            if idx in superseded:
                continue
            # Skip a match whose first or last token already belongs to an
            # entity created in this call.
            if begin in claimed_tokens or stop - 1 in claimed_tokens:
                continue
            added_entities.append(Span(doc, begin, stop, label=match_id))
            # Existing entities overlapping the new span are discarded.
            kept_entities = [
                ent for ent in kept_entities
                if ent.start >= stop or ent.end <= begin
            ]
            claimed_tokens.update(range(begin, stop))

        doc.ents = tuple(kept_entities) + tuple(added_entities)
        return doc

        
class TiterMatcher(MacherBase):
    """Matcher component for titer expressions.

    ``name`` is the identifier other code uses to address this component in
    the pipeline (for example ``before='Titer_matcher'``).
    """

    name = "Titer_matcher"

    def __init__(self, nlp, patterns, label):
        # All set-up is delegated unchanged to MacherBase.
        super(TiterMatcher, self).__init__(nlp, patterns, label)

class YieldMatcher(MacherBase):
    """Matcher component for yield expressions; behaviour comes from MacherBase."""

    # Component identifier exposed as a class attribute.
    name = "Yield_matcher"

    def __init__(self, nlp, patterns, label):
        # All set-up is delegated unchanged to MacherBase.
        super(YieldMatcher, self).__init__(nlp, patterns, label)
        
class ProdRateMatcher(MacherBase):
    """Matcher component for production-rate expressions; behaviour comes from MacherBase."""

    # Component identifier exposed as a class attribute.
    name = "ProdRate_matcher"

    def __init__(self, nlp, patterns, label):
        # All set-up is delegated unchanged to MacherBase.
        super(ProdRateMatcher, self).__init__(nlp, patterns, label)
        

def nlp_change():
    """Build the rule-based pipeline that tags titers, yields and production rates.

    Loads ``en_core_web_sm``, inserts three token-pattern matcher components
    ahead of the ``ner`` component (labels ``TITER``, ``YIELDS`` and
    ``PRODRATE``, in that order) and finally removes ``ner``, so the returned
    pipeline only produces the rule-based entities.

    All regular expressions are written as raw strings.  Their values are
    identical to the previous non-raw literals (``\\d``, ``\\/``, ``\\.`` ... are
    not string escapes), but raw strings avoid the invalid-escape warnings
    emitted by current Python versions.

    NOTE(review): ``Matcher.add(label, None, *patterns)`` (inside the matcher
    classes) and ``nlp.add_pipe(component, before=...)`` are spaCy 2.x call
    signatures - confirm the installed spaCy version before upgrading.

    Returns:
        The modified ``nlp`` pipeline object.
    """
    nlp = spacy.load("en_core_web_sm")

    def rx(pattern):
        # Token whose text must match the regular expression `pattern`.
        return {"TEXT": {"REGEX": pattern}}

    def lit(text):
        # Token whose exact text is `text`.
        return {"ORTH": text}

    def opt(text):
        # Optional token whose exact text is `text`.
        return {"ORTH": text, "OP": "?"}

    def signed():
        # Optional NUM, "+" and "-" tokens that may precede the value
        # (presumably to absorb "x +- y" notation - TODO confirm).
        # A fresh list is built per pattern so no token spec is shared.
        return [{"POS": "NUM", "OP": "?"}, opt("+"), opt("-")]

    titer_patterns = [
        signed() + [rx(r"\d+$"), rx(r"g$"), rx(r"^\/[Ll]")],
        signed() + [rx(r"\d+$"), rx(r"g$"), rx(r"\/"), rx(r"^.?[Ll]")],
        signed() + [rx(r"\d+.?g$"), rx(r"\/"), rx(r"^[Ll]")],
        signed() + [rx(r"\d+"), rx(r"g·[Ll]\-1$")],
        signed() + [rx(r"(^[0-9]+(\.[0-9]{1,2})?$)"), rx(r"(M|mol)$")],
        signed() + [rx(r"(^[0-9]+(\.[0-9]{1,2})?$)"), rx(r"(M|mol)$"), rx(r"\/"), rx(r"^[Ll]")],
        signed() + [rx(r"^\d+(.+)?(M|mol)$")],
        [{"POS": "NUM"}, rx(r"\+"), rx(r"-"), rx(r"^\d+"), rx(r"g$"), rx(r"^\/?[Ll]")],
        [rx(r"^\d+m?g·?[Ll]\(?\-1$")],
        [rx(r"^\d+g·?[Ll]\-1$")],
    ]

    titer_matcher = TiterMatcher(nlp, titer_patterns, 'TITER')
    nlp.add_pipe(titer_matcher, before='ner')

    # DCW: dry cell weight
    yield_patterns = [
        signed() + [rx(r"\d+$"), rx(r"g$"), rx(r"\/"), rx(r"g$")],
        [{"POS": "NUM", "OP": "?"}, opt("+"), rx(r"\/?-"), rx(r"\d+$"), rx(r"g$"), rx(r"\/"), rx(r"g$")],
        signed() + [rx(r"\d+$"), rx(r"g$"), rx(r"g\-1$")],
        signed() + [rx(r"\d+$"), rx(r"[Ll]$"), rx(r"\/"), rx(r"g$")],
        # The hyphen in "g‐glucose" is a non-ASCII hyphen character, kept as-is.
        signed() + [rx(r"\d+"), rx(r"g$"), rx(r"\/?"), rx(r"g‐glucose$")],
        signed() + [rx(r"\d+"), rx(r"g$"), rx(r"\/?"), rx(r"gDC?W$")],
        signed() + [rx(r"\d+"), rx(r"g$"), rx(r"g\-1$"), rx(r"DC?W$")],
        signed() + [rx(r"\d+"), rx(r"g$"), rx(r".+"), rx(r"per"), rx(r"gram$")],
        signed() + [rx(r"\d+"), rx(r"mol$"), rx(r".+"), rx(r"per"), rx(r"mol$")],
        signed() + [rx(r"\d+"), rx(r"mol$"), rx(r"\/?"), rx(r"mol$")],
        # The second alternative uses a non-ASCII minus sign, kept as-is.
        signed() + [rx(r"\d+"), rx(r"mol$"), rx(r"(mol-1|mol−1)$")],
        [rx(r"\d+"), opt("%"), opt("("), rx(r"C$"), rx(r"mol$"), rx(r"(mol-1|mol−1)$")],
        [rx(r"\d+"), opt("%"), opt("("), opt("C"), opt("-"), rx(r"mol$"),
         opt("C"), opt("-"), rx(r"(mol-1|mol−1)$"), opt(")")],
        [rx(r"\d+"), lit("%"), lit("molar"), rx(r"^yield")],
        # This pattern used to be listed twice; one copy is sufficient.
        [rx(r"\d+"), lit("%"), lit("theoretical"), rx(r"^yield")],
        [rx(r"[0-9]+(\.[0-9]{1,2})?"), lit("%"), rx(r"mol$"), opt("/"), rx(r"mol$")],
        [rx(r"[0-9]+(\.[0-9]{1,2})?.g$"), rx(r"\/$"), rx(r"g$")],
        signed() + [rx(r"(^[0-9]+(\.[0-9]{1,2})?$)"), lit("%"), rx(r"mol$"), opt("/"), rx(r"mol$")],
        # Disabled: [rx(r"^yield"), lit("of"), rx(r"\d+"), lit("%")]
    ]

    yield_matcher = YieldMatcher(nlp, yield_patterns, 'YIELDS')
    nlp.add_pipe(yield_matcher, before='ner')

    prodrate_patterns = [
        signed() + [rx(r"\d+"), rx(r"(M|mol)$"), rx(r"\/?"), rx(r"^(min|hr?|d|days?)$")],
        signed() + [rx(r"\d+"), rx(r"g·[Ll].+·(min|hr?|d|days?)$")],
        signed() + [rx(r"\d+"), rx(r"g?"), rx(r"\/?"), rx(r"^[Ll]"), opt("/"), rx(r"^(min|hr?|d|days?)$")],
        signed() + [rx(r"\d+"), rx(r"g?"), lit("/"), rx(r"^([Ll]·)?(min|hr?|d|days?)$")],
        signed() + [rx(r"\d+"), rx(r"g?"), opt("/"), rx(r"^(min|hr?|d|days?)"), opt("/"), rx(r"^[Ll]")],
        signed() + [rx(r"\d+"), rx(r"g?"), rx(r"^[Ll]-1"), rx(r"^(min|hr?|d|days?)")],
    ]
    prodrate_matcher = ProdRateMatcher(nlp, prodrate_patterns, 'PRODRATE')
    nlp.add_pipe(prodrate_matcher, before='ner')

    # The statistical NER is only used as an insertion anchor above.
    nlp.remove_pipe('ner')
    return nlp

In [2]:
def prod_token(text):
    """Return one ``'**#**'``-joined record per entity found in *text*.

    The module-level ``nlp`` pipeline parses the text.  Each record holds the
    entity text, its label, and its start and end character offsets, with
    the module-level ``titel_length`` added to both offsets.
    """
    parsed = nlp(text)
    return [
        '**#**'.join((
            entity.text,
            entity.label_,
            str(int(entity.start_char) + titel_length),
            str(int(entity.end_char) + titel_length),
        ))
        for entity in parsed.ents
    ]
# Demo cell: tag one sentence with the rule-based pipeline.
# nlp_change() loads the model itself, so the separate spacy.load() whose
# result was immediately overwritten has been dropped.
nlp = nlp_change()
text = 'Then, over-expression of 2,3-DHBA synthase and 2,3-DHBA 1-monoxygenase achieved synthesis of pyrogallol in Escherichia coli, with a titer of 201.52 mg/L at 24 h.'
# prod_token adds this value to every character offset it reports.
titel_length =20
#pubmed_id = '20000'
prod_token(text)

['201.52 mg/L**#**TITER**#**161**#**172']

# Chem NER

In [3]:
from chemdataEx.doc.document import Document

def chemdataEx_chem(abstract_text):
    """Return the distinct chemical mention texts found in *abstract_text*.

    The text is wrapped in a ``Document`` and the ``text`` of every item in
    its ``cems`` collection is gathered.  Duplicates are removed and the
    result is sorted, so the output is deterministic (it used to come back
    in arbitrary set order).

    Args:
        abstract_text: Raw text to scan.

    Returns:
        A sorted list of unique mention strings.
    """
    doc = Document(abstract_text)
    return sorted({cem.text for cem in doc.cems})
# Demo cell: run the chemical-mention extractor on a sample abstract.
abstract_text = 'The QS system can dynamically balance the relationship between efficient synthesis of the target product and cell growth. Finally, we validated the usefulness of this strategy by dynamic control of menaquinone-7 (MK-7 ) synthesis in Bacillus subtilis 168.'
ChemsList = chemdataEx_chem(abstract_text)
# Bare expression so the notebook displays the extracted names.
ChemsList

['MK-7', 'menaquinone-7']

In [4]:
from spacy.matcher import PhraseMatcher
class ChemsMatcher(object):
    """Pipeline component that tags known chemical names as entities.

    Each term is tokenised with ``nlp.make_doc`` and registered in a
    ``PhraseMatcher`` under ``label``.  When several matches overlap, a match
    is discarded if a later match reaches further right, so the longer
    detection wins.
    """

    name = "Chems_matcher"

    def __init__(self, nlp, terms, label):
        """Register every string in ``terms`` as a phrase pattern."""
        phrase_docs = [nlp.make_doc(term) for term in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *phrase_docs)

    def __call__(self, doc):
        """Annotate ``doc`` with the matched spans and return it."""
        found = self.matcher(doc)
        bounds = [(begin, stop) for _, begin, stop in found]
        total = len(bounds)

        # A match is dropped when some later match in the list ends beyond it.
        # Scanning of later matches stops at the first one that starts after
        # the current match's end index.
        superseded = set()
        for idx in range(total - 1):
            stop_here = bounds[idx][1]
            for later in range(idx + 1, total):
                later_begin, later_stop = bounds[later]
                if later_begin > stop_here:
                    break
                if later_stop > stop_here:
                    superseded.add(idx)

        kept_entities = doc.ents
        added_entities = []
        claimed_tokens = set()
        for idx, (match_id, begin, stop) in enumerate(found):
            if idx in superseded:
                continue
            # Skip a match whose first or last token already belongs to an
            # entity created in this call.
            if begin in claimed_tokens or stop - 1 in claimed_tokens:
                continue
            added_entities.append(Span(doc, begin, stop, label=match_id))
            # Existing entities overlapping the new span are discarded.
            kept_entities = [
                ent for ent in kept_entities
                if ent.start >= stop or ent.end <= begin
            ]
            claimed_tokens.update(range(begin, stop))

        doc.ents = tuple(kept_entities) + tuple(added_entities)
        return doc
# Demo cell: add a chemical-name matcher in front of the rule-based matchers.
# nlp_change() loads the model itself, so the separate spacy.load() whose
# result was immediately overwritten has been dropped.
nlp = nlp_change()
chems_list = ['2,3-DHBA']
print("chems_list:",chems_list)
chems_matcher = ChemsMatcher(nlp,chems_list,"CHEM")
# Chemical names are tagged before the titer/yield/rate matchers run.
nlp.add_pipe(chems_matcher,before='Titer_matcher')
text = 'Then, over-expression of 2,3-DHBA synthase and 2,3-DHBA 1-monoxygenase achieved synthesis of pyrogallol in Escherichia coli, with a titer of 201.52 mg/L at 24 h.'
doc = nlp(text)
for ent in doc.ents:
    # text, character span, label, token span
    print(ent.text, ent.start_char, ent.end_char, ent.label_,ent.start,ent.end)

chems_list: ['2,3-DHBA']
2,3-DHBA 25 33 CHEM 6 7
2,3-DHBA 47 55 CHEM 9 10
201.52 mg/L 141 152 TITER 23 27


# Preprocessing the Data

In [6]:
import pickle,json,os
import pandas as pd
def is_decimal(num):
    """Tell whether *num* is digits optionally followed by ``.`` and one or two digits."""
    return re.match(r"^[0-9]+(\.[0-9]{1,2})?$", num) is not None

def add_single_unit(text, nlp=None):
    """Copy the unit of a TITER/YIELDS entity onto a bare number listed before it.

    The text is parsed with the rule-based pipeline.  For each ``TITER`` or
    ``YIELDS`` entity, the tokens following its first token (normally the
    rest of the entity) are taken as its unit.  A ``NUM`` token accepted by
    ``is_decimal`` that is followed by ``","`` or ``"and"`` and then by the
    first token of such an entity gets that unit inserted right after it.
    Every unit is used at most once per call, which is why callers may
    invoke this function repeatedly.

    Args:
        text: Sentence to rewrite.
        nlp: Optional pipeline to reuse.  When omitted a new one is built
            with ``nlp_change()`` (this reloads the model on every call).

    Returns:
        The rebuilt sentence.  Span texts are re-joined with a single space
        on each side of every inserted unit.
    """
    if nlp is None:
        nlp = nlp_change()
    doc = nlp(text)

    # Entity start token index -> entity end token index (exclusive).
    ent_end_by_start = {ent.start: ent.end for ent in doc.ents}

    # units_dict[first_token] = [entity_end, following_token_1, ...]
    units_dict = defaultdict(list)
    unite_key = None
    for tok in doc:
        # Token 0 has no left neighbour to inspect, so it is skipped.
        if tok.i < 1:
            continue
        if tok.ent_type_ not in ("TITER", "YIELDS"):
            continue
        if not tok.nbor(-1).ent_type_:
            # Previous token is outside any entity: this token opens one.
            unite_key = tok.i
            units_dict[unite_key].append(ent_end_by_start[tok.i])
        elif unite_key is not None:
            # Continuation token; ignored when no opening token was seen yet
            # (previously this case was swallowed by a bare ``except``).
            units_dict[unite_key].append(tok.i)

    # Token index right after the entity's first token -> span holding the unit.
    unit_text_dict = {}
    for unit_list in units_dict.values():
        if len(unit_list) > 1:
            unit_text_dict[unit_list[1]] = doc[unit_list[1]:unit_list[0]]

    new_doc = ""
    init_num = 0
    for tok in doc:
        if tok.pos_ != "NUM" or not is_decimal(tok.text):
            continue
        try:
            if tok.nbor(1).text not in (",", "and"):
                continue
            nbor_2 = tok.nbor(2)
        except IndexError:
            # Fewer than two tokens remain after this number.
            continue
        # The token two positions ahead must open an entity; units are keyed
        # by the index of the token right after that opening token.
        unit_key = nbor_2.i + 1
        if unit_key in unit_text_dict:
            unit = unit_text_dict.pop(unit_key)
            new_doc = new_doc + doc[init_num:tok.i + 1].text + " " + unit.text + " "
            init_num = tok.i + 1
    new_doc = new_doc + doc[init_num:].text
    return new_doc
def add_multipe_units(text):
    """Propagate units onto listed numbers in "... respectively" sentences.

    Text that does not contain the word ``respectively`` is returned
    unchanged.  Otherwise ``", and "`` between two digits is rewritten to
    ``" and "`` and ``add_single_unit`` is applied up to four times, stopping
    as soon as it returns an empty string or no longer changes the text.

    The flag is now passed as ``flags=``.  It used to be given as the fourth
    positional argument of ``re.sub``, which is ``count``, so replacements
    were silently capped at 16 and no flag was applied.

    Args:
        text: Sentence to process.

    Returns:
        The rewritten sentence, or *text* itself when nothing applies.
    """
    if "respectively" not in text:
        return text
    text = re.sub(r'(?<=\d), and (?=\d)', ' and ', text, flags=re.S)
    for _ in range(4):
        newtext = add_single_unit(text)
        if not newtext or newtext == text:
            break
        text = newtext
    return text



def newtext_token(text):
    """Debug helper: print the token table and the entity table for *text*.

    The text first goes through ``add_multipe_units`` and is then parsed with
    a pipeline built by ``nlp_change()``.  Nothing is returned.
    """
    expanded = add_multipe_units(text)
    pipeline = nlp_change()
    parsed = pipeline(expanded)
    for token in parsed:
        print(token.text, "-->", token.pos_, "-->", token.ent_type_, "-->", token.i)
    for entity in parsed.ents:
        print(entity.text, entity.start_char, entity.end_char,
              entity.label_, entity.start, entity.end)
def get_text_between(chem_word_idx,prod_word_idx,tokens):
    """Return the tokens lying between a chemical mention and a product mention.

    Both index arguments are ``[start, end]`` token positions.  Whichever
    mention has the smaller end index is treated as the left one; the slice
    runs from its end to the other mention's start.
    """
    chem_start, chem_end = chem_word_idx[0], chem_word_idx[1]
    prod_start, prod_end = prod_word_idx[0], prod_word_idx[1]
    if chem_end < prod_end:
        # Chemical mention comes first.
        return tokens[chem_end:prod_start]
    # Product mention comes first (or both end at the same position).
    return tokens[prod_end:chem_start]
def adv_delete_process(text):
    """Delete a fixed list of filler words and phrases from *text*.

    Removal is plain substring replacement, applied in the listed order;
    surrounding whitespace is left untouched.
    """
    fillers = (
        'significantly',
        'levels',
        'obvious',
        'little',
        'substantially',
        'less than',
    )
    for filler in fillers:
        text = text.replace(filler, '')
    return text
def abstract_process(text):
    """Normalise unit notation and sentence punctuation in *text*.

    Steps, in order:

    1. ``adv_delete_process`` removes filler words.
    2. A fixed sequence of literal substring rewrites is applied (for example
       ``' g L-1'`` becomes ``' g/L '``).  The order matters because later
       rewrites see the output of earlier ones.
    3. A fixed sequence of regular-expression rewrites is applied.

    The regex step now passes ``flags=re.S``.  The flag used to be given as
    the fourth positional argument of ``re.sub``, which is ``count``, so each
    rewrite was silently capped at 16 replacements and no flag was applied.

    Args:
        text: Raw sentence.

    Returns:
        The normalised sentence.
    """
    text = adv_delete_process(text)

    literal_rewrites = (
        (' g L-1', ' g/L '),
        (' L-1', '/L'),
        ('L(-1)', '/L'),
        (' l(-)(1)', '/L'),
        (' l(-1)', '/L'),
        ('L(org) (-1)', '/L'),
        ('/L. ', '/L . '),
        (' d. ', ' d . '),
        (' . g', '/g'),
        ('g/liter', 'g/L'),
        ('g liter-1', 'g/L'),
        ('grams per liter', 'g/L'),
        ('by-product', 'byproduct'),
        ('g (dry weight)', 'g'),
        ('each (', ''),
        ('(L h)', 'L/h'),
        ('/L h', '/L/h'),
        ('  h-1', '/h'),
        (' h(-1)', '/h'),
        ('/100 g', '/g'),
        (' day-1', '/d'),
        (' day', '/d'),
        # "dry cell weight" is first shortened to "DCW", then every "DCW"
        # (including ones already present in the text) is removed.
        ('dry cell weight', 'DCW'),
        ("DCW", ''),
        ('()', ''),
    )
    for old, new in literal_rewrites:
        text = text.replace(old, new)

    regex_rewrites = (
        (r'(?<=g)L-1(?=d)', '/L '),
        (r'(?<=\d)g(?=/)', ' g'),
        # NOTE(review): ' day' has already been rewritten to '/d' above, so
        # this rule looks unlikely to match - confirm whether it is needed.
        (r'(?<=/.)g day', 'g/d'),
        (r'(?<=\d)mol(?=/)', ' mol'),
        (r'(?<=[a-z])A\. (?=[A-Z])', 'A . '),
        (r' h\. (?=[A-Z])', ' h . '),
        (r'(?<=[A-Z])\. (?=[A-Z])', ' . '),
        (r'% molar yield', '% mol/mol'),
    )
    for pattern, replacement in regex_rewrites:
        text = re.sub(pattern, replacement, text, flags=re.S)
    return text
def get_pubmedid_from_path(path_text):
    """Derive the PubMed id from a curator file path.

    The directory part and the file extension are removed, then the last
    eight characters of the remaining name are cut off (the length of a
    suffix such as ``_curator``).
    """
    base_name = os.path.basename(path_text)
    stem = os.path.splitext(base_name)[0]
    return stem[:-8]

# Build the chemical/product pair corpus from the curator JSON files.
#
# For every curator file whose abstract does not contain one of the excluded
# lemmas, each annotated sentence is normalised and tokenised, and one row is
# produced per annotated (chemical, product) pair.
#
# Changes compared with the earlier version of this cell:
#   * the curator file is opened with a context manager (no leaked handle);
#   * the rule-based pipeline is built once instead of once per sentence;
#   * rows are collected in a list and the DataFrame is built once, instead
#     of growing it with pd.concat inside the loop;
#   * the reset_index call whose result was discarded has been removed (the
#     frame is built with a fresh 0..n-1 index).
path_df = "../ProcessedData/all_synbio20200716.pkl"
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open(path_df, 'rb') as data:
    df = pickle.load(data)
pmid_list = list(df['PMID'])
# Abstracts containing any of these lemmas are skipped entirely.
abs_title_error_lemma = ['body', 'diet', 'KD', 'Km','mean','degrade','degradation','infect','infected','ingest', 'infuse', 'Ki', 'LC50', 'serum', 'Kd', 'K(M', 'K(m','K(i', 'rabbit', 'dose', 'perfuse', 'reperfusion', 'IC50', 'inject', 'dosage']
abs_title_error_lemma_set = set(abs_title_error_lemma)

# Plain model, used only to lemmatise the abstract for the exclusion check.
sp = spacy.load("en_core_web_sm")
# Rule-based pipeline used to tokenise every sentence.
nlp = nlp_change()

# Files in which an annotated span text differs from the re-tokenised text.
error_list = set()
head_column = ["chem_word","prod_word","chem_word_idx",'prod_word_idx','sentence','tokens',\
               'between_tokens','chems','prods','chem_pos_word',"label",'pubmedid']
rows = []
for i, filename in enumerate(glob("../prodcurator/*curator.json")):
    print(i, filename)
    pubmed_id = get_pubmedid_from_path(filename)
    pmid_index = pmid_list.index(pubmed_id)
    # NOTE(review): pmid_index is a position in pmid_list while df.loc looks
    # rows up by index label; the two agree only if df has a default
    # 0..n-1 index - confirm.
    abs_title = df.loc[pmid_index].Title_Abstract
    abs_title_lemma_set = {word.lemma_ for word in sp(abs_title)}
    if abs_title_lemma_set & abs_title_error_lemma_set:
        continue
    with open(filename) as curator_file:
        annotator_text = json.load(curator_file)
    for sentence_curator in annotator_text.values():
        # 'sentece_text' (sic) is the key used in the curator files.
        sentence_text = abstract_process(sentence_curator['sentece_text'])
        new_text = add_multipe_units(sentence_text)
        doc = nlp(new_text)
        sentence_tokens = [tok.text for tok in doc]
        sentence_chems_set = set()
        sentence_prods_set = set()
        chem_prod_list = []
        chem_pos_word = {}

        for sub_key, text_info in sentence_curator.items():
            if sub_key == 'sentece_text':
                continue
            # Pairs under "PRO" keep the sign of their own label; pairs under
            # any other key have it flipped.
            curator_label_dig = 1 if sub_key == "PRO" else -1
            for positive_unit in text_info:
                chem = positive_unit['chem']
                chempos = positive_unit['chempos']
                prod_lable_dig = -1 if positive_unit['prod_lable'] == 0 else 1
                curator_label = curator_label_dig * prod_lable_dig

                chempos_text = "*#*".join(chempos)
                sentence_chems_set.add(chempos_text)
                new_chempos = [int(chemposdig) for chemposdig in chempos]
                old_chem = doc[new_chempos[0]:new_chempos[1]].text
                chem_pos_word[chempos_text] = old_chem
                if old_chem != chem:
                    error_list.add(filename)

                prod = positive_unit['prod']
                prodpos = positive_unit['prodpos']
                sentence_prods_set.add("*#*".join(prodpos))
                new_prodpos = [int(prodposdig) for prodposdig in prodpos]
                old_prod = doc[new_prodpos[0]:new_prodpos[1]].text
                if old_prod != prod:
                    error_list.add(filename)

                chem_prod_list.append({
                    'chem': chem,
                    'chempos': new_chempos,
                    'prod': prod,
                    'prodpos': new_prodpos,
                    'between_tokens': get_text_between(new_chempos, new_prodpos, sentence_tokens),
                    'Label': curator_label,
                })

        sentence_chems = list(sentence_chems_set)
        sentence_prods = list(sentence_prods_set)
        for chem_prod in chem_prod_list:
            rows.append({
                "chem_word": chem_prod['chem'],
                "chem_word_idx": chem_prod["chempos"],
                "prod_word": chem_prod['prod'],
                'prod_word_idx': chem_prod['prodpos'],
                'sentence': sentence_text,
                'tokens': sentence_tokens,
                'between_tokens': chem_prod['between_tokens'],
                'chems': sentence_chems,
                'prods': sentence_prods,
                'chem_pos_word': chem_pos_word,
                "label": chem_prod["Label"],
                "pubmedid": pubmed_id,
            })

        print(sentence_text)

# Column order is fixed by head_column.
result = pd.DataFrame(rows, columns=head_column)
pd.set_option("display.max_colwidth", 0)

#result.to_pickle("../ProcessedData/procon_corpus.pkl")
result.head()

0 ../prodcurator/3955571_curator.json
As  as 0.2mM added alpha-D-glucose (0.4mM alpha-D-xylose)  increased the rate of enzymically catalyzed release of fluoride from 25mM beta-D-glucosyl fluoride at 0 degrees.
1 ../prodcurator/12932830_curator.json
The oligonucleosomes assembled from 12-repeat DNA and saturating amounts of core histone octamer plus histone H5 are compacted, in the presence of 1 mM free magnesium ions, to the level of the 30-nm fiber.
2 ../prodcurator/16507371_curator.json
3 ../prodcurator/27372134_curator.json
Dissolved oxygen (DO) of UASS and AF reactors kept around 1.39+-0.27 and 0.99+-0.38mg/L, respectively.
4 ../prodcurator/28311_curator.json
It is, however, markedly labilized by an increase in the ionic strength of the medium brought about by the addition of 0.2 M potassium chloride or in pH above 9.
5 ../prodcurator/2877616_curator.json
Ornithine and arginine (5 to 20 mM), but not glutamic acid or proline, exerted a concentration-dependent stimulatory effect on t

The polylysine-synthesizing ability by staphylococcal ribosomes increased up to about two times as much as that by E. coli Q13 ribosomes, when S-100 from E. coli Q13 was mixed with staphylococcal ribosomes which had been washed with a high salt HEPES buffer containing 10 mM HEPES, 1 mM EGTA, 16 mM magnesium acetate, 1.0 M ammonium chloride and 0.1 mM dithiothreitol (pH7.6).
35 ../prodcurator/18584754_curator.json
By limiting phosphorous concentration, HBsAg expression level for the YNN27/p2micro-S11 strain with inducible PHO5 promoter reached 0.2-0.3 mg/L .
By controlling nutrient addition rate and dissolved oxygen concentration, HBsAg concentrations of 3-10 mg/L were achieved in 60-70 h fermentation using the YNN27/pDCB-S2 strain with the constitutive GPD promoter.
36 ../prodcurator/1735138_curator.json
37 ../prodcurator/16299003_curator.json
Exogenous application of 10 microM ABA leads to swelling, root hair formation and initiation of lateral root primodia in the tips of young, semi

Under optimal conditions (100 mM glycerol, 100-200 mM benzoic anhydride, dioxane, 25-30 degrees C), the enzymatic synthesis of (R)-MBG was successfully operated in a packed-bed reactor for about 1 week, with an average productivity of 0.79 g MBG/day/g biocatalyst in the case of continuous operation and 0.94 g MBG/day/g biocatalyst in the case of semicontinuous operation.
67 ../prodcurator/24068697_curator.json
68 ../prodcurator/22133602_curator.json
The finally genetic strain YJM13 harboring the MVA pathway and ispS(Pa) gene could accumulate isoprene up to 2.48 mg/l and 532 mg/l under the flask and fed-batch fermentation conditions, respectively, which is about three times and five times to the control strain.
69 ../prodcurator/24376888_curator.json
70 ../prodcurator/28336295_curator.json
Biomass productivity was highest for AC768 strain with 1.8 g/L/d, while hydrocarbon production ranged from none to up to 42% per gram biomass dry weight, with Showa showing the highest hydrocarbon con

Relatively better production of L-DOPA (> 0.60 mg/ml) was obtained when 2.0% (w/v) glucose was used as a carbon source in the mycelium production medium and the tyrosinase activity increased constitutively (1.08 mg/ml), which resulted in a greater production of L-DOPA .
The maximum conversion of L-tyrosine to L-DOPA (0.428 mg/ml) was achieved 60 min after the biochemical reaction.
Best results of L-DOPA biosynthesis were observed when the concentration of illite was 3.5 x 10(-6) M (1.686 mg/ml L-DOPA produced with 1.525 mg/ml consumption of L-tyrosine).
The comparison of kinetic parameters showed the ability of mutant to yield L-DOPA (i.e., Yp/x 7.360 +/- 0.04 mg/mg).
102 ../prodcurator/15135955_curator.json
Buffers and water containing 10 mM NaCl became bactericidal against all three bacteria upon PEF treatment, and the bactericidal effect could be neutralized by thiosulfate, suggesting that chlorine and/or hypochlorite had been formed.
In the Ames mutagenicity test using His- S .
103

A two-liquid phase fed-batch fermentation with glucose as the sole carbon and energy source resulted in the formation of 700 mg /L (S)-limonene.
The use of glycerol as a carbon source resulted in a prolonged growth and production phase (specific activities of >=50 mU g(cdw) (-1) ) leading to a final (S)-limonene concentration of 2,700 mg /L .
144 ../prodcurator/30125151_curator.json
145 ../prodcurator/17222854_curator.json
The method consisted of a two-step gradient (0% B for 2 min and from 0 to 50% B in 10 min) being mobile phase A a 2 0mM borate buffer (pH 9) and mobile phase B a 20 mM borate buffer (pH 9) containing 1M sodium chloride.
146 ../prodcurator/20045990_curator.json
147 ../prodcurator/2369580_curator.json
148 ../prodcurator/8407961_curator.json
149 ../prodcurator/18811000_curator.json
Application of 100 microM aspterric acid (AA), a pollen growth inhibitor, with different concentrations of indole-3-acetic acid (IAA) results in the recovery of normal pollen development of A

Insulin-like growth factor (IGF)-I (10(-9)M-10(-7)M) was found to be mitogenic for cultured cells from all three zones as shown by increased incorporation of [3H]thymidine into DNA.
183 ../prodcurator/9325277_curator.json
TrwD is probably a peripheral outer membrane protein because it could be solubilized by increasing salt concentration to 0.5 M NaCl in the lysis buffer.
184 ../prodcurator/9439593_curator.json
185 ../prodcurator/20883441_curator.json
Using an A. thaliana microarray chip, we determined changes in the expression of approximately 2 800 genes between A. stelleri plants treated with 0.2 M mannitol versus mock-treated plants.
japonica seedlings treated with 0.2 M mannitol, 0.2 M sorbitol, and 0.2 M NaCl.
186 ../prodcurator/9398079_curator.json
Herbicide 14C-terbutryn binding measurements, also with thylakoids, show that the QB niche of the mutant is  modified, at least 7-8 fold increased terbutryn dissociation constant is shown (220 nM in the mutant versus 29 nM in the wild

Unnamed: 0,chem_word,prod_word,chem_word_idx,prod_word_idx,sentence,tokens,between_tokens,chems,prods,chem_pos_word,label,pubmedid
0,alpha-D-glucose,0.2mM,"[5, 10]","[3, 4]",As as 0.2mM added alpha-D-glucose (0.4mM alpha-D-xylose) increased the rate of enzymically catalyzed release of fluoride from 25mM beta-D-glucosyl fluoride at 0 degrees.,"[As, , as, 0.2mM, added, alpha, -, D, -, glucose, (, 0.4mM, alpha, -, D, -, xylose, ), , increased, the, rate, of, enzymically, catalyzed, release, of, fluoride, from, 25mM, beta, -, D, -, glucosyl, fluoride, at, 0, degrees, .]",[added],"[27*#*28, 5*#*10, 12*#*17, 30*#*36]","[29*#*30, 3*#*4, 11*#*12]","{'5*#*10': 'alpha-D-glucose', '12*#*17': 'alpha-D-xylose', '27*#*28': 'fluoride', '30*#*36': 'beta-D-glucosyl fluoride'}",-1,3955571
1,alpha-D-glucose,25mM,"[5, 10]","[29, 30]",As as 0.2mM added alpha-D-glucose (0.4mM alpha-D-xylose) increased the rate of enzymically catalyzed release of fluoride from 25mM beta-D-glucosyl fluoride at 0 degrees.,"[As, , as, 0.2mM, added, alpha, -, D, -, glucose, (, 0.4mM, alpha, -, D, -, xylose, ), , increased, the, rate, of, enzymically, catalyzed, release, of, fluoride, from, 25mM, beta, -, D, -, glucosyl, fluoride, at, 0, degrees, .]","[(, 0.4mM, alpha, -, D, -, xylose, ), , increased, the, rate, of, enzymically, catalyzed, release, of, fluoride, from]","[27*#*28, 5*#*10, 12*#*17, 30*#*36]","[29*#*30, 3*#*4, 11*#*12]","{'5*#*10': 'alpha-D-glucose', '12*#*17': 'alpha-D-xylose', '27*#*28': 'fluoride', '30*#*36': 'beta-D-glucosyl fluoride'}",-1,3955571
2,alpha-D-xylose,0.2mM,"[12, 17]","[3, 4]",As as 0.2mM added alpha-D-glucose (0.4mM alpha-D-xylose) increased the rate of enzymically catalyzed release of fluoride from 25mM beta-D-glucosyl fluoride at 0 degrees.,"[As, , as, 0.2mM, added, alpha, -, D, -, glucose, (, 0.4mM, alpha, -, D, -, xylose, ), , increased, the, rate, of, enzymically, catalyzed, release, of, fluoride, from, 25mM, beta, -, D, -, glucosyl, fluoride, at, 0, degrees, .]","[added, alpha, -, D, -, glucose, (, 0.4mM]","[27*#*28, 5*#*10, 12*#*17, 30*#*36]","[29*#*30, 3*#*4, 11*#*12]","{'5*#*10': 'alpha-D-glucose', '12*#*17': 'alpha-D-xylose', '27*#*28': 'fluoride', '30*#*36': 'beta-D-glucosyl fluoride'}",-1,3955571
3,alpha-D-xylose,25mM,"[12, 17]","[29, 30]",As as 0.2mM added alpha-D-glucose (0.4mM alpha-D-xylose) increased the rate of enzymically catalyzed release of fluoride from 25mM beta-D-glucosyl fluoride at 0 degrees.,"[As, , as, 0.2mM, added, alpha, -, D, -, glucose, (, 0.4mM, alpha, -, D, -, xylose, ), , increased, the, rate, of, enzymically, catalyzed, release, of, fluoride, from, 25mM, beta, -, D, -, glucosyl, fluoride, at, 0, degrees, .]","[), , increased, the, rate, of, enzymically, catalyzed, release, of, fluoride, from]","[27*#*28, 5*#*10, 12*#*17, 30*#*36]","[29*#*30, 3*#*4, 11*#*12]","{'5*#*10': 'alpha-D-glucose', '12*#*17': 'alpha-D-xylose', '27*#*28': 'fluoride', '30*#*36': 'beta-D-glucosyl fluoride'}",-1,3955571
4,fluoride,0.2mM,"[27, 28]","[3, 4]",As as 0.2mM added alpha-D-glucose (0.4mM alpha-D-xylose) increased the rate of enzymically catalyzed release of fluoride from 25mM beta-D-glucosyl fluoride at 0 degrees.,"[As, , as, 0.2mM, added, alpha, -, D, -, glucose, (, 0.4mM, alpha, -, D, -, xylose, ), , increased, the, rate, of, enzymically, catalyzed, release, of, fluoride, from, 25mM, beta, -, D, -, glucosyl, fluoride, at, 0, degrees, .]","[added, alpha, -, D, -, glucose, (, 0.4mM, alpha, -, D, -, xylose, ), , increased, the, rate, of, enzymically, catalyzed, release, of]","[27*#*28, 5*#*10, 12*#*17, 30*#*36]","[29*#*30, 3*#*4, 11*#*12]","{'5*#*10': 'alpha-D-glucose', '12*#*17': 'alpha-D-xylose', '27*#*28': 'fluoride', '30*#*36': 'beta-D-glucosyl fluoride'}",-1,3955571
