In [1]:
# https://github.com/jiangfeng1124/ChemRxnExtractor/blob/main/chemrxnextractor/train/role_labeling.py
import ast
import re

import pandas as pd
from seqeval.metrics import f1_score, precision_score, recall_score

def generate_bio_label(text, tag_types=('Reactants', 'Yield', 'Prod', 'Solvent', 'Catalyst_Reagents', 'Temperature', 'Reaction', 'Time')):
    """Convert inline-tagged text into one BIO label per whitespace token.

    Entities are expected in the form ``<Tag*entity words*Tag>``. The text is
    tokenised with ``str.split()``; the token that contains the opening ``<Tag``
    receives ``B-Tag`` and every further token up to the one containing the
    closing ``>`` receives ``I-Tag``. All other tokens are labelled ``'O'``.

    Args:
        text: Tagged sentence.
        tag_types: Tag names to look for. Tags are processed in this order, so
            when two matches cover the same token the later tag type wins.

    Returns:
        A list of BIO labels with ``len(text.split())`` elements.
    """
    # Start with every token outside of any entity.
    labels = ['O'] * len(text.split())

    for tag_type in tag_types:
        # Non-greedy match from the opening "<Tag" to the first following ">".
        for match in re.finditer(r'<{}.*?>'.format(re.escape(tag_type)), text):
            start, end = match.span()

            # Number of tokens that begin before the tag starts.
            start_index = len(text[:start].split())
            # If the tag is glued to preceding characters (e.g. "(<Prod*4*Prod>"),
            # the last counted token is the one that contains the tag itself,
            # so the entity starts on that token rather than on the next one.
            if start > 0 and not text[start - 1].isspace():
                start_index -= 1

            # Number of tokens that begin before the tag ends (exclusive bound).
            end_index = len(text[:end].split())

            # 'B' for the first token of the entity, 'I' for the remaining ones.
            labels[start_index] = 'B-{}'.format(tag_type)
            for i in range(start_index + 1, end_index):
                labels[i] = 'I-{}'.format(tag_type)

    return labels

# Two demo inputs for generate_bio_label: a short hand-written sentence and a
# longer tagged sentence.
sample_text = "The <Prod*product*Prod> was obtained after <Time*24 hours*Time>."
prediction = "Reaction of <Reactants*diphenylacetylene*Reactants> with complex 19A led to only cycloheptadienone 23A in 30 % yield ; with (phenylcyclopropy1)- carbene complex <Reactants*19B*Reactants> , cycloheptadienone <Prod*25*Prod> was produced in <Yield*53 %*Yield> yield."

# Label both inputs, then show the label sequences in the same order.
sample_labels, pred_bio_label = (
    generate_bio_label(tagged_text) for tagged_text in (sample_text, prediction)
)
print(sample_labels)
print(pred_bio_label)

['O', 'B-Prod', 'O', 'O', 'O', 'B-Time', 'I-Time']
['O', 'O', 'B-Reactants', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Reactants', 'O', 'O', 'B-Prod', 'O', 'O', 'O', 'B-Yield', 'I-Yield', 'O']


In [4]:
# Load Predictions
# The file name is also inspected by a later cell ("t5" in pred_file) to decide
# whether the generated text needs its "<" brackets restored.
pred_file = "results/predictions/prediction_of_saved_models-Mistral-7B-Instruct-v0.2-train_599_lr5e-06_bs2-mixed_checkpoint-1800-2100-2400-2700-3000.csv"
# Columns used downstream: 'Generated Text' (model output), 'Paragraph' (plain
# input text) and 'BIO Label' (gold labels stored as the string form of a list).
# 'Unnamed: 0' is presumably the index saved with the CSV — TODO confirm.
pred_df = pd.read_csv(pred_file)
pred_df

Unnamed: 0,Generated Text,Actual Text,Paragraph,BIO Label
0,Reaction of <Reactants*diphenylacetylene*React...,Reaction of <Reactants*diphenylacetylene*React...,Reaction of diphenylacetylene with complex 19A...,"['O', 'O', 'B-Reactants', 'O', 'O', 'B-Reactan..."
1,Reaction of <Reactants*diphenylacetylene*React...,Reaction of <Reactants*diphenylacetylene*React...,Reaction of diphenylacetylene with complex 19A...,"['O', 'O', 'B-Reactants', 'O', 'O', 'B-Reactan..."
2,Reaction of diphenylacetylene with complex 19A...,Reaction of <Reactants*diphenylacetylene*React...,Reaction of diphenylacetylene with complex 19A...,"['O', 'O', 'B-Reactants', 'O', 'O', 'O', 'O', ..."
3,"We were excited to ﬁnd that , with 2.0 equiv o...","We were excited to ﬁnd that , with 2.0 equiv o...","We were excited to ﬁnd that , with 2.0 equiv o...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,"We were excited to ﬁnd that , with 2.0 equiv o...","We were excited to ﬁnd that , with 2.0 equiv o...","We were excited to ﬁnd that , with 2.0 equiv o...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
...,...,...,...,...
106,The metal - mediated conversion of terminal al...,The metal - mediated conversion of terminal al...,The metal - mediated conversion of terminal al...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
107,The metal - mediated conversion of terminal al...,The metal - mediated conversion of terminal al...,The metal - mediated conversion of terminal al...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
108,"For example , <Reaction*enantioselective depro...","For example , <Reaction*enantioselective depro...","For example , enantioselective deproto- nation...","['O', 'O', 'O', 'B-Reaction', 'I-Reaction', 'I..."
109,"For example , <Reaction*enantioselective depro...","For example , <Reaction*enantioselective depro...","For example , enantioselective deproto- nation...","['O', 'O', 'O', 'B-Reaction', 'I-Reaction', 'I..."


### Without post-processing

In [5]:
# Collect aligned (words, predicted labels, gold labels) triples for every row
# whose prediction has the same number of labels as the gold annotation.
word_lists = []
pred_lists = []
label_lists = []

# Per the original note, T5 generations need the opening "<" of each tag
# restored because of its vocabulary. This only applies when the prediction
# file comes from a T5 model.
t5_tag_types = ['Prod', 'Reactants', 'Yield', 'Solvent', 'Catalyst_Reagents', 'Temperature', 'Reaction', 'Time']
is_t5_run = "t5" in pred_file

for i, row in pred_df.iterrows():
    word_list = row['Paragraph'].split(" ")

    # Predictions
    if isinstance(row['Generated Text'], str):
        # Always start from this row's own generation so that no value from a
        # previous row (or an earlier cell) can leak into the evaluation.
        prediction = row['Generated Text']
        if is_t5_run:
            # Restore the brackets on this row only; pred_df itself is left untouched.
            for tag_type in t5_tag_types:
                prediction = prediction.replace(f"{tag_type}*", f"<{tag_type}*")
        pred_list = generate_bio_label(prediction)
    else:
        # Missing generation (e.g. NaN in the CSV): predict 'O' for every word.
        pred_list = ['O'] * len(word_list)

    # Labels are stored as the string form of a Python list; literal_eval parses
    # them without executing arbitrary code from the CSV.
    label_list = ast.literal_eval(row['BIO Label'])
    if len(label_list) == len(pred_list):
        word_lists.append(word_list)
        pred_lists.append(pred_list)
        label_lists.append(label_list)
    else:
        # Length mismatch: the row is excluded from scoring and dumped for inspection.
        print(i)
        print(row['Paragraph'])
        print(row['Actual Text'])
        print(row['Generated Text'])
        print(row['BIO Label'])
        print()

print("The number of sentences don't need post-preprocessed:", f"{len(label_lists)} / {len(pred_df)}")

The number of sentences don't need post-preprocessed: 111 / 111


In [None]:
# Evaluate: report seqeval precision, recall and F1 for the sequences collected
# in label_lists / pred_lists, one metric per line.
for metric_label, metric_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1_score: ", f1_score),
):
    print(metric_label, metric_fn(label_lists, pred_lists))

Precision:  0.8194444444444444
Recall:  0.7955056179775281
F1_score:  0.8072976054732041


### With post-processing

In [None]:
def replace_with_context(predict_text, input_text):
    """Project the tags of a generated sentence onto the original input text.

    Every ``<Tag*entity*Tag>`` span found in ``predict_text`` is looked up in
    ``input_text``. Occurrences of the entity (or of the full tag) are listed in
    order for both texts; wherever the n-th occurrence differs and the input
    holds the bare entity, that occurrence in ``input_text`` is replaced by the
    tag. Text that exists only in the prediction is therefore discarded and the
    wording of ``input_text`` is preserved.

    Args:
        predict_text: Model output, possibly containing tags.
        input_text: Reference text whose wording must be kept.

    Returns:
        ``input_text`` with the transferable tags inserted. It is returned
        unchanged when ``predict_text`` contains no tag.
    """
    tag_pattern = r'<.*?\*.*?\*.*?>'
    entity_pattern = r'\*([^*]+)\*'

    # Nothing to transfer when the prediction carries no tag at all.
    if not re.search(tag_pattern, predict_text):
        return input_text

    def tag_selected_matches(text, pattern, positions, word, identifier):
        """Replace the matches of `pattern` whose 1-based rank is in `positions`.

        Only matches equal to the bare `word` are replaced, so a span that is
        already a tag is never wrapped a second time. Ranks are counted with the
        same pattern that produced `positions`, so both always refer to the
        same occurrence.
        """
        seen = 0

        def replacer(match):
            nonlocal seen
            seen += 1
            if seen in positions and match.group(0) == word:
                return identifier
            return match.group(0)

        return re.sub(pattern, replacer, text)

    # Map each tagged entity to its full tag; a repeated entity keeps its last tag.
    word2identifier = {}
    for tag in re.findall(tag_pattern, predict_text):
        entity = re.search(entity_pattern, tag)
        if entity is None:
            # Malformed tag without entity text (e.g. "<Prod**Prod>"): skip it.
            continue
        word2identifier[entity.group(1)] = tag

    for word, identifier in word2identifier.items():
        escaped_word = re.escape(word)
        escaped_identifier = re.escape(identifier)

        # Prefer whole-word matches; fall back to plain substring matching when
        # the entity cannot be found that way (e.g. it starts or ends with a
        # non-word character such as "-20" or "67 %").
        pattern = fr'\b{escaped_word}\b|{escaped_identifier}'
        input_matches = re.findall(pattern, input_text)
        if not input_matches:
            pattern = fr'{escaped_word}|{escaped_identifier}'
            input_matches = re.findall(pattern, input_text)
        predicted_matches = re.findall(pattern, predict_text)

        # 1-based ranks at which the input and the prediction disagree.
        positions = {
            rank + 1
            for rank, (in_input, in_prediction) in enumerate(zip(input_matches, predicted_matches))
            if in_input != in_prediction
        }
        if positions:
            input_text = tag_selected_matches(input_text, pattern, positions, word, identifier)

    return input_text

### Some Test Examples

Example1

In [None]:
# Example text and perform the replacement
predict_text = '''To produce 1 and <Reactants*1*Reactants> and <Reactants*1a*Reactants> of <Reactants*Me2- SiC12I7*Reactants>, <Temperature*-20*Temperature> C'''
input_text = '''To produce 1 and 1 and 1a of Me2- SiC12I7, -20 C'''

# Perform the replacement with context check
replaced_text = replace_with_context(predict_text, input_text)
print(replaced_text)

# Check the result against the expected text instead of overwriting it, so a
# regression in replace_with_context is caught here.
expected_text = 'To produce 1 and <Reactants*1*Reactants> and <Reactants*1a*Reactants> of <Reactants*Me2- SiC12I7*Reactants>, <Temperature*-20*Temperature> C'
assert replaced_text == expected_text, "Example1: unexpected post-processing result"

To produce 1 and <Reactants*1*Reactants> and <Reactants*1a*Reactants> of <Reactants*Me2- SiC12I7*Reactants>, <Temperature*-20*Temperature> C


Example2

In [None]:
# The input already carries the <Prod*...*Prod> tag; all other tags come from the prediction.
predict_text = '''Treatment of <Reactants*allyl iodoacetate*Reactants> <Reactants*3a*Reactants> ( 1.0 mmol ) in <Solvent*water*Solvent> ( 30 mL ) with <Reactants*triethylborane*Reactants> ( 1.0 M methanol solution , 0.1 mL , 0.1 mmol ) at <Temperature*25*Temperature> ( cid : 176 ) C for <Time*3 h*Time> provided <Prod*4a*Prod> in <Yield*67 %*Yield> yield.'''
input_text = '''Treatment of allyl iodoacetate 3a ( 1.0 mmol ) in water ( 30 mL ) with triethylborane ( 1.0 M methanol solution , 0.1 mL , 0.1 mmol ) at 25 ( cid : 176 ) C for 3 h provided <Prod*4a*Prod> in 67 % yield.'''

# Perform the replacement with context check
replaced_text = replace_with_context(predict_text, input_text)
print(replaced_text)

# Check the result against the expected text instead of overwriting it, so a
# regression in replace_with_context is caught here.
expected_text = 'Treatment of <Reactants*allyl iodoacetate*Reactants> <Reactants*3a*Reactants> ( 1.0 mmol ) in <Solvent*water*Solvent> ( 30 mL ) with <Reactants*triethylborane*Reactants> ( 1.0 M methanol solution , 0.1 mL , 0.1 mmol ) at <Temperature*25*Temperature> ( cid : 176 ) C for <Time*3 h*Time> provided <Prod*4a*Prod> in <Yield*67 %*Yield> yield.'
assert replaced_text == expected_text, "Example2: unexpected post-processing result"

Treatment of <Reactants*allyl iodoacetate*Reactants> <Reactants*3a*Reactants> ( 1.0 mmol ) in <Solvent*water*Solvent> ( 30 mL ) with <Reactants*triethylborane*Reactants> ( 1.0 M methanol solution , 0.1 mL , 0.1 mmol ) at <Temperature*25*Temperature> ( cid : 176 ) C for <Time*3 h*Time> provided <Prod*4a*Prod> in <Yield*67 %*Yield> yield.


Example3

In [None]:
# Example where the prediction and the input disagree on spelling: the
# prediction tags "n-butyllithium" while the input reads "n- butyllithium".
# The entity cannot be located in the input, so that tag is not transferred
# (see the printed result); every other tag is projected onto the input.
predict_text = '''The preparation ( Scheme 2 ) of the title compound <Prod*4*Prod> through Br / Li interchange of the known18 α-bromoalkene <Reactants*6*Reactants> in <Solvent*Et2O*Solvent> or <Solvent*t-BuOMe*Solvent> with <Catalyst_Reagents*n-butyllithium*Catalyst_Reagents> ( <Catalyst_Reagents*n-BuLi*Catalyst_Reagents> ) in <Solvent*hexane*Solvent> or <Solvent*cyclopentane*Solvent> was already described.19 However , this method was unproﬁtable in THF as the solvent20 since 4 coupled quickly with its coproduct 1- bromobutane ( n-BuBr ) to give 8 and LiBr.'''
input_text = '''The preparation ( Scheme 2 ) of the title compound <Prod*4*Prod> through Br / Li interchange of the known18 α-bromoalkene 6 in Et2O or t-BuOMe with n- butyllithium ( n-BuLi ) in hexane or cyclopentane was already described.19 However , this method was unproﬁtable in THF as the solvent20 since 4 coupled quickly with its coproduct 1- bromobutane ( n-BuBr ) to give 8 and LiBr.'''

# Perform the replacement with context check
replaced_text = replace_with_context(predict_text, input_text)
print(replaced_text)

The preparation ( Scheme 2 ) of the title compound <Prod*4*Prod> through Br / Li interchange of the known18 α-bromoalkene <Reactants*6*Reactants> in <Solvent*Et2O*Solvent> or <Solvent*t-BuOMe*Solvent> with n- butyllithium ( <Catalyst_Reagents*n-BuLi*Catalyst_Reagents> ) in <Solvent*hexane*Solvent> or <Solvent*cyclopentane*Solvent> was already described.19 However , this method was unproﬁtable in THF as the solvent20 since 4 coupled quickly with its coproduct 1- bromobutane ( n-BuBr ) to give 8 and LiBr.


In [None]:
# Same collection loop as the cell without post-processing, except that each
# generation is first projected onto the original paragraph with
# replace_with_context before it is converted to BIO labels.
# NOTE(review): unlike the cell without post-processing, no T5 bracket
# restoration is applied here — confirm whether T5 predictions need it.
word_lists = []
pred_lists = []
label_lists = []

for i, row in pred_df.iterrows():
    word_list = row['Paragraph'].split(" ")

    # Predictions
    if isinstance(row['Generated Text'], str):
        prediction = replace_with_context(row['Generated Text'], row['Paragraph'])
        pred_list = generate_bio_label(prediction)
    else:
        print("Empty")
        # Missing generation: the untagged paragraph is the post-processed text,
        # so the diagnostics below never show a value left over from another row.
        prediction = row['Paragraph']
        pred_list = ['O'] * len(word_list)

    # Labels are stored as the string form of a Python list; literal_eval parses
    # them without executing arbitrary code from the CSV.
    label_list = ast.literal_eval(row['BIO Label'])
    if len(label_list) == len(pred_list):
        word_lists.append(word_list)
        pred_lists.append(pred_list)
        label_lists.append(label_list)
    else:
        # Length mismatch: the row is excluded from scoring and dumped for inspection.
        print("predict:\n", row['Generated Text'])
        print("intput:\n", row['Paragraph'])
        print("post_processed:\n", prediction)
        print("label_BIO:\n", row['BIO Label'])
        print("predict_BIO:\n", pred_list)
        print()

print("The number of sentences don't need post-preprocessed:", f"{len(label_lists)} / {len(pred_df)}")

The number of sentences don't need post-preprocessed: 111 / 111


In [None]:
# Evaluate: report seqeval precision, recall and F1 for the post-processed
# predictions collected in label_lists / pred_lists, one metric per line.
for metric_label, metric_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1_score: ", f1_score),
):
    print(metric_label, metric_fn(label_lists, pred_lists))

Precision:  0.8055555555555556
Recall:  0.7820224719101123
F1_score:  0.7936145952109463


: 