In [1]:
# The evaluation code is modified from https://github.com/jiangfeng1124/ChemRxnExtractor/blob/main/chemrxnextractor/train/prod_extraction.py
import ast
import re

import pandas as pd
from seqeval.metrics import f1_score, precision_score, recall_score

def generate_bio_label(text):
    """Convert a tag-annotated sentence into one BIO label per whitespace token.

    Entities are marked inline as ``<Tag*entity text*Tag>``. The text is
    tokenised with ``str.split()``; the token in which a tag opens receives
    ``B-<Tag>``, every further token up to the one holding the closing ``>``
    receives ``I-<Tag>``, and all remaining tokens stay ``'O'``.

    Args:
        text: Sentence containing zero or more inline tags.

    Returns:
        A list of label strings with ``len(text.split())`` entries.
    """
    # Initialize the label list as 'O'
    labels = ['O'] * len(text.split())

    # For each tag type
    for tag_type in ['Reactants', 'Yield', 'Prod', 'Solvent', 'Catalyst_Reagents', 'Temperature', 'Reaction', 'Time']:
        # Use regular expression to find all entities
        for match in re.finditer(r'<{}.*?>'.format(tag_type), text):
            start, end = match.span()

            # Number of tokens that begin before the tag opens.
            start_index = len(text[:start].split())
            # When the tag is glued to preceding characters (e.g. "(<Prod*x*Prod>)"),
            # the last of those tokens is the one that contains the tag, so step
            # back onto it instead of labelling the following token (or running
            # past the end of `labels` when the tag sits in the final token).
            if start > 0 and not text[start - 1].isspace():
                start_index -= 1
            # Tokens up to and including the one that holds the closing '>'.
            end_index = len(text[:end].split())

            # Assign 'B' tag to the first word, and 'I' tag to the rest
            labels[start_index] = 'B-{}'.format(tag_type)
            for i in range(start_index + 1, end_index):
                labels[i] = 'I-{}'.format(tag_type)

    return labels

# Example of generate_bio_label
# Two inputs: a short hand-written sentence and a longer tagged sentence.
sample_text = "The <Prod*product*Prod> was obtained after <Time*24 hours*Time>."
prediction = "Reaction of <Reactants*diphenylacetylene*Reactants> with complex 19A led to only cycloheptadienone 23A in 30 % yield ; with (phenylcyclopropy1)- carbene complex <Reactants*19B*Reactants> , cycloheptadienone <Prod*25*Prod> was produced in <Yield*53 %*Yield> yield."

# Label both sentences, then show the results in the same order as the inputs.
sample_labels, pred_bio_label = map(generate_bio_label, (sample_text, prediction))
print(sample_labels)
print(pred_bio_label)

['O', 'B-Prod', 'O', 'O', 'O', 'B-Time', 'I-Time']
['O', 'O', 'B-Reactants', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Reactants', 'O', 'O', 'B-Prod', 'O', 'O', 'O', 'B-Yield', 'I-Yield', 'O']


In [2]:
# Load Predictions
# `pred_file` is the CSV of model generations to evaluate. Later cells read its
# 'Generated Text', 'Paragraph' and 'BIO Label' columns, and also test whether
# the file name contains "t5" to decide if the T5-specific tag patch applies.
pred_file = "results/predictions/prediction_of_saved_models-Mistral-7B-Instruct-v0.2-train_6163_lr5e-06_bs2-checkpoint-3082.csv"
pred_df = pd.read_csv(pred_file)
# Bare expression so the notebook renders the frame as this cell's output.
# NOTE(review): the rendered frame has an 'Unnamed: 0' column, presumably the
# index saved with the CSV - confirm before passing index_col=0.
pred_df

Unnamed: 0,Generated Text,Actual Text,Paragraph,BIO Label
0,The additional ring which arises from an intra...,The additional ring which arises from an intra...,The additional ring which arises from an intra...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
1,The decrease in entropy associated with tether...,The decrease in entropy associated with tether...,The decrease in entropy associated with tether...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,There were no reported examples of successful ...,There were no reported examples of successful ...,There were no reported examples of successful ...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,The dienophile in the tether can cyclize on Ca...,The dienophile in the tether can cyclize on Ca...,The dienophile in the tether can cyclize on Ca...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Molecular models and MM2 calculations indicate...,Molecular models and MM2 calculations indicate...,Molecular models and MM2 calculations indicate...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
...,...,...,...,...
718,The '*O isotopic data at hand indicate that th...,The '*O isotopic data at hand indicate that th...,The '*O isotopic data at hand indicate that th...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
719,What can be ascertained from the peak position...,What can be ascertained from the peak position...,What can be ascertained from the peak position...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
720,"Thus , for the good resonance amides 5 and 6c ...","Thus , for the good resonance amides 5 and 6c ...","Thus , for the good resonance amides 5 and 6c ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
721,This is in accordance with expectations based ...,This is in accordance with expectations based ...,This is in accordance with expectations based ...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


### Without post-processing

In [3]:
# Collect tokens, predicted BIO labels and gold BIO labels for every row whose
# raw generation yields as many labels as the gold annotation. Rows where the
# counts differ are printed for inspection and left out of the three lists, so
# the scores computed from them cover only the aligned rows.
word_lists = []
pred_lists = []
label_lists = []

for i, row in pred_df.iterrows():
    word_list = row['Paragraph'].split(" ")

    # Predictions
    if isinstance(row['Generated Text'], str):
        # Patch T5 output (per the original note, an artefact of its vocabulary):
        # put "<" back in front of every "Prod*".
        if "t5" in pred_file:
            prediction = row['Generated Text'].replace("Prod*", "<Prod*")
        else:
            prediction = row['Generated Text']
        pred_list = generate_bio_label(prediction)
    else:
        # Non-string cell (e.g. a missing generation): predict 'O' everywhere.
        print("EMTPY")
        pred_list = ['O'] * len(word_list)

    # Labels
    # The column stores the Python repr of a list of strings. It is parsed with
    # ast.literal_eval instead of eval so that content read from the CSV can
    # only ever be interpreted as a literal, never executed as code.
    label_list = ast.literal_eval(row['BIO Label'])
    if len(label_list) == len(pred_list):
        word_lists.append(word_list)
        pred_lists.append(pred_list)
        label_lists.append(label_list)
    else:
        # Token count drifted between generation and gold labels; report the row.
        print("predict:\n", row['Generated Text'])
        print("intput:\n", row['Paragraph'])
        # print("label:", row['Actual Text'])
        print(row['BIO Label'])
        print()

print("The number of sentences don't need post-preprocessed:", f"{len(label_lists)} / {len(pred_df)}")

predict:
 However , the strong hydrophobic inter- action between the long alkyl chain of the substrate and the aliphatic tail of the catalyst in the metallomicelle core may greatly improve the stability of the catalytic transition state ( Figure 5 ) , resulting in high enantioselectivities in thus long - chain aliphatic β-ketoesters ( 86 − 91 % ee , reduction ofTable 2 , entry 9 − 12 ).
intput:
 However , the strong hydrophobic inter- action between the long alkyl chain of the substrate and the aliphatic tail of the catalyst in the metallomicelle core may greatly improve the stability of the catalytic transition state ( Figure 5 ) , resulting in high enantioselectivities in thus long - chain aliphatic β-ketoesters ( 86 − 91 % ee , reduction of Table 2 , entry 9 − 12 ).
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',

In [4]:
# Evaluate
# Score pred_lists against label_lists with the three seqeval metrics,
# printing one line per metric.
for score_label, score_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1_score: ", f1_score),
):
    print(score_label, score_fn(label_lists, pred_lists))

Precision:  0.7222222222222222
Recall:  0.7428571428571429
F1_score:  0.732394366197183


### With post-processing

In [5]:
def replace_with_context(predict_text, input_text, is_t5=None):
    """Copy the tags found in a generated sentence onto the original sentence.

    The generation may differ from the input in its untagged text (for example
    lost or added whitespace). Instead of trusting that text, every
    ``<Tag*entity*Tag>`` span found in ``predict_text`` is located by its
    entity text in ``input_text`` and the matching occurrence there is wrapped
    in the same tag. Occurrences are paired by order of appearance.

    Args:
        predict_text: Generated sentence, possibly containing inline tags.
        input_text: The original, untagged sentence.
        is_t5: Whether to apply the T5 patch (re-insert ``<`` before every
            ``Prod*``). When ``None`` (default) this is decided as before, by
            looking for ``"t5"`` in the module-level ``pred_file`` name; if
            that variable is not defined the patch is skipped.

    Returns:
        ``input_text`` with the prediction's tags inserted, or ``input_text``
        unchanged when the prediction contains no tag.
    """
    if is_t5 is None:
        is_t5 = "t5" in globals().get("pred_file", "")
    if is_t5:
        predict_text = predict_text.replace("Prod*", "<Prod*")

    # Every "<...*...*...>" span in the prediction; nothing to transfer if none.
    tagged_spans = re.findall(r'<.*?\*.*?\*.*?>', predict_text)
    if not tagged_spans:
        return input_text

    # Entity text -> full tagged span. A later span with the same entity text
    # overrides an earlier one.
    word2identifier = {}
    for span in tagged_spans:
        inner = re.search(r'\*([^*]+)\*', span)
        if inner is None:
            # Tag with empty content such as "<Prod**Prod>": nothing to locate.
            continue
        word2identifier[inner.group(1)] = span

    def replace_selected(text, pattern, positions, replacement):
        """Replace the matches of `pattern` whose 1-based ordinal is in `positions`."""
        seen = 0

        def replacer(match):
            nonlocal seen
            seen += 1
            return replacement if seen in positions else match.group(0)

        return re.sub(pattern, replacer, text)

    for word, identifier in word2identifier.items():
        escaped_word = re.escape(word)
        escaped_identifier = re.escape(identifier)

        # Prefer whole-word hits; fall back to plain substring hits when the
        # entity cannot be found on word boundaries in the input.
        pattern = fr'\b{escaped_word}\b|{escaped_identifier}'
        input_hits = re.findall(pattern, input_text)
        if not input_hits:
            pattern = fr'{escaped_word}|{escaped_identifier}'
            input_hits = re.findall(pattern, input_text)
        predict_hits = re.findall(pattern, predict_text)

        # Ordinals at which the prediction carries the tag and the input does not.
        positions = {
            ordinal
            for ordinal, (in_hit, pred_hit) in enumerate(zip(input_hits, predict_hits), start=1)
            if in_hit != pred_hit
        }
        if positions:
            # Replace with the SAME pattern that produced the ordinals, so that a
            # substring hit such as "25" inside "125" cannot shift the count and
            # cause the wrong span to be tagged.
            input_text = replace_selected(input_text, pattern, positions, identifier)
    return input_text

In [6]:
# Same collection loop as the raw evaluation above, except that each generation
# is first mapped back onto the original paragraph with replace_with_context,
# so the labelled text has the paragraph's own tokenisation. Rows whose label
# count still differs from the gold annotation are printed and skipped.
word_lists = []
pred_lists = []
label_lists = []

for i, row in pred_df.iterrows():
    word_list = row['Paragraph'].split(" ")

    # Predictions
    if isinstance(row['Generated Text'], str):
        prediction = replace_with_context(row['Generated Text'], row['Paragraph'])
        pred_list = generate_bio_label(prediction)
    else:
        # Non-string cell (e.g. a missing generation): predict 'O' everywhere.
        print("EMPTY")
        # Reset so the mismatch report below cannot show the previous row's text.
        prediction = ""
        pred_list = ['O'] * len(word_list)

    # Labels
    # The column stores the Python repr of a list of strings. It is parsed with
    # ast.literal_eval instead of eval so that content read from the CSV can
    # only ever be interpreted as a literal, never executed as code.
    label_list = ast.literal_eval(row['BIO Label'])
    if len(label_list) == len(pred_list):
        word_lists.append(word_list)
        pred_lists.append(pred_list)
        label_lists.append(label_list)
    else:
        print("predict:\n", row['Generated Text'])
        print("intput:\n", row['Paragraph'])
        print("post_processed:\n", prediction)
        # print("label:", row['Actual Text'])
        print("label_BIO:\n", row['BIO Label'])
        # Report the labels that were actually compared for this row.
        print("predict_BIO:\n", pred_list)
        print()

# NOTE: the wording is shared with the raw-evaluation cell; in this cell the
# count is the number of rows that align with the gold labels after post-processing.
print("The number of sentences don't need post-preprocessed:", f"{len(label_lists)} / {len(pred_df)}")

The number of sentences don't need post-preprocessed: 723 / 723


In [7]:
# Evaluate
# Score pred_lists against label_lists with the three seqeval metrics,
# printing one line per metric.
for score_label, score_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1_score: ", f1_score),
):
    print(score_label, score_fn(label_lists, pred_lists))

Precision:  0.7168141592920354
Recall:  0.7297297297297297
F1_score:  0.7232142857142856
