In [2]:
import pandas as pd
import numpy as np
def _split_entities(joined_text):
    """Break a " | "-delimited string into the list of its parts."""
    return joined_text.split(" | ")


# CSV of saved-model predictions; the columns used below are
# 'Actual Text' (reference) and 'Generated Text' (model output).
results_path = "results/predictions/prediction_of_saved_models-Mistral-7B-Instruct-v0.2-trial_1_train_10000_lr5e-06_bs2-checkpoint-25000.csv"
paragraph2compound_test_df = pd.read_csv(results_path)

# Add list-valued columns derived from the two delimited text columns.
# NOTE(review): a non-string cell (e.g. NaN from an empty CSV field) has no
# .split and would raise here - confirm the file never contains one.
paragraph2compound_test_df['ground_truth'] = paragraph2compound_test_df['Actual Text'].map(_split_entities)
paragraph2compound_test_df['prediction'] = paragraph2compound_test_df['Generated Text'].map(_split_entities)

# Bare final expression so the notebook renders the frame.
paragraph2compound_test_df

Unnamed: 0,Generated Text,Actual Text,Paragraph,ground_truth,prediction
0,5-phenylsulfinyl-benzimidazoline-2-thione | al...,5-phenylsulfinyl-benzimidazoline-2-thione | al...,6.86 g (25 mmoles) of 5-phenylsulfinyl-benzimi...,"[5-phenylsulfinyl-benzimidazoline-2-thione, al...","[5-phenylsulfinyl-benzimidazoline-2-thione, al..."
1,[2-bromo-4-(tert-butoxycarbonylamino-methyl)-p...,[2-bromo-4-(tert-butoxycarbonylamino-methyl)-p...,A mixture of [2-bromo-4-(tert-butoxycarbonylam...,[[2-bromo-4-(tert-butoxycarbonylamino-methyl)-...,[[2-bromo-4-(tert-butoxycarbonylamino-methyl)-...
2,trimethylsilyl bromide | [4-(3-methoxypropyl)-...,trimethylsilyl bromide | [4-(3-methoxypropyl)-...,1.560 ml of trimethylsilyl bromide are added d...,"[trimethylsilyl bromide, [4-(3-methoxypropyl)-...","[trimethylsilyl bromide, [4-(3-methoxypropyl)-..."
3,"N-isopropyl-3-(3',4'-dimethoxyphenyl)-piperidi...","N-isopropyl-3-(3',4'-dimethoxyphenyl)-piperidi...","A solution of 4.8 g of N-isopropyl-3-(3',4'-di...","[N-isopropyl-3-(3',4'-dimethoxyphenyl)-piperid...","[N-isopropyl-3-(3',4'-dimethoxyphenyl)-piperid..."
4,thionyl chloride | 5-methoxy-1H-indole-2-carbo...,thionyl chloride | 5-methoxy-1H-indole-2-carbo...,14.5 ml of thionyl chloride are added dropwise...,"[thionyl chloride, 5-methoxy-1H-indole-2-carbo...","[thionyl chloride, 5-methoxy-1H-indole-2-carbo..."
...,...,...,...,...,...
995,"(E)-1-(1-hydroxy-3,3-dimethyl-1,3-dihydrobenzo...","crude compound | (E)-1-(1-hydroxy-3,3-dimethyl...",A mixture of crude compound (E)-1-(1-hydroxy-3...,"[crude compound, (E)-1-(1-hydroxy-3,3-dimethyl...","[(E)-1-(1-hydroxy-3,3-dimethyl-1,3-dihydrobenz..."
996,tert-butyl 3-aminopropylcarbamate | toluene | ...,tert-butyl 3-aminopropylcarbamate | toluene | ...,To a solution of tert-butyl 3-aminopropylcarba...,"[tert-butyl 3-aminopropylcarbamate, toluene, t...","[tert-butyl 3-aminopropylcarbamate, toluene, t..."
997,methanesulfonate | diglycolic anhydride | 4-am...,methanesulfonate | (a) | diglycolic anhydride ...,methanesulfonate by (a) reacting diglycolic an...,"[methanesulfonate, (a), diglycolic anhydride, ...","[methanesulfonate, diglycolic anhydride, 4-ami..."
998,Ethyl trifluoroacetate | tert-butyl methyl eth...,Ethyl trifluoroacetate | tert-butyl methyl eth...,Ethyl trifluoroacetate (6.32 g) was dissolved ...,"[Ethyl trifluoroacetate, tert-butyl methyl eth...","[Ethyl trifluoroacetate, tert-butyl methyl eth..."


In [2]:
def _entity_set_metrics(ground_truth_entities, predicted_entities):
    """Score one prediction against its reference as sets of entity names.

    Both inputs are de-duplicated before counting, so a name that appears
    several times in either list is counted once. Counting TP on sets but
    FP/FN on raw list lengths (as this cell previously did) penalises every
    repeated name: a perfect prediction ['a', 'a'] against ['a', 'a'] would
    score precision = recall = 0.5.

    Parameters
    ----------
    ground_truth_entities : iterable of hashable
        Reference entity names for one row.
    predicted_entities : iterable of hashable
        Predicted entity names for the same row.

    Returns
    -------
    tuple of float
        (precision, recall, f1score, jaccard). Any ratio whose denominator
        is zero is reported as 0.0.
    """
    truth_set = set(ground_truth_entities)
    predicted_set = set(predicted_entities)

    # Calculate TP, FP, FN on the de-duplicated sets
    TP = len(truth_set & predicted_set)
    FP = len(predicted_set - truth_set)
    FN = len(truth_set - predicted_set)

    precision = TP / (TP + FP) if TP + FP else 0.0
    recall = TP / (TP + FN) if TP + FN else 0.0

    if precision + recall == 0:
        f1score = 0.0
    else:
        f1score = 2 * (precision * recall) / (precision + recall)

    # TP + FP + FN is the size of the union; guard it like the other ratios
    # so two empty inputs give 0.0 instead of ZeroDivisionError.
    union_size = TP + FP + FN
    jaccard = TP / union_size if union_size else 0.0

    return precision, recall, f1score, jaccard


# Lists to store metrics for each row
precisions = []
recalls = []
f1_scores = []
jaccard_indices = []
VERBOSE = False # True

# Iterate through each row and compute metrics
for index, row in paragraph2compound_test_df.iterrows():
    paragraph_text = row['Paragraph']
    ground_truth_entities = row['ground_truth']
    predicted_entities = row['prediction']

    if VERBOSE:
        print("paragraph_text: \n", paragraph_text)
        print("ground_truth_entities: \n", ground_truth_entities)
        print("predicted_entities: \n", predicted_entities)

    precision, recall, f1score, jaccard = _entity_set_metrics(
        ground_truth_entities, predicted_entities
    )

    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1score)
    jaccard_indices.append(jaccard)

    # Debug mode: inspect only the first row, so the averages below then
    # cover that single row.
    if VERBOSE:
        break

# Calculate average metrics (unweighted mean of the per-row scores)
print("avg_Precision: ", np.mean(precisions))
print("avg_Recall: ", np.mean(recalls))
print("avg_F1_score: ", np.mean(f1_scores))
print("avg_Jaccard_index: ", np.mean(jaccard_indices))

avg_Precision:  0.8698658300184658
avg_Recall:  0.8858837459968651
avg_F1_score:  0.8731743644423936
avg_Jaccard_index:  0.7988926875445815
