### Load Packages and Data

In [1]:
import pandas as pd
import joblib
import pandas as pd
import numpy as np
from typing import Tuple, Sequence, List, Optional, Iterable, Any, Iterator, Callable
import textdistance

def levenshtein_similarity(truth: Sequence[str], pred: Sequence[str]) -> float:
    """Return the mean normalized Levenshtein similarity of aligned string pairs.

    Each ``truth[i]`` is compared with ``pred[i]`` using
    ``textdistance.levenshtein.normalized_similarity`` and the per-pair
    scores are averaged.

    Args:
        truth: Reference strings (any sized iterable, e.g. a list or a
            pandas Series).
        pred: Predicted strings, aligned one-to-one with ``truth``.

    Returns:
        The arithmetic mean of the per-pair similarity scores.

    Raises:
        AssertionError: If ``truth`` and ``pred`` differ in length.
        ValueError: If the inputs are empty, because the mean is undefined.
    """
    assert len(truth) == len(pred), "truth and pred must have the same length"
    if len(truth) == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError.
        raise ValueError("cannot compute levenshtein_similarity on empty inputs")
    scores = sum(
        textdistance.levenshtein.normalized_similarity(t, p)
        for t, p in zip(truth, pred)
    )
    return scores / len(truth)

def full_sentence_accuracy(truth: Sequence[str], pred: Sequence[str]) -> float:
    """Return the fraction of predictions that exactly match their reference.

    Args:
        truth: Reference strings (any sized iterable, e.g. a list or a
            pandas Series).
        pred: Predicted strings, aligned one-to-one with ``truth``.

    Returns:
        Number of positions where ``truth[i] == pred[i]`` divided by the
        number of pairs, i.e. a value between 0.0 and 1.0.

    Raises:
        AssertionError: If ``truth`` and ``pred`` differ in length.
        ValueError: If the inputs are empty, because the ratio is undefined.
    """
    assert len(truth) == len(pred), "truth and pred must have the same length"
    if len(truth) == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError.
        raise ValueError("cannot compute full_sentence_accuracy on empty inputs")
    correct_count = sum(int(t == p) for t, p in zip(truth, pred))
    return correct_count / len(truth)

# Inputs: the reference test set and one checkpoint's predictions to score.
truth_file = 'data/data_for_bart_or_t5/test_300_one_column_lstrip_add_space.csv'
predict_file = 'saved_models/t5-base/train_200_t5-base_lr_1e-05_epoch_50_bs_8_intervel_1/predictions/epoch_10.csv'

# Both tables are read the same way: UTF-8, with every missing cell replaced
# by the literal string 'N/A' so that no NaN values remain in the frames.
truth_df, predict_df = (
    pd.read_csv(csv_path, encoding='utf-8').fillna('N/A')
    for csv_path in (truth_file, predict_file)
)

# Display the predictions table as the cell output.
predict_df

Unnamed: 0,Paragraph,Generated Text,Actual Text
0,Paragraph_2_IUPAC: Synthesis and crystallizati...,"1-(3,5-bis­(tri­fluoro­meth­yl)phen­yl)-2-brom...","1-(3,5-bis(trifluoromethyl)phenyl)-2-bromoetha..."
1,Paragraph_2_IUPAC: Synthesis of N-(diaminometh...,N-(diaminomethylidene)-4-[(E)-(4-hydroxyphenyl...,N-(diaminomethylidene)-4-[(E)-(4-hydroxyphenyl...
2,"Paragraph_2_IUPAC: 1-(3,4-Dimethylphenyl)-5-ph...","1-(3,4-Dimethylphenyl)-5-phenyl-1H-pyrazole-3,...","1-(3,4-Dimethylphenyl)-5-phenyl-1H-pyrazole-3,..."
3,Paragraph_2_IUPAC: Synthesis of [(M)-d-4]-C12-...,[(M)-d-4]-C12-TEG,[(M)-d-4]-C12-TEG
4,Paragraph_2_IUPAC: 4.1 3-Amino-5-(1-methyl-1H-...,3-Amino-5-(1-methyl-1H-pyrazol-4-yl)pyridin-2(...,3-Amino-5-(1-methyl-1H-pyrazol-4-yl)pyridin-2(...
...,...,...,...
2095,Paragraph_2_13C_NMR_data: (S)-5-Benzyl 1-tert-...,"172.9, 172.5, 172.0, 171.9, 156.8, 135.8, 128....","172.9, 172.5, 172.0, 171.9, 156.8, 135.8, 128...."
2096,Paragraph_2_13C_NMR_data: 2.1.1. Synthesis of ...,"166.9, 143.7, 137.2, 136.1, 132.5, 130.4, 129....","166.9, 143.7, 137.2, 136.1, 132.5, 130.4, 129...."
2097,Paragraph_2_13C_NMR_data: N-(2-((4-(3-(3-Hydro...,"158.6, 157.6, 149.5, 141.9, 140.3, 139.3, 138....","158.6, 157.6, 149.5, 141.9, 140.3, 139.3, 138...."
2098,"Paragraph_2_13C_NMR_data: 3.2.7. (1R,4aS)-2-Hy...","177.6, 151.1, 146.7, 145.2, 134.2, 126.5, 124....","177.6, 151.1, 146.7, 145.2, 134.2, 126.5, 124...."


### Evaluate

In [2]:
# Split into 7 tasks
len_data = len(predict_df)
print('length_of_data', len_data)

# The rows are split purely by position into seven consecutive chunks of
# len_data // 7 rows; the last chunk runs to the end of the frame, so it also
# takes any remainder rows.
# NOTE(review): this assumes the prediction file lists the tasks back to back
# in the order used below -- confirm against the code that writes the file.
rows_per_task = len_data // 7
task_bounds = [rows_per_task * k for k in range(7)] + [len_data]
(
    iupac_rows,
    h1_text_rows,
    c13_text_rows,
    h1_conditions_rows,
    h1_data_rows,
    c13_conditions_rows,
    c13_data_rows,
) = [slice(start, stop) for start, stop in zip(task_bounds[:-1], task_bounds[1:])]

actual_text = predict_df['Actual Text']
generated_text = predict_df['Generated Text']

# Per-task (reference, prediction) pairs; the names are kept at module level
# so that later cells can still refer to them.
ground_truth_IUPAC, predict_IUPAC = actual_text[iupac_rows], generated_text[iupac_rows]
ground_truth_1H_NMR, predict_1HNMR = actual_text[h1_text_rows], generated_text[h1_text_rows]
ground_truth_13C_NMR, predict_13CNMR = actual_text[c13_text_rows], generated_text[c13_text_rows]
ground_truth_1H_NMR_conditions, predict_1HNMR_conditions = actual_text[h1_conditions_rows], generated_text[h1_conditions_rows]
ground_truth_1H_NMR_data, predict_1HNMR_data = actual_text[h1_data_rows], generated_text[h1_data_rows]
ground_truth_13C_NMR_conditions, predict_13CNMR_conditions = actual_text[c13_conditions_rows], generated_text[c13_conditions_rows]
ground_truth_13C_NMR_data, predict_13CNMR_data = actual_text[c13_data_rows], generated_text[c13_data_rows]

# Reporting order (differs from the storage order above): 1H tasks first,
# then 13C tasks.
task_pairs = [
    ("IUPAC", ground_truth_IUPAC, predict_IUPAC),
    ("1HNMR_text", ground_truth_1H_NMR, predict_1HNMR),
    ("1HNMR_conditions", ground_truth_1H_NMR_conditions, predict_1HNMR_conditions),
    ("1HNMR_data", ground_truth_1H_NMR_data, predict_1HNMR_data),
    ("13CNMR_text", ground_truth_13C_NMR, predict_13CNMR),
    ("13CNMR_conditions", ground_truth_13C_NMR_conditions, predict_13CNMR_conditions),
    ("13CNMR_data", ground_truth_13C_NMR_data, predict_13CNMR_data),
]

for task_label, task_truth, task_pred in task_pairs:
    print(f"levenshtein_similarity_of_{task_label}: ", levenshtein_similarity(task_truth, task_pred))
print()
for task_label, task_truth, task_pred in task_pairs:
    print(f"full_sentence_accuracy_of_{task_label}: ", full_sentence_accuracy(task_truth, task_pred))

length_of_data 2100
levenshtein_similarity_of_IUPAC:  0.9596404007329166
levenshtein_similarity_of_1HNMR_text:  0.9044517033002613
levenshtein_similarity_of_1HNMR_conditions:  0.9432796800673551
levenshtein_similarity_of_1HNMR_data:  0.9174531695499049
levenshtein_similarity_of_13CNMR_text:  0.8245892721296092
levenshtein_similarity_of_13CNMR_conditions:  0.8976495188815012
levenshtein_similarity_of_13CNMR_data:  0.8201180981637292

full_sentence_accuracy_of_IUPAC:  0.86
full_sentence_accuracy_of_1HNMR_text:  0.03666666666666667
full_sentence_accuracy_of_1HNMR_conditions:  0.8333333333333334
full_sentence_accuracy_of_1HNMR_data:  0.7833333333333333
full_sentence_accuracy_of_13CNMR_text:  0.043333333333333335
full_sentence_accuracy_of_13CNMR_conditions:  0.7633333333333333
full_sentence_accuracy_of_13CNMR_data:  0.6866666666666666
