In [51]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [52]:
import sys
import os

path = "/content/drive/MyDrive/NLP_Project_New"
sys.path.append(os.path.abspath(path))

In [53]:
import pandas as pd
from datetime import datetime
import os.path

import logging

logfile = f"{path}/logs/evaluate_model.log"
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

file_handler = logging.FileHandler(logfile)
file_handler.setFormatter(formatter)
file_handler.setLevel(logging.INFO)

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
stream_handler.setLevel(logging.INFO)

logger = logging.getLogger("evaluate_model.log")
logger.setLevel(logging.INFO)

logger.addHandler(file_handler)
logger.addHandler(stream_handler)

In [54]:
# get current date and time
now = datetime.now()
timestamp = now.strftime("%m-%d-%H-%M-%S")
test_pred_path = f'{path}/model_comparisons/test-predictions_comparison.tsv'

df = pd.read_csv(test_pred_path, sep="\t")

logger.info(f"Initialised model evaluation. Predictions table loaded from location {test_pred_path}")


2024-03-28 12:04:07,903 - evaluate_model.log - INFO - Initialised model evaluation. Predictions table loaded from location /content/drive/MyDrive/NLP_Project_New/model_comparisons/test-predictions_comparison.tsv
2024-03-28 12:04:07,903 - evaluate_model.log - INFO - Initialised model evaluation. Predictions table loaded from location /content/drive/MyDrive/NLP_Project_New/model_comparisons/test-predictions_comparison.tsv
2024-03-28 12:04:07,903 - evaluate_model.log - INFO - Initialised model evaluation. Predictions table loaded from location /content/drive/MyDrive/NLP_Project_New/model_comparisons/test-predictions_comparison.tsv
2024-03-28 12:04:07,903 - evaluate_model.log - INFO - Initialised model evaluation. Predictions table loaded from location /content/drive/MyDrive/NLP_Project_New/model_comparisons/test-predictions_comparison.tsv
INFO:evaluate_model.log:Initialised model evaluation. Predictions table loaded from location /content/drive/MyDrive/NLP_Project_New/model_comparisons/te

In [55]:
# get overview over possible attempts:
print("The available attempts are:\n")
for col in df.columns[2:]:
    model_i = col[:-16]
    date_i = col[-14:]
    print(f"Name: {col}\t(Model: {model_i}, Date: {date_i})")

The available attempts are:

Name: bert_base_german_cased_finetuned_3ep--03-24-14-25-43	(Model: bert_base_german_cased_finetuned_3ep, Date: 03-24-14-25-43)
Name: dbmdz/bert-base-german-cased--03-24-14-56-59	(Model: dbmdz/bert-base-german-cased, Date: 03-24-14-56-59)
Name: dbmdz/bert-base-german-cased--03-24-16-47-47	(Model: dbmdz/bert-base-german-cased, Date: 03-24-16-47-47)
Name: dbmdz/flair-historic-ner-lft--03-24-17-27-23	(Model: dbmdz/flair-historic-ner-lft, Date: 03-24-17-27-23)
Name: dbmdz/flair-historic-ner-onb--03-25-09-14-23	(Model: dbmdz/flair-historic-ner-onb, Date: 03-25-09-14-23)
Name: dbmdz/flair-historic-ner-onb_finetuned_3ep--03-26-05-26-04	(Model: dbmdz/flair-historic-ner-onb_finetuned_3ep, Date: 03-26-05-26-04)
Name: dbmdz/flair-historic-ner-onb_finetuned_3ep--03-26-09-30-27	(Model: dbmdz/flair-historic-ner-onb_finetuned_3ep, Date: 03-26-09-30-27)
Name: dbmdz/flair-historic-ner-onb_finetuned_3ep_2e-5--03-26-09-37-54	(Model: dbmdz/flair-historic-ner-onb_finetuned_3ep_2

In [56]:
# which attempt do we want to evaluate?
attempt = input("Please enter the name of the attempt you want to evaluate.")
if attempt in df.columns[2:]:
    logger.info(f"Chosen attempt to evaluate: {attempt}")
else:
    logger.error(f"The chosen attempt name ({attempt}) does not match the attempts in the prediciton s table!")
    raise Exception("The attempt entered does not match the attempts in the predictions table!")

Please enter the name of the attempt you want to evaluate./content/drive/MyDrive/NLP_Project_New/models/bert_lessdense_names_finetuned_6ep_lr0-0002--03-28-11-51-31


2024-03-28 12:04:19,107 - evaluate_model.log - INFO - Chosen attempt to evaluate: /content/drive/MyDrive/NLP_Project_New/models/bert_lessdense_names_finetuned_6ep_lr0-0002--03-28-11-51-31
2024-03-28 12:04:19,107 - evaluate_model.log - INFO - Chosen attempt to evaluate: /content/drive/MyDrive/NLP_Project_New/models/bert_lessdense_names_finetuned_6ep_lr0-0002--03-28-11-51-31
2024-03-28 12:04:19,107 - evaluate_model.log - INFO - Chosen attempt to evaluate: /content/drive/MyDrive/NLP_Project_New/models/bert_lessdense_names_finetuned_6ep_lr0-0002--03-28-11-51-31
2024-03-28 12:04:19,107 - evaluate_model.log - INFO - Chosen attempt to evaluate: /content/drive/MyDrive/NLP_Project_New/models/bert_lessdense_names_finetuned_6ep_lr0-0002--03-28-11-51-31
INFO:evaluate_model.log:Chosen attempt to evaluate: /content/drive/MyDrive/NLP_Project_New/models/bert_lessdense_names_finetuned_6ep_lr0-0002--03-28-11-51-31


In [57]:
def helper(x: str) -> str:
    if x == "B-PER":
        return 1
    elif x == "I-PER":
        return 2
    elif x == "B-LOC":
        return 3
    elif x == "I-LOC":
        return 4
    else:
        return 0

In [58]:
df_numeric = df[["NER", attempt]]
df_numeric.loc[:,"NER"] = df_numeric["NER"].apply(helper)
df_numeric.loc[:,attempt] = df_numeric[attempt].apply(helper)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numeric.loc[:,"NER"] = df_numeric["NER"].apply(helper)
  df_numeric.loc[:,"NER"] = df_numeric["NER"].apply(helper)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numeric.loc[:,attempt] = df_numeric[attempt].apply(helper)
  df_numeric.loc[:,attempt] = df_numeric[attempt].apply(helper)


In [59]:
false_positives = 0
false_negatives = 0
true_positives = 0
true_negatives = 0

for index, row in df_numeric.iterrows():
    a = row['NER']
    b = row[attempt]

    if a < b:
        false_positives += 1
    elif a > b:
        false_negatives += 1
    elif (a == b) and (a > 0):
        true_positives += 1
    else:
        true_negatives += 1

#print(f"True Positives: {true_positives}, False Positives: {false_positives}, True Negatives: {true_negatives}, False Negatives: {false_negatives}")

correct_predictions = true_positives + true_negatives
false_predictions = false_positives + false_negatives
all_predictions = correct_predictions + false_predictions

precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
accuracy = (true_positives + true_negatives) / (true_positives + true_negatives + false_positives + false_negatives)
f1 = (2 * precision * recall) / (precision + recall)

print(f"True Positives:\t\t{true_positives}\nTrue Negatives:\t\t{true_negatives}\nFalse Positives:\t{false_positives}\nFalse Negatives:\t{false_negatives}\n")
print(f"Correct predictions:\t{correct_predictions}\nFalse predictions:\t{false_predictions}\nAll predictions:\t{all_predictions}\n")
print(f"Therefore accuracy:\t{correct_predictions} ÷ {all_predictions} = {correct_predictions / all_predictions}\n")

logger.info(f"EVALUATION RESULTS:\nPrecision:\t\t{precision}\nRecall:\t\t\t{recall}\nAccuracy:\t\t{accuracy}\nF1-score:\t\t{f1}\n")

2024-03-28 12:04:33,566 - evaluate_model.log - INFO - EVALUATION RESULTS:
Precision:		0.5335426777048244
Recall:			0.6190791652865187
Accuracy:		0.9829107047492772
F1-score:		0.5731370745170192

2024-03-28 12:04:33,566 - evaluate_model.log - INFO - EVALUATION RESULTS:
Precision:		0.5335426777048244
Recall:			0.6190791652865187
Accuracy:		0.9829107047492772
F1-score:		0.5731370745170192

2024-03-28 12:04:33,566 - evaluate_model.log - INFO - EVALUATION RESULTS:
Precision:		0.5335426777048244
Recall:			0.6190791652865187
Accuracy:		0.9829107047492772
F1-score:		0.5731370745170192

2024-03-28 12:04:33,566 - evaluate_model.log - INFO - EVALUATION RESULTS:
Precision:		0.5335426777048244
Recall:			0.6190791652865187
Accuracy:		0.9829107047492772
F1-score:		0.5731370745170192

INFO:evaluate_model.log:EVALUATION RESULTS:
Precision:		0.5335426777048244
Recall:			0.6190791652865187
Accuracy:		0.9829107047492772
F1-score:		0.5731370745170192



True Positives:		1869
True Negatives:		158256
False Positives:	1634
False Negatives:	1150

Correct predictions:	160125
False predictions:	2784
All predictions:	162909

Therefore accuracy:	160125 ÷ 162909 = 0.9829107047492772



In [60]:
# maybe add some commentary about the attempt
comments = input("Add comments about this attempt here.")

Add comments about this attempt here.bert fine tuned on train data with 10% sentences w/o names. LR 0.0002


In [61]:
eval_results = {'attempt': attempt,
                'time': timestamp,
                'model_name': attempt[:-16],
                'precision': precision,
                'recall': recall,
                'accuracy': accuracy,
                'F1-score': f1,
                'comments': comments}

In [62]:
savepath = f'{path}/model_comparisons/model_results.csv'

if os.path.isfile(savepath):
    model_comp_df = pd.read_csv(savepath)
    model_comp_df = model_comp_df.append(eval_results, ignore_index=True)
    model_comp_df.to_csv(savepath, index=False)
    logger.info(f"Appended evaluation results to table at location {savepath}.")
else:
    model_comp_df = pd.DataFrame(columns = ['attempt', 'time', 'model_name', 'precision', 'recall', 'accuracy', 'F1-score', 'comments'])
    model_comp_df = model_comp_df.append(eval_results, ignore_index=True)
    model_comp_df.to_csv(savepath, index=False)
    logger.info(f"Couldn't find a table for evaluation results, so I created one at location {savepath}")

  model_comp_df = model_comp_df.append(eval_results, ignore_index=True)
2024-03-28 12:05:53,307 - evaluate_model.log - INFO - Appended evaluation results to table at location /content/drive/MyDrive/NLP_Project_New/model_comparisons/model_results.csv.
2024-03-28 12:05:53,307 - evaluate_model.log - INFO - Appended evaluation results to table at location /content/drive/MyDrive/NLP_Project_New/model_comparisons/model_results.csv.
2024-03-28 12:05:53,307 - evaluate_model.log - INFO - Appended evaluation results to table at location /content/drive/MyDrive/NLP_Project_New/model_comparisons/model_results.csv.
2024-03-28 12:05:53,307 - evaluate_model.log - INFO - Appended evaluation results to table at location /content/drive/MyDrive/NLP_Project_New/model_comparisons/model_results.csv.
INFO:evaluate_model.log:Appended evaluation results to table at location /content/drive/MyDrive/NLP_Project_New/model_comparisons/model_results.csv.


In [63]:
model_comp_df.head(20)

Unnamed: 0,attempt,time,model_name,precision,recall,accuracy,F1-score,comments
0,de_core_news_sm--03-06-13-32-49,03-06-13-33-47,de_core_news_sm,0.136436,0.451839,0.935309,0.209586,first attempt with default German NER for SpaCy
1,de_core_news_sm--03-06-16-32-23,03-06-22-35-12,de_core_news_sm,0.13186,0.488288,0.936221,0.207646,
2,de_core_news_sm--03-10-16-40-20,03-10-16-42-28,de_core_news_sm,0.143152,0.475171,0.93255,0.22002,Attempt with unaltered de_core_news_sm model
3,models/sermons_30it--03-10-16-43-47,03-10-16-44-31,models/sermons_30it,0.716998,0.469047,0.984187,0.567105,"First attempt with model trained from scratch,..."
4,models/sermons_60it--03-13-15-28-41,03-13-15-29-27,models/sermons_60it,0.707806,0.445117,0.984792,0.546535,custom spacy ner model trained over 60 iterations
5,bert_base_german_cased_finetuned_3ep--03-24-14...,03-24-14-44-55,bert_base_german_cased_finetuned_3ep,0.78918,0.607001,0.989681,0.686205,First attempt at finetuning generic bert
6,dbmdz/bert-base-german-cased--03-24-14-56-59,03-24-15-18-18,dbmdz/bert-base-german-cased,0.009615,0.09511,0.805223,0.017464,Attempt with unaltered bert_base_german_cased
7,dbmdz/flair-historic-ner-lft--03-24-17-27-23,03-24-17-34-38,dbmdz/flair-historic-ner-lft,0.299613,0.440241,0.970867,0.356562,Attempt with unaltered historic german ner model
8,dbmdz/flair-historic-ner-onb--03-25-09-14-23,03-25-09-21-17,dbmdz/flair-historic-ner-onb,0.299613,0.440241,0.970867,0.356562,Attempt with default hist-ner with training da...
9,dbmdz/flair-historic-ner-onb_finetuned_3ep--03...,03-26-05-27-25,dbmdz/flair-historic-ner-onb_finetuned_3ep,0.771478,0.146302,0.983101,0.24596,Attempt at finetuning historic flair
