In [1]:
import pandas as pd
import ast
from google.colab import drive
from pathlib import Path, PurePath
import numpy as np

## Set up directory to read file

Change `local_dir` to the folder that contains the test results file.

In [2]:
# Mount Google Drive in the Colab runtime, then build the data folder path
# from its components.
drive.mount('/content/drive')
drive_path = PurePath('/content/drive', 'My Drive')
local_dir = drive_path.joinpath('COVID-19', 'dataverse')
file = 'time_period_results.csv'

Mounted at /content/drive


In [3]:
# Manually reviewed evaluation labels, read from the data folder.
evals = pd.read_csv(local_dir.joinpath('evaluation_labels.csv'))

In [4]:
# Where no revised label was recorded, fall back to the original cf_label.
evals['revised_cf_label'] = evals['revised_cf_label'].fillna(evals['cf_label'])

In [5]:
# Keep every row except those whose revised label is 'del'.
evals = evals.loc[evals['revised_cf_label'].ne('del')]

In [6]:
# (rows, columns) of the evaluation table after the 'del' rows were dropped.
evals.shape

(265, 7)

In [7]:
# Row count only (same number as the first element of evals.shape).
len(evals)

265

In [8]:
# Relabel the rows 0..len(evals)-1: the boolean filter applied earlier keeps
# the original index labels of the surviving rows.
evals.index = np.arange(len(evals))

In [9]:
# Model predictions alongside the ground truth, one sentence per row.
result = pd.read_csv(local_dir.joinpath(file))

In [10]:
# Display the loaded results table.
result

Unnamed: 0,sentence,true_text,true_label,predicted_text,predicted_label
0,We determined that the mean duration from onse...,"['mean duration from onset to discharge', '18....","['TPcontext', 'TPdata']","['mean duration from onset to discharge', '18....","['TPcontext', 'TPdata']"
1,The mean incubation period was estimated at 5·...,"['mean incubation period', '5·2 days (95% CI 1...","['TPcontext', 'TPdata', 'TPcontext', 'TPdata']","['mean incubation period', '5·2 days (95% CI 1...","['TPcontext', 'TPdata', 'TPcontext', 'TPdata']"
2,The median duration of fever was 9 days (range...,"['median duration of fever', '9 days (range: 2...","['TPcontext', 'TPdata']","['median duration of fever', '9 days (range: 2...","['TPcontext', 'TPdata']"
3,There is a mean 5-day delay from symptom onset...,"['hospitalized', '3-7 days after onset']","['TPcontext', 'TPdata']",['mean 5-day delay from symptom onset to detec...,"['TPcontext', 'TPdata']"
4,This proportion was obtained by simulating val...,"['mean incubation time', '5.2 days']","['TPcontext', 'TPdata']","['mean incubation time', '5.2 days']","['TPcontext', 'TPdata']"
...,...,...,...,...,...
86,14.15 If effectiveness of intervention in cont...,"['incubation period', '5.8 days']","['TPcontext', 'TPdata']","['incubation period', '5.8 days']","['TPcontext', 'TPdata']"
87,In the provinces outside Hubei we estimate the...,"['serial interval', 'on average 5.1 days (95%C...","['TPcontext', 'TPdata']",['on average 5.1 days (95%CI: 1.3-11.6)'],['TPdata']
88,We observed the mean serial interval was 4.1 d...,"['mean serial interval', '4.1 days', '8.4-day ...","['TPcontext', 'TPdata', 'TPdata', 'TPcontext']","['mean serial interval', '4.1 days', '7.6-day ...","['TPcontext', 'TPdata', 'TPcontext']"
89,"5 During the 2003 SARS outbreak, 238 probable ...","[' aged', '65 years and older']","['TPcontext', 'TPdata']","['among people', '65 years']","['TPcontext', 'TPdata']"


# Compare actual and predicted texts and labels

Adds an additional column, `cf_label`, that marks each extracted text as a true positive (`tp`), a false positive (`fp`), a false negative (`fn`), or both a false negative and a false positive (`fn & fp`). The method is outlined [here](https://towardsdatascience.com/entity-level-evaluation-for-ner-task-c21fb3a8edf).

In [11]:
def calc_cf(x):
  '''Classify the predicted entities of one result row against the ground truth.

  Parameters
  ----------
  x : mapping or pandas.Series
      One row with the keys 'sentence', 'true_text', 'predicted_text',
      'true_label' and 'predicted_label'. The last four hold the string
      representation of a Python list and are parsed with ast.literal_eval.

  Returns
  -------
  pandas.DataFrame
      One row per compared entity with the columns 'sentence', 'true_text',
      'predicted_text', 'true_label', 'predicted_label' and 'cf_label'.

  Notes
  -----
  When the true and predicted label lists have the same length the entities
  are compared position by position: a pair is 'tp' when the labels are equal
  and one text contains the other, otherwise it is 'fn & fp'. When the
  lengths differ the comparison is delegated to compare_two_list.
  '''
  columns = ['sentence','true_text','predicted_text','true_label','predicted_label','cf_label']
  sentence = x['sentence']
  true_texts = ast.literal_eval(x['true_text'])
  predicted_texts = ast.literal_eval(x['predicted_text'])
  true_labels = ast.literal_eval(x['true_label'])
  predicted_labels = ast.literal_eval(x['predicted_label'])

  if len(true_labels) != len(predicted_labels):
    results_list = compare_two_list(true_labels, predicted_labels, true_texts, predicted_texts, sentence)
    return pd.DataFrame(results_list, columns=columns)

  results_list = []
  texts = list(zip(true_texts, predicted_texts))
  labels = list(zip(true_labels, predicted_labels))
  for i, (true_text, predict_text) in enumerate(texts):
    # Indexing (rather than a four-way zip) keeps a loud IndexError when a
    # row has more texts than labels.
    (true_label, predict_label) = labels[i]
    texts_overlap = (true_text in predict_text) or (predict_text in true_text)
    # Every pair that is not a true positive is both a missed entity and a
    # wrong prediction, so there is no third outcome to handle.
    label = 'tp' if (texts_overlap and true_label == predict_label) else 'fn & fp'
    results_list.append({'sentence':sentence,'true_text':true_text,'predicted_text':predict_text,'true_label':true_label,'predicted_label':predict_label,'cf_label':label})
  return pd.DataFrame(results_list, columns=columns)

In [12]:
def compare_two_list(true_label, predicted_label, true_text, predicted_text, sentence):
  '''Label predictions and ground-truth entities when the lists differ in length.

  NOTE: a later cell defines compare_two_list again, so on a top-to-bottom
  run that later definition replaces this one.

  First pass: each predicted text is 'tp' if it is contained in some true text
  carrying the same label (first such true entity wins), otherwise 'fp'.
  Second pass: each true text that is not contained in any predicted text
  with the same label is added as 'fn'.

  Returns a list of dicts with the keys 'sentence', 'true_text',
  'predicted_text', 'true_label', 'predicted_label' and 'cf_label'.
  '''
  def _record(actual_text, predict_text, actual_label, predict_label, cf_label):
    return {'sentence':sentence,'true_text':actual_text,'predicted_text':predict_text,'true_label':actual_label,'predicted_label':predict_label,'cf_label':cf_label}

  rows = []

  # Pass 1: predictions -> true positives / false positives
  for p_idx, p_text in enumerate(predicted_text):
    hit = next(
      (t_idx for t_idx, t_text in enumerate(true_text)
       if (p_text in t_text) and (predicted_label[p_idx] == true_label[t_idx])),
      None,
    )
    if hit is None:
      rows.append(_record(None, p_text, None, predicted_label[p_idx], 'fp'))
    else:
      rows.append(_record(true_text[hit], p_text, true_label[hit], predicted_label[p_idx], 'tp'))

  # Pass 2: ground truth -> false negatives
  for t_idx, t_text in enumerate(true_text):
    covered = any(
      (t_text in p_text) and (true_label[t_idx] == predicted_label[p_idx])
      for p_idx, p_text in enumerate(predicted_text)
    )
    if not covered:
      rows.append(_record(t_text, None, true_label[t_idx], None, 'fn'))
  return rows

In [13]:
def compare_two_list(true_label, predicted_label, true_text, predicted_text, sentence):
  '''Label predictions and ground-truth entities when the lists differ in length.

  Parameters
  ----------
  true_label, predicted_label : list of str
      Entity labels, parallel to true_text / predicted_text.
  true_text, predicted_text : list of str
      Ground-truth and predicted entity texts.
  sentence : str
      Source sentence, copied into every output record.

  Returns
  -------
  list of dict
      Records with the keys 'sentence', 'true_text', 'predicted_text',
      'true_label', 'predicted_label' and 'cf_label', in this order:
      all 'tp' records (in true-entity order), then 'fn' records for the
      unmatched true entities, then 'fp' records for the unmatched predictions.

  Notes
  -----
  A true entity matches the first prediction that has the same label and
  whose text contains, or is contained in, the true text. One prediction may
  match several true entities. Matches are tracked by list position, not by
  text, so two entities that share the same text but were not both matched
  are still reported individually as 'fn' / 'fp'.
  '''
  def _record(actual_text, predict_text, actual_label, predict_label, cf_label):
    return {'sentence':sentence,'true_text':actual_text,'predicted_text':predict_text,'true_label':actual_label,'predicted_label':predict_label,'cf_label':cf_label}

  results_list = []
  matched_true = set()   # positions in true_text that found a partner
  matched_pred = set()   # positions in predicted_text that were used as a partner

  for i, actual_text in enumerate(true_text):
    for j, predict_text in enumerate(predicted_text):
      texts_overlap = (predict_text in actual_text) or (actual_text in predict_text)
      if texts_overlap and true_label[i] == predicted_label[j]:
        matched_true.add(i)
        matched_pred.add(j)
        results_list.append(_record(actual_text, predict_text, true_label[i], predicted_label[j], 'tp'))
        break

  for i, actual_text in enumerate(true_text):
    if i not in matched_true:
      results_list.append(_record(actual_text, None, true_label[i], None, 'fn'))

  for j, predict_text in enumerate(predicted_text):
    if j not in matched_pred:
      results_list.append(_record(None, predict_text, None, predicted_label[j], 'fp'))
  return results_list

In [14]:
# Classify every row of the results table and stack the per-row frames.
# The frames are collected first and concatenated once: growing a DataFrame
# with pd.concat inside the loop copies all previous rows on every iteration.
# Iterating the rows directly also works when the index is not 0..n-1.
frames = [calc_cf(row) for _, row in result.iterrows()]
tot = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [15]:
# Inspect the position-wise mismatches (counted as both fn and fp).
tot.loc[tot['cf_label'].eq('fn & fp')]

Unnamed: 0,sentence,true_text,predicted_text,true_label,predicted_label,cf_label
8,There is a mean 5-day delay from symptom onset...,hospitalized,mean 5-day delay from symptom onset to detecti...,TPcontext,TPcontext,fn & fp
37,Most infected people are close to peak infecti...,"3-5 days, beginning ≈3 days after being exposed",about 3-5 days,TPdata,TPdata,fn & fp
64,The median duration between symptoms onset and...,8. days,median duration between symptoms onset and adm...,TPdata,TPcontext,fn & fp
65,The median duration between symptoms onset and...,median duration between symptoms onset and adm...,8. days in group,TPcontext,TPdata,fn & fp
78,"The median incubation duration was 6 days, ran...",8 patients got more longer incubation duration,1 to 32 days,TPcontext,TPdata,fn & fp
79,"The median incubation duration was 6 days, ran...",more than 14 days,24,TPdata,TPdata,fn & fp
103,We estimated the median incubation period to f...,"2.6 days (CI, 2.1 to 3.7 days)","within 2.6 days (CI, 2.1 to 3.7 days",TPdata,TPdata,fn & fp
132,"12 As the epidemic progressed, we observed a f...",increase in number of cases,65 years and older,TPcontext,TPdata,fn & fp
133,"12 As the epidemic progressed, we observed a f...",aged 65 years and older,(<,TPdata,TPcontext,fn & fp
143,Twenty five (40%) of the patients were aged 19...,aged,Twenty five,TPcontext,TPcontext,fn & fp


# Calculating statistics

In [16]:
# Frequency of each manually revised label.
evals['revised_cf_label'].value_counts()

tp         189
fn          42
fp          17
fn & fp     12
tp & fn      2
fn & fn      1
Name: revised_cf_label, dtype: int64

In [17]:
def calc_stats(x, label):
  '''Count false negatives, false positives and true positives in a label column.

  Parameters
  ----------
  x : pandas.DataFrame
      Table holding one classification label per row. Its index may be
      anything; the rows are read in order, not looked up by index label.
  label : str
      Name of the column that holds the classification labels.

  Returns
  -------
  tuple of int
      (fn, fp, tp), in that order.

  Raises
  ------
  ValueError
      If the column contains a value that is not one of the recognised labels
      ('tp', 'fp', 'fn', 'fn & fp', 'tp & fn', 'fn & fn'). Such values
      (including missing values) would otherwise inflate the true-positive count.
  '''
  # (fn, fp, tp) contribution of each recognised label; compound labels
  # contribute to more than one counter.
  contributions = {
    'fn & fp': (1, 1, 0),
    'tp & fn': (1, 0, 1),
    'fn & fn': (2, 0, 0),
    'fn': (1, 0, 0),
    'fp': (0, 1, 0),
    'tp': (0, 0, 1),
  }
  fn = 0
  fp = 0
  tp = 0
  for value in x[label]:
    try:
      add_fn, add_fp, add_tp = contributions[value]
    except (KeyError, TypeError):
      raise ValueError('Unrecognised classification label in column %r: %r' % (label, value)) from None
    fn += add_fn
    fp += add_fp
    tp += add_tp
  return (fn, fp, tp)

In [18]:
# calc_stats returns its counts in (fn, fp, tp) order.
# NOTE(review): 'postitives' is misspelled in the printed caption; it is left
# untouched here so the printed text does not change.
fn, fp, tp = calc_stats(tot, 'cf_label')
for caption, count in (
  ('Number of false negatives', fn),
  ('Number of false postitives', fp),
  ('Number of true positives', tp),
  ('Total Number of records', fn+fp+tp),
):
  print(caption, count)

Number of false negatives 58
Number of false postitives 52
Number of true positives 173
Total Number of records 283


In [19]:
def cf(tp, fp, fn):
  '''Compute recall, precision and F1 from entity-level counts.

  Parameters
  ----------
  tp, fp, fn : int
      Numbers of true positives, false positives and false negatives.

  Returns
  -------
  tuple of float
      (recall, precision, f1), in that order. A metric whose denominator is
      zero (no true entities, no predictions, or no true positives at all)
      is reported as 0.0 instead of raising ZeroDivisionError.
  '''
  actual_total = tp + fn      # entities present in the ground truth
  predicted_total = tp + fp   # entities the model predicted
  recall = tp/actual_total if actual_total else 0.0
  precision = tp/predicted_total if predicted_total else 0.0
  denominator = recall + precision
  f1 = (2*recall*precision)/denominator if denominator else 0.0
  return (recall, precision, f1)

In [20]:
# Keyword arguments guard against the differing orders: calc_stats yields
# (fn, fp, tp) while cf takes (tp, fp, fn).
(recall, precision, f1) = cf(tp=tp, fp=fp, fn=fn)

In [21]:
# Report each metric rounded to two decimals.
for metric_name, metric_value in (('Precision', precision), ('Recall', recall), ('F1 score', f1)):
  print(metric_name, round(metric_value, 2))

Precision 0.77
Recall 0.75
F1 score 0.76
