In [1]:
import pandas as pd
import os
import pickle
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, average_precision_score, \
                            confusion_matrix

### 1. Combine all results TSVs into one:

In [2]:
# Directory that holds one single-row metrics TSV per trained model.
path = '/home/ndg/users/sbagga1/generalization/results/'
# The combined table is written back into this same directory, so it must be
# skipped while reading; otherwise re-running this cell would read the
# previous combined file back in and duplicate every row.
combined_fname = 'results.tsv'

dfs = []
for fname in os.listdir(path):
    if fname == combined_fname:
        continue
    try:
        d = pd.read_csv(os.path.join(path, fname), delimiter='\t')
    except Exception as e:
        # Best effort: report files that cannot be parsed and keep going.
        print(e, fname)
        continue
    print(fname, d.shape)
    dfs.append(d)


# Stack the per-model rows into one table with a fresh 0..n-1 index.
results_df = pd.concat(dfs, ignore_index=True)
print("\nFinal Shape: ", results_df.shape)

results_df.to_csv(os.path.join(path, combined_fname), sep='\t', index=None)

# Displayed as the cell output: best F1 first.
results_df.sort_values('F1-score', ascending=False)

bilstm_ELMoTrue.tsv (1, 6)
lstm_ELMoFalse.tsv (1, 6)
cnn_ELMoFalse.tsv (1, 6)
stacked_bilstm_ELMoTrue.tsv (1, 6)
stacked_bilstm_ELMoFalse.tsv (1, 6)
lstm_ELMoTrue.tsv (1, 6)
cnn_ELMoTrue.tsv (1, 6)
bilstm_ELMoFalse.tsv (1, 6)
bert_ELMoFalse.tsv (1, 6)

Final Shape:  (9, 6)


Unnamed: 0,Model,F1-score,Precision,Recall,Accuracy,AUPRC
6,"cnn_ELMoTrue_[14, 14, 14]",0.769395,0.713134,0.837383,0.744792,0.829577
0,"bilstm_ELMoTrue_[14, 14, 14]",0.756854,0.723777,0.794763,0.740738,0.794714
3,"stacked_bilstm_ELMoTrue_[14, 14, 14]",0.750416,0.706924,0.799897,0.729164,0.794704
5,"lstm_ELMoTrue_[14, 14, 14]",0.741878,0.707792,0.779986,0.723678,0.789368
8,"bert_ELMoFalse_[14, 14, 14]",0.735699,0.6718,0.813545,0.70255,0.775601
2,"cnn_ELMoFalse_[14, 14, 14]",0.696436,0.709271,0.688494,0.695897,0.756455
4,"stacked_bilstm_ELMoFalse_[14, 14, 14]",0.664621,0.68723,0.649827,0.66869,0.70774
1,"lstm_ELMoFalse_[14, 14, 14]",0.59459,0.691009,0.539989,0.640603,0.696852
7,"bilstm_ELMoFalse_[14, 14, 14]",0.590753,0.695529,0.523624,0.638317,0.68832


In [3]:
# Sanity check: read the combined TSV back from disk and confirm that it
# shows the same ranking as the in-memory table.
check = pd.read_csv(
    '/home/ndg/users/sbagga1/generalization/results/results.tsv',
    sep='\t',
)
check.sort_values(by='F1-score', ascending=False)

Unnamed: 0,Model,F1-score,Precision,Recall,Accuracy,AUPRC
6,"cnn_ELMoTrue_[14, 14, 14]",0.769395,0.713134,0.837383,0.744792,0.829577
0,"bilstm_ELMoTrue_[14, 14, 14]",0.756854,0.723777,0.794763,0.740738,0.794714
3,"stacked_bilstm_ELMoTrue_[14, 14, 14]",0.750416,0.706924,0.799897,0.729164,0.794704
5,"lstm_ELMoTrue_[14, 14, 14]",0.741878,0.707792,0.779986,0.723678,0.789368
8,"bert_ELMoFalse_[14, 14, 14]",0.735699,0.6718,0.813545,0.70255,0.775601
2,"cnn_ELMoFalse_[14, 14, 14]",0.696436,0.709271,0.688494,0.695897,0.756455
4,"stacked_bilstm_ELMoFalse_[14, 14, 14]",0.664621,0.68723,0.649827,0.66869,0.70774
1,"lstm_ELMoFalse_[14, 14, 14]",0.59459,0.691009,0.539989,0.640603,0.696852
7,"bilstm_ELMoFalse_[14, 14, 14]",0.590753,0.695529,0.523624,0.638317,0.68832


### 2. CNN with ELMo performs best. Add its prediction probabilities to the main dataset CSV and verify the classification metrics:

In [4]:
predictions_file = '/home/ndg/users/sbagga1/generalization/predictions/cnn_ELMoTrue.pickle'
dataset_file = '/home/ndg/users/sbagga1/generalization/data/Gen_Sentences_Annotated_All_Final_Processed.csv'

# NOTE(review): pickle.load can execute arbitrary code, so this assumes the
# predictions file is a trusted, locally produced artifact.
with open(predictions_file, 'rb') as pickle_handle:
    # Presumably a mapping from sentence ID to predicted probability of the
    # 'generalization' class; it is only used as the argument to Series.map.
    map_id_pred = pickle.load(pickle_handle)

main_df = pd.read_csv(dataset_file)
# Look up each row's ID in the loaded predictions and store the result.
main_df['Prob(gen)'] = main_df['ID'].map(map_id_pred)
print(main_df.shape)
main_df.head(6)

(3456, 11)


Unnamed: 0,section,sent.no,filename,sentences,neutral,generalization,exemplification,attribution,conditional,ID,Prob(gen)
0,e,156,nlh.47.1.626118_nonotes.txt,"To this end, one of the main merits of Merleau...",1,0,0,0,0,1,0.536154
1,b,207,ahr.2016.121.2.437_nonotes.txt,In their response they chastised her for her u...,1,0,0,0,0,2,0.000346
2,b,180,ahr.2016.121.1.17_nonotes.txt,VOC officials who encountered these arguments ...,1,0,0,0,0,3,0.416858
3,e,288,asr.2016.81.5.1039_nonotes.txt,In YEAR—for the first time since YEAR—white wo...,0,1,0,0,0,4,0.798453
4,e,171,sr.55.2.05_nonotes.txt,"With its large type, ostentatious margins, an...",0,1,0,0,0,5,0.845614
5,b,216,modernismmodernity.2016.23.4.771_nonotes.txt,"A plastic, adaptable subject turned into an ar...",0,1,1,0,0,6,0.260751


In [5]:
# True labels, derived from the annotation flags. 'neutral' is assigned last,
# so a row flagged as both ends up 'neutral'; a row flagged as neither keeps None.
is_generalization = main_df['generalization'] == 1
is_neutral = main_df['neutral'] == 1
main_df['true_label'] = None
main_df.loc[is_generalization, 'true_label'] = 'generalization'
main_df.loc[is_neutral, 'true_label'] = 'neutral'

# Prediction labels: threshold the generalization probability at 0.5.
# Rows with a missing probability match neither mask and keep None.
predicted_generalization = main_df['Prob(gen)'] >= 0.5
predicted_neutral = main_df['Prob(gen)'] < 0.5
main_df['pred_label'] = None
main_df.loc[predicted_generalization, 'pred_label'] = 'generalization'
main_df.loc[predicted_neutral, 'pred_label'] = 'neutral'
# Alternative: main_df['pred_label'] = np.where(main_df['Prob(gen)']>=0.5, 'generalization', 'neutral')

# The raw annotation columns are deliberately kept; the final export cell
# still selects them.
main_df.head(6)

Unnamed: 0,section,sent.no,filename,sentences,neutral,generalization,exemplification,attribution,conditional,ID,Prob(gen),true_label,pred_label
0,e,156,nlh.47.1.626118_nonotes.txt,"To this end, one of the main merits of Merleau...",1,0,0,0,0,1,0.536154,neutral,generalization
1,b,207,ahr.2016.121.2.437_nonotes.txt,In their response they chastised her for her u...,1,0,0,0,0,2,0.000346,neutral,neutral
2,b,180,ahr.2016.121.1.17_nonotes.txt,VOC officials who encountered these arguments ...,1,0,0,0,0,3,0.416858,neutral,neutral
3,e,288,asr.2016.81.5.1039_nonotes.txt,In YEAR—for the first time since YEAR—white wo...,0,1,0,0,0,4,0.798453,generalization,generalization
4,e,171,sr.55.2.05_nonotes.txt,"With its large type, ostentatious margins, an...",0,1,0,0,0,5,0.845614,generalization,generalization
5,b,216,modernismmodernity.2016.23.4.771_nonotes.txt,"A plastic, adaptable subject turned into an ar...",0,1,1,0,0,6,0.260751,generalization,neutral


In [6]:
# Compute metrics, treating 'generalization' as the positive class.
y_true = main_df['true_label'].tolist()
y_pred = main_df['pred_label'].tolist()
y_score = main_df['Prob(gen)'].tolist()  # continuous scores, needed for AUPRC

metric_values = (
    f1_score(y_true, y_pred, pos_label='generalization'),
    precision_score(y_true, y_pred, pos_label='generalization'),
    recall_score(y_true, y_pred, pos_label='generalization'),
    accuracy_score(y_true, y_pred),
    average_precision_score(y_true, y_score, pos_label='generalization'),
)

report_template = "F1 = {} | Precision = {} | Recall = {} | Accuracy = {} | AUPRC = {}"
print(report_template.format(*metric_values))

F1 = 0.7695924764890282 | Precision = 0.7119381343644272 | Recall = 0.8374076179647527 | Accuracy = 0.7447916666666666 | AUPRC = 0.8263386716875039


In [7]:
# Rows are true labels and columns are predicted labels, both in the order
# given by `labels`: [[TP, FN], [FP, TN]] with 'generalization' as positive.
confusion_matrix(y_true=y_true, y_pred=y_pred, labels=['generalization', 'neutral'])

array([[1473,  286],
       [ 596, 1101]])

In [8]:
# Cross-check precision and recall by hand from the confusion-matrix counts.
# The counts are taken from the matrix itself rather than retyped from an
# earlier cell's output, so they cannot go stale when the predictions change.
# With labels=['generalization', 'neutral'] the layout is [[TP, FN], [FP, TN]].
(tp, fn), (fp, tn) = confusion_matrix(
    y_true, y_pred, labels=['generalization', 'neutral']
).astype(float).tolist()

print("Precision: ", tp/(tp+fp))
print("Recall: ", tp/(tp+fn))

Precision:  0.7119381343644272
Recall:  0.8374076179647527


In [9]:
# FP: rows labelled 'neutral' that the model predicted as 'generalization'.
false_positive_mask = (main_df['pred_label'] == 'generalization') & (main_df['true_label'] == 'neutral')
len(main_df[false_positive_mask])

596

In [10]:
# FN: rows labelled 'generalization' that the model predicted as 'neutral'.
false_negative_mask = (main_df['pred_label'] == 'neutral') & (main_df['true_label'] == 'generalization')
len(main_df[false_negative_mask])

286

In [11]:
# Column order for the exported file: identifiers and text first, then the
# labels and probability, then the original annotation columns.
export_columns = [
    'filename', 'ID', 'sentences',
    'true_label', 'pred_label', 'Prob(gen)',
    'section', 'sent.no',
    'neutral', 'generalization', 'exemplification', 'attribution', 'conditional',
]
main_df = main_df[export_columns]

# Write the predictions without the DataFrame index column.
main_df.to_csv(
    '/home/ndg/users/sbagga1/generalization/predictions/Gen_Sentences_CNN_Predictions.csv',
    index=False,
)

# fin.