## Seed analysis
Examine how much the results of each trained model vary across different random seeds, and identify which seeds perform well across models.

In [1]:
import pandas as pd
import ast

In [15]:
# Names of the model runs to analyse; each one is used as a sub-directory of
# ../model_repeats/ when the repeated results are loaded.
models = [
    # Naive Bayes and logistic regression
    'count_naive_bayes_201021',
    'count_log_reg_201022',
    'tfidf_naive_bayes_201021',
    'tfidf_log_reg_201021',
    'bert_naive_bayes_bert_201021',
    'bert_log_reg_bert_201022',
    'bert_log_reg_scibert_201022',
    # SVM
    'count_SVM_201022',
    'tfidf_SVM_201022',
    'bert_SVM_bert_201022',
    'bert_SVM_scibert_201022',
]

In [16]:
# Build one flat record per run: every non-blank line of a model's
# repeated_results.txt is parsed as a Python dict literal that must contain
# 'Train scores' and 'Test scores' sub-dicts.
model_scores = []
for model in models:
    with open(f'../model_repeats/{model}/repeated_results.txt', 'r') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline at the end of the file):
            # ast.literal_eval raises SyntaxError on empty input.
            if not line.strip():
                continue
            model_run = ast.literal_eval(line)
            model_run['Model'] = model
            # Flatten the nested score dicts into 'Train <metric>' / 'Test <metric>'
            # keys so that each metric becomes its own DataFrame column.
            for split in ('Train', 'Test'):
                for metric, value in model_run[f'{split} scores'].items():
                    model_run[f'{split} {metric}'] = value
            model_scores.append(model_run)

# One row per (model, run); the original nested score dicts are kept as columns too.
model_scores_df = pd.DataFrame(model_scores)

In [17]:
# Average results: mean of each test metric per model over all of its runs
(
    model_scores_df
    .groupby('Model')[['Test accuracy', 'Test f1', 'Test precision_score', 'Test recall_score']]
    .mean()
    .round(3)
)

Unnamed: 0_level_0,Test accuracy,Test f1,Test precision_score,Test recall_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bert_SVM_bert_201022,0.767,0.785,0.743,0.838
bert_SVM_scibert_201022,0.776,0.78,0.787,0.784
bert_log_reg_bert_201022,0.759,0.761,0.771,0.761
bert_log_reg_scibert_201022,0.783,0.791,0.782,0.806
bert_naive_bayes_bert_201021,0.731,0.737,0.736,0.744
count_SVM_201022,0.753,0.75,0.772,0.736
count_log_reg_201022,0.778,0.783,0.778,0.791
count_naive_bayes_201021,0.793,0.805,0.78,0.839
tfidf_SVM_201022,0.751,0.721,0.843,0.66
tfidf_log_reg_201021,0.776,0.767,0.817,0.737


In [18]:
# Standard deviation of each test metric per model over all of its runs
(
    model_scores_df
    .groupby('Model')[['Test accuracy', 'Test f1', 'Test precision_score', 'Test recall_score']]
    .std()
    .round(3)
)

Unnamed: 0_level_0,Test accuracy,Test f1,Test precision_score,Test recall_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bert_SVM_bert_201022,0.042,0.036,0.052,0.063
bert_SVM_scibert_201022,0.039,0.038,0.08,0.08
bert_log_reg_bert_201022,0.037,0.037,0.066,0.074
bert_log_reg_scibert_201022,0.035,0.03,0.045,0.073
bert_naive_bayes_bert_201021,0.053,0.051,0.071,0.065
count_SVM_201022,0.043,0.048,0.045,0.084
count_log_reg_201022,0.035,0.033,0.036,0.055
count_naive_bayes_201021,0.044,0.04,0.063,0.07
tfidf_SVM_201022,0.062,0.089,0.081,0.178
tfidf_log_reg_201021,0.054,0.062,0.067,0.121


In [19]:
# Range of results: (min, max) of each test metric per model, rounded to 3 d.p.
# Series.min()/.max() skip NaN by default, whereas the built-in min()/max()
# return order-dependent values when a NaN is present in the group.
(
    model_scores_df
    .groupby('Model')[['Test accuracy', 'Test f1', 'Test precision_score', 'Test recall_score']]
    .agg(lambda scores: (round(scores.min(), 3), round(scores.max(), 3)))
)

Unnamed: 0_level_0,Test accuracy,Test f1,Test precision_score,Test recall_score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bert_SVM_bert_201022,"(0.72, 0.869)","(0.754, 0.881)","(0.647, 0.825)","(0.763, 0.945)"
bert_SVM_scibert_201022,"(0.748, 0.879)","(0.75, 0.879)","(0.656, 0.904)","(0.684, 0.904)"
bert_log_reg_bert_201022,"(0.71, 0.813)","(0.713, 0.825)","(0.65, 0.894)","(0.643, 0.855)"
bert_log_reg_scibert_201022,"(0.738, 0.832)","(0.727, 0.826)","(0.738, 0.865)","(0.643, 0.894)"
bert_naive_bayes_bert_201021,"(0.636, 0.804)","(0.636, 0.817)","(0.61, 0.812)","(0.596, 0.825)"
count_SVM_201022,"(0.701, 0.841)","(0.687, 0.847)","(0.691, 0.839)","(0.607, 0.855)"
count_log_reg_201022,"(0.729, 0.832)","(0.729, 0.842)","(0.696, 0.824)","(0.696, 0.873)"
count_naive_bayes_201021,"(0.701, 0.85)","(0.724, 0.864)","(0.609, 0.833)","(0.732, 0.927)"
tfidf_SVM_201022,"(0.654, 0.832)","(0.584, 0.833)","(0.656, 0.946)","(0.441, 0.894)"
tfidf_log_reg_201021,"(0.701, 0.841)","(0.681, 0.844)","(0.651, 0.9)","(0.571, 0.885)"


## Best seeds

In [20]:
# Mean test metrics per split random seed (averaged over every run with that seed),
# ordered so the seed with the highest mean Test F1 comes first
(
    model_scores_df
    .groupby('Split random seed')[['Test accuracy', 'Test f1', 'Test precision_score', 'Test recall_score']]
    .mean()
    .sort_values(by='Test f1', ascending=False)
    .round(3)
)

Unnamed: 0_level_0,Test accuracy,Test f1,Test precision_score,Test recall_score
Split random seed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.83,0.838,0.82,0.861
5,0.786,0.797,0.742,0.864
4,0.773,0.784,0.798,0.781
7,0.794,0.782,0.76,0.814
2,0.781,0.78,0.836,0.738
6,0.75,0.755,0.792,0.73
3,0.743,0.753,0.77,0.746
9,0.728,0.74,0.647,0.87
8,0.726,0.733,0.789,0.696
1,0.74,0.726,0.806,0.669


In [21]:
# Mean test metrics per split random seed (averaged over every run with that seed),
# ordered so the seed with the highest mean Test precision comes first
(
    model_scores_df
    .groupby('Split random seed')[['Test accuracy', 'Test f1', 'Test precision_score', 'Test recall_score']]
    .mean()
    .sort_values(by='Test precision_score', ascending=False)
    .round(3)
)

Unnamed: 0_level_0,Test accuracy,Test f1,Test precision_score,Test recall_score
Split random seed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,0.781,0.78,0.836,0.738
0,0.83,0.838,0.82,0.861
1,0.74,0.726,0.806,0.669
4,0.773,0.784,0.798,0.781
6,0.75,0.755,0.792,0.73
8,0.726,0.733,0.789,0.696
3,0.743,0.753,0.77,0.746
7,0.794,0.782,0.76,0.814
5,0.786,0.797,0.742,0.864
9,0.728,0.74,0.647,0.87


In [22]:
# Mean test metrics per split random seed (averaged over every run with that seed),
# ordered so the seed with the highest mean Test recall comes first
(
    model_scores_df
    .groupby('Split random seed')[['Test accuracy', 'Test f1', 'Test precision_score', 'Test recall_score']]
    .mean()
    .sort_values(by='Test recall_score', ascending=False)
    .round(3)
)

Unnamed: 0_level_0,Test accuracy,Test f1,Test precision_score,Test recall_score
Split random seed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,0.728,0.74,0.647,0.87
5,0.786,0.797,0.742,0.864
0,0.83,0.838,0.82,0.861
7,0.794,0.782,0.76,0.814
4,0.773,0.784,0.798,0.781
3,0.743,0.753,0.77,0.746
2,0.781,0.78,0.836,0.738
6,0.75,0.755,0.792,0.73
8,0.726,0.733,0.789,0.696
1,0.74,0.726,0.806,0.669
