How well did the 2020 ensemble model do on new grants tagged
with the new definition of tech (i.e. not just UK-based and not just in the health domain)?

In [1]:
import re
import ast

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report,  f1_score, precision_score, recall_score
import numpy as np


In [3]:
cd ..

/Users/gallaghe/Code/nutrition-labels


In [3]:
from nutrition_labels.ensemble_model import EnsembleModel, get_seed_results
from nutrition_labels.utils import pretty_confusion_matrix, clean_grants_data

## Load data

In [5]:
# Input files, relative to the working directory set by the `cd` cell above.
csv_paths = {
    'training_data': 'data/processed/training_data/210126/training_data.csv',
    'old_training_data': 'data/processed/training_data/200807/training_data.csv',
    'grant_data': 'data/raw/wellcome-grants-awarded-2005-2019.csv',
    'old_ensemble_results': 'data/processed/ensemble/201118/201118_all_ensemble_results.csv',
}

# Current and previous versions of the tagged training data.
training_data = pd.read_csv(csv_paths['training_data'])
old_training_data = pd.read_csv(csv_paths['old_training_data'])
# Raw grants export (not referenced again in the cells visible here).
grant_data = pd.read_csv(csv_paths['grant_data'])
# Per-grant output of the earlier ensemble run.
old_ensemble_results = pd.read_csv(csv_paths['old_ensemble_results'])

## How many data points were changed?

In [18]:
# Line up the old and new label of every grant; the outer join keeps grants
# that appear in only one of the two training sets (their other label is NaN).
label_columns = ['Internal ID', 'Relevance code']
merged_training_data = old_training_data[label_columns].merge(
    training_data[label_columns],
    how='outer',
    on='Internal ID',
    suffixes=('_old', '_new'),
)

# Count each (old label, new label) combination. Missing labels are replaced
# by the string 'nan' first so that those rows are not dropped by groupby.
(
    merged_training_data
    .fillna('nan')
    .groupby(['Relevance code_old', 'Relevance code_new'])
    .size()
    .reset_index(name='count')
)

Unnamed: 0,Relevance code_old,Relevance code_new,count
0,0.0,0.0,151
1,0.0,1.0,33
2,0.0,,699
3,1.0,1.0,213
4,1.0,,1
5,,0.0,198
6,,1.0,101


## Load model and find out how well it transfers

In [5]:
# Date window handed to EnsembleModel. Both bounds are identical.
# NOTE(review): presumably this restricts the search to models trained on
# 201022 (the model names printed while predicting end in _201022) - confirm.
before_date = 201022
after_date = 201022

# Metric thresholds handed to EnsembleModel.
# NOTE(review): presumably the minimum scores a model needs in order to be
# selected by find_useful_models - confirm in nutrition_labels.ensemble_model.
f1_cutoff = 0.8
precision_cutoff = 0.82
recall_cutoff = 0.82

In [6]:
# Build the ensemble wrapper from the thresholds and date window set in the
# configuration cell.
ensemble_model = EnsembleModel(
    f1_cutoff=f1_cutoff,
    precision_cutoff=precision_cutoff,
    recall_cutoff=recall_cutoff,
    before_date=before_date,
    after_date=after_date,
)

# Models selected by the ensemble; each entry is later passed to
# get_seed_results as `model_dir` and to ensemble_model.predict.
useful_models = ensemble_model.find_useful_models()

# One split seed per selected model. The list gets its own name so that
# `split_seed` is only ever a single seed, never the list.
split_seeds = [get_seed_results(model_dir) for model_dir in useful_models]
if len(split_seeds) == 0:
    # Fail with an explicit message rather than an IndexError on split_seeds[0].
    raise ValueError(
        'No useful models found - check the metric cutoffs and the before/after dates'
    )

print(f'There is/are {len(set(split_seeds))} unique split seeds used for these models '\
    'if this is more than 1 then the ensemble model metrics can be ignored')
split_seed = split_seeds[0]

4 useful models found
There is/are 1 unique split seeds used for these models if this is more than 1 then the ensemble model metrics can be ignored


In [7]:
# Attach, for every grant in the new training data, how the earlier ensemble
# run used it and what that run predicted.
results_columns = [
    'Internal ID',
    'How has this grant been used before?',
    'Ensemble predictions - 3 models',
]
old_ensemble_results = old_ensemble_results[results_columns]

# Left join: every new training row is kept; grants absent from the old
# results get NaN in the two added columns.
training_data = training_data.merge(
    old_ensemble_results,
    how='left',
    on=['Internal ID'],
)
len(training_data)

696

In [8]:
# Prepare the text that the models will be given.
training_data = clean_grants_data(training_data)

# Join title, programme title and description into one string per grant.
# NOTE(review): str.join needs every value to be a string - presumably
# clean_grants_data takes care of missing values; confirm.
text_columns = ['Title', 'Grant Programme:Title', 'Description']
combined_texts = training_data[text_columns].agg('. '.join, axis=1)
training_data['Grant texts'] = combined_texts.tolist()

In [9]:
# Keep only the identifier, the true label, the model input text and the two
# columns carried over from the earlier ensemble run.
columns_to_keep = [
    'Internal ID',
    'Relevance code',
    'Grant texts',
    'Ensemble predictions - 3 models',
    'How has this grant been used before?',
]
training_data = training_data[columns_to_keep]

In [11]:
# Peek at the first two rows to check the combined text and the columns
# carried over from the earlier ensemble run.
training_data.head(2)

Unnamed: 0,Internal ID,Relevance code,Grant texts,Ensemble predictions - 3 models,How has this grant been used before?
0,106169/Z/14/Z,1,A UK Hub to Catalyse Open Target Discovery.. S...,1,Training data
1,213494/Z/18/Z,1,Spatiotemporal dynamics of arbovirus transmiss...,0,Test data


In [12]:
# Only care about the grants not in the training data
has_label = training_data['Relevance code'].notna()
# Rows whose usage is missing also pass this test, because NaN != 'Training data'.
not_used_for_training = (
    training_data['How has this grant been used before?'] != 'Training data'
)

# Take an explicit copy: a prediction column is added to this filtered frame
# further down, and assigning to a slice of another DataFrame can trigger
# pandas' SettingWithCopyWarning.
training_data = training_data.loc[has_label & not_used_for_training].copy()
len(training_data)

507

In [13]:
# Break the remaining grants down by true label and by how the earlier run
# used them; each other column shows its non-null count within the group.
training_data.groupby(['Relevance code', 'How has this grant been used before?']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Internal ID,Grant texts,Ensemble predictions - 3 models
Relevance code,How has this grant been used before?,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Test data,15,15,15
0,Unseen data,304,304,304
1,Test data,55,55,55
1,Unseen data,133,133,133


In [14]:
# Plain list of the combined texts, in training_data row order, to pass to
# the models.
grants_text = training_data['Grant texts'].tolist()

In [15]:
# Predict for each model
# The return value of predict() is discarded; the per-model results are read
# from the model_predictions_df attribute instead.
_ = ensemble_model.predict(grants_text, useful_models)
model_predictions_df = ensemble_model.model_predictions_df
# Drop the 'Ensemble prediction' column; the ensemble label is recomputed
# below from the agreement count with an explicit cutoff.
# NOTE(review): unless the attribute is a property that returns a fresh
# object, this name refers to the same object held by ensemble_model, so the
# del removes the column there as well.
del model_predictions_df['Ensemble prediction']

Predicting for count_SVM_201022...




Predicting for bert_SVM_scibert_201022...


100%|██████████| 507/507 [09:48<00:00,  1.16s/it]


Predicting for bert_SVM_bert_201022...


100%|██████████| 507/507 [10:08<00:00,  1.20s/it]


Predicting for tfidf_log_reg_201022...




In [16]:
# Minimum number of models that must agree for a grant to be labelled tech.
cutoff = 3

# Per-grant count of models that predicted "tech".
prediction_sums = model_predictions_df['Number of models agree tech grant']

# Turn the counts into 0/1 labels. A plain list is assigned so the values are
# placed by position rather than aligned on the DataFrame index.
# NOTE(review): this assumes one prediction per entry of grants_text, in the
# same order - confirm against EnsembleModel.predict.
meets_cutoff = [int(n_agree >= cutoff) for n_agree in prediction_sums]
training_data[f'New Ensemble predictions - {cutoff} models'] = meets_cutoff

In [17]:
# Evaluate ensemble results
# True labels and the new ensemble predictions, as plain lists in row order.
predictions_column = f'New Ensemble predictions - {cutoff} models'
y = training_data['Relevance code'].tolist()
y_predict = training_data[predictions_column].tolist()

# The keys keep their "Test ..." wording, but the rows scored here are every
# labelled grant that was not training data (both "Test data" and "Unseen data").
scores = {}
scores['accuracy'] = accuracy_score(y, y_predict)
scores['f1'] = f1_score(y, y_predict, average='binary')
scores['precision_score'] = precision_score(y, y_predict, zero_division=0, average='binary')
scores['recall_score'] = recall_score(y, y_predict, zero_division=0, average='binary')
scores['Test classification report'] = classification_report(y, y_predict)
scores['Test confusion matrix'] = pretty_confusion_matrix(y, y_predict)

In [18]:
# Show the metrics of the 2020 ensemble on the re-tagged, non-training grants.
scores

{'accuracy': 0.7337278106508875,
 'f1': 0.5970149253731344,
 'precision_score': 0.6802721088435374,
 'recall_score': 0.5319148936170213,
 'Test classification report': '              precision    recall  f1-score   support\n\n           0       0.76      0.85      0.80       319\n           1       0.68      0.53      0.60       188\n\n    accuracy                           0.73       507\n   macro avg       0.72      0.69      0.70       507\nweighted avg       0.73      0.73      0.73       507\n',
 'Test confusion matrix':               predicted tag 0  predicted tag 1
 actual tag 0              272               47
 actual tag 1               88              100}

## Original model scores (on original test data only)

In [32]:
# NOTE(review): this cell has execution count 32 and its output reports a
# support of 107 grants, so it was not produced by the `scores` dict built
# above (507 grants) - presumably it is left over from a separate run on the
# original test set. The cells that produced it are not in this notebook.
scores

{'accuracy': 0.8691588785046729,
 'f1': 0.8727272727272727,
 'precision_score': 0.8727272727272727,
 'recall_score': 0.8727272727272727,
 'Test classification report': '              precision    recall  f1-score   support\n\n         0.0       0.87      0.87      0.87        52\n         1.0       0.87      0.87      0.87        55\n\n    accuracy                           0.87       107\n   macro avg       0.87      0.87      0.87       107\nweighted avg       0.87      0.87      0.87       107\n',
 'Test confusion matrix':               predicted tag 0  predicted tag 1
 actual tag 0               45                7
 actual tag 1                7               48}