## Fairness over different groups
How well does the model do on different subsets of the test data?
- Predictions made on the 360 Giving grants data
- Predictions made on the 42 (FortyTwo) grants data

In [1]:
import json

import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
cd ..

/Users/gallaghe/Code/nutrition-labels


In [3]:
from nutrition_labels.evaluate import merge_grants



## Get grants data (title, description, year, organisation)

In [4]:
# Load the awarded-grants export and keep only the first row seen for each
# 'Internal ID'.
grants_csv_path = 'data/raw/wellcome-grants-awarded-2005-2019.csv'
grant_data = pd.read_csv(grants_csv_path).drop_duplicates(subset=['Internal ID'])
len(grant_data)

16854

## Get test data

In [5]:
# Date stamp of the trained-model run whose test split is evaluated.
model_date = '210402'

# Folder holding that run's artefacts, and the model whose records supply the
# ground truth (the choice of model does not matter, since all ground truth is
# the same).
model_dir = 'models/' + model_date
model_name = 'bert_log_reg_' + model_date

In [6]:
# Read the ground-truth labels and the train/test split recorded for
# `model_name`. The file is parsed one line at a time, each line being a JSON
# object; the code expects that object's first key to be a model name mapping
# to {grant_id: {'Truth': ..., 'Test/train': 'Test' or 'Train'}}.
training_info_file = f'{model_dir}/training_information.json'
raw_test_data = None
raw_train_data = None
with open(training_info_file, 'r') as file:
    for line in file:
        model_data = json.loads(line)
        # Use a separate name for the record's model: overwriting `model_name`
        # here made the check `model_name == model_name` always true, so the
        # first record was used no matter which model had been requested.
        this_model_name = list(model_data.keys())[0]
        if this_model_name == model_name:
            grant_records = model_data[this_model_name]
            raw_test_data = [
                (grant_id, m['Truth'])
                for grant_id, m in grant_records.items()
                if m['Test/train'] == 'Test'
            ]
            raw_train_data = [
                (grant_id, m['Truth'])
                for grant_id, m in grant_records.items()
                if m['Test/train'] == 'Train'
            ]
            break

# Fail with a clear message instead of a NameError/TypeError further down.
if raw_test_data is None:
    raise ValueError(f'No entry for model {model_name!r} found in {training_info_file}')

raw_test_data = pd.DataFrame(raw_test_data, columns=['Reference', 'Truth'])
raw_train_data = pd.DataFrame(raw_train_data, columns=['Reference', 'Truth'])

## Merge with 360 giving predictions

In [7]:
# Tagged grants file written by the prediction run with this date stamp.
predictions_date = '210406'
preds_360_path = f'data/processed/predictions/{predictions_date}/wellcome-grants-awarded-2005-2019_tagged.csv'
model_360_preds = pd.read_csv(preds_360_path)
len(model_360_preds)

16914

In [8]:
# Give the prediction and ID columns 360-specific names; the 42 predictions
# loaded later start out with the same two column names.
model_360_preds.rename(
    columns={
        'Tech grant prediction': 'Tech grant 360 prediction',
        'Grant ID': 'Grant ID 1',
    },
    inplace=True,
)

In [9]:
# Left join: every ground-truth test grant is kept and gains its 360
# prediction (predictions are first reduced to one row per 'Grant ID 1').
preds_360_unique = model_360_preds.drop_duplicates(subset=['Grant ID 1'])
test_data = raw_test_data.merge(
    preds_360_unique,
    how='left',
    left_on='Reference',
    right_on='Grant ID 1',
)
test_data.head(2)

Unnamed: 0,Reference,Truth,Tech grant 360 prediction,Grant ID 1
0,103709/Z/14/A,1,1,103709/Z/14/A
1,202571/Z/16/Z,1,1,202571/Z/16/Z


In [10]:
training_label_name = 'Truth'
# merge_grants is a project helper; presumably it joins the metadata in
# `grant_data` onto the labelled rows via the two ID columns - check its
# definition in nutrition_labels.evaluate for the exact behaviour.
test_data = merge_grants(test_data, grant_data, 'Grant ID 1', 'Internal ID', training_label_name)

In [11]:
# Row count of the test set after the grant metadata has been merged in.
test_data.shape[0]

157

In [12]:
# Same left join as for the test split, applied to the training grants.
preds_360_by_id = model_360_preds.drop_duplicates(subset=['Grant ID 1'])
train_data = raw_train_data.merge(
    preds_360_by_id,
    how='left',
    left_on='Reference',
    right_on='Grant ID 1',
)
print(len(train_data))

469


In [13]:
training_label_name = 'Truth'
# Same merge_grants call as for the test split (project helper - see
# nutrition_labels.evaluate for what it does with the ID and label columns).
train_data = merge_grants(train_data, grant_data, 'Grant ID 1', 'Internal ID', training_label_name)
print(len(train_data))

469


## Merge with 42 predictions

In [14]:
# Tagged FortyTwo grants file written by the prediction run with this date stamp.
predictions_date = '210403'
preds_42_path = f'data/processed/predictions/{predictions_date}/all_grants_fortytwo_info_210420_tagged.csv'
model_42_preds = pd.read_csv(preds_42_path)
len(model_42_preds)

126341

In [15]:
# Give the prediction and ID columns 42-specific names so they do not clash
# with the 360 columns already present in the test data.
model_42_preds.rename(
    columns={
        'Tech grant prediction': 'Tech grant 42 prediction',
        'Grant ID': 'Grant ID 2',
    },
    inplace=True,
)

In [16]:
# Left join the 42 predictions (one row per 'Grant ID 2') onto the test grants.
preds_42_unique = model_42_preds.drop_duplicates(subset=['Grant ID 2'])
test_data = test_data.merge(
    preds_42_unique,
    how='left',
    left_on='Reference',
    right_on='Grant ID 2',
)
len(test_data)

157

In [17]:
# Do the two prediction sets agree on every test grant?
preds_360 = test_data['Tech grant 360 prediction']
preds_42 = test_data['Tech grant 42 prediction']
all(preds_360 == preds_42)

True

## Evaluate fairness
The 360 Giving and 42 predictions are identical for every grant in the test set, so the fairness results are the same for both; only the 360 Giving predictions are evaluated below.

In [43]:
# Hand-picked from the organisations that appear in the test data, so this
# list may not be conclusive.
golden_triangle = [
    'University College London',
    'Imperial College London',
    "King's College London",
    'University of Oxford',
    'University of Cambridge',
    'Exeter College Oxford',
]

In [44]:
def _financial_year_group(financial_year):
    """Map a financial-year string to a period label, reading its first four characters as the year."""
    start_year = int(financial_year[0:4])
    if start_year < 2010:
        return '<2010'
    if start_year < 2015:
        # Upper bound is exclusive despite the label: years 2010 to 2014.
        return '2010-2015'
    if start_year < 2017:
        # Upper bound is exclusive despite the label: years 2015 and 2016.
        return '2015-2017'
    return '>=2017'


def group_data_cols(test_data, golden_triangle):
    """Return a copy of `test_data` with the grouping columns used for the fairness breakdown.

    Columns read: 'Recipient Org:Name', 'Region', 'Recipient Org:Country',
    'Financial Year', 'Description' and 'Title'.

    Columns added: 'Recipient organisation', 'Region grouped',
    'Recipient Org:Country grouped', 'Financial Year grouped',
    'Description length' (+ ' binned'), 'Title length' (+ ' binned'),
    'Title plus Description' and 'Title plus Description length' (+ ' binned').

    Parameters
    ----------
    test_data : pandas.DataFrame
        Grants to annotate (any split, despite the parameter name).
    golden_triangle : collection of str
        Organisation names labelled 'Golden triangle'.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    # Work on a copy so the caller's frame is not silently changed.
    test_data = test_data.copy()

    # Golden triangle or not
    test_data['Recipient organisation'] = [
        'Golden triangle' if org in golden_triangle else 'Not golden triangle'
        for org in test_data['Recipient Org:Name']
    ]

    # Greater London and International keep their own group; every other value
    # is labelled 'UK, not greater London'.
    test_data['Region grouped'] = [
        region if region in ('Greater London', 'International') else 'UK, not greater London'
        for region in test_data['Region']
    ]

    test_data['Recipient Org:Country grouped'] = [
        'UK' if country == 'United Kingdom' else 'Not UK'
        for country in test_data['Recipient Org:Country']
    ]

    test_data['Financial Year grouped'] = [
        _financial_year_group(financial_year) for financial_year in test_data['Financial Year']
    ]

    # pd.cut intervals are right-closed; lengths outside the listed edges
    # (including a length of 0) end up as NaN.
    test_data['Description length'] = test_data['Description'].str.len()
    bins = [0, 1000, 1250, 1500, 2000, 3000, 4000]
    test_data['Description length binned'] = pd.cut(test_data['Description length'], bins)

    test_data['Title length'] = test_data['Title'].str.len()
    bins = [0, 250, 500, 750, 1000, 2000]
    test_data['Title length binned'] = pd.cut(test_data['Title length'], bins)

    test_data["Title plus Description"] = test_data["Title"] + ' ' + test_data["Description"]
    test_data["Title plus Description length"] = test_data["Title plus Description"].str.len()
    # Open-ended top bin. Using the frame's own maximum as the last edge gave
    # the train and test frames different top intervals (so they never compared
    # equal) and raised ValueError whenever the longest text was <= 3000
    # characters. With an infinite edge the interval bounds are floats.
    bins = [0, 1000, 1500, 2000, 3000, float('inf')]
    test_data['Title plus Description length binned'] = pd.cut(test_data['Title plus Description length'], bins)

    return test_data

In [45]:
# Add the grouping columns to the test split.
test_data = group_data_cols(test_data=test_data, golden_triangle=golden_triangle)

In [46]:
# Add the same grouping columns to the training split.
train_data = group_data_cols(test_data=train_data, golden_triangle=golden_triangle)

In [47]:
# Grouping columns (added by group_data_cols) that the metrics are broken down by.
data_types = [
    'Recipient organisation', 'Region grouped', 'Recipient Org:Country grouped',
    'Financial Year grouped', 'Title plus Description length binned',
]

In [48]:
def evaluate_data(data, pred_col, truth_col='Truth'):
    """Score one slice of labelled predictions.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to score; must contain `truth_col` and `pred_col`.
    pred_col : str
        Column holding the predicted labels.
    truth_col : str, default 'Truth'
        Column holding the ground-truth labels.

    Returns
    -------
    dict
        Keys 'Sample size', 'accuracy', 'f1', 'precision_score' and
        'recall_score'. The last three use average='binary'.
    """
    y = data[truth_col].tolist()
    y_predict = data[pred_col].tolist()
    return {
        'Sample size': len(data),
        'accuracy': accuracy_score(y, y_predict),
        # zero_division=0 here as well, so f1 handles a group with no positive
        # labels or predictions the same way precision and recall do (a score
        # of 0 without an UndefinedMetricWarning).
        'f1': f1_score(y, y_predict, zero_division=0, average='binary'),
        'precision_score': precision_score(y, y_predict, zero_division=0, average='binary'),
        'recall_score': recall_score(y, y_predict, zero_division=0, average='binary'),
    }

In [59]:
# One result row per (grouping column, group value): how much of the training
# data falls in the group, the share of positive labels among the group's test
# grants, and the metrics returned by evaluate_data for those test grants.
fairness_results = []
for column in data_types:
    for pred_col in ['Tech grant 360 prediction']:
        # Iterate over the groups directly instead of groupby/apply followed by
        # re-filtering the frame for every key. observed=True skips bins of a
        # categorical column that contain no test rows, which previously led to
        # a non-dict result or a division by zero.
        for column_type, this_test_data in test_data.groupby(column, observed=True):
            column_results = {
                'Prediction type': pred_col,
                'Data type': column,
                'Type': column_type,
                'Train proportion in this class': (train_data[column] == column_type).mean(),
                'Test proportion true': (this_test_data['Truth'] == 1).mean(),
            }
            column_results.update(evaluate_data(this_test_data, pred_col))
            fairness_results.append(column_results)

fairness_results_df = pd.DataFrame(fairness_results).round(3)
fairness_results_df

Unnamed: 0,Prediction type,Data type,Type,Train proportion in this class,Test proportion true,Sample size,accuracy,f1,precision_score,recall_score
0,Tech grant 360 prediction,Recipient organisation,Golden triangle,0.371,0.58,69,0.913,0.925,0.925,0.925
1,Tech grant 360 prediction,Recipient organisation,Not golden triangle,0.629,0.455,88,0.886,0.875,0.875,0.875
2,Tech grant 360 prediction,Region grouped,Greater London,0.311,0.559,59,0.932,0.937,0.968,0.909
3,Tech grant 360 prediction,Region grouped,International,0.09,0.417,12,0.917,0.909,0.833,1.0
4,Tech grant 360 prediction,Region grouped,"UK, not greater London",0.599,0.488,86,0.872,0.871,0.86,0.881
5,Tech grant 360 prediction,Recipient Org:Country grouped,Not UK,0.113,0.4,15,0.933,0.923,0.857,1.0
6,Tech grant 360 prediction,Recipient Org:Country grouped,UK,0.887,0.521,142,0.894,0.898,0.904,0.892
7,Tech grant 360 prediction,Financial Year grouped,2010-2015,0.275,0.479,48,0.833,0.8,0.941,0.696
8,Tech grant 360 prediction,Financial Year grouped,2015-2017,0.324,0.489,45,0.933,0.936,0.88,1.0
9,Tech grant 360 prediction,Financial Year grouped,<2010,0.068,0.25,8,1.0,1.0,1.0,1.0


In [60]:
# Save the per-group results in a file named after the model date (the
# DataFrame index is written as the first column).
fairness_results_path = f'data/processed/fairness/fairness_results_{model_date}.csv'
fairness_results_df.to_csv(fairness_results_path)