# Data format suitable for Prodigy
1. All the data for tagging (source data) for prodigy needs to be in jsonl
2. Convert original training data into the correct format for db-in

In [1]:
import pandas as pd

import json

In [2]:
cd ..

/Users/gallaghe/Code/nutrition-labels


In [3]:
from nutrition_labels.useful_functions import remove_useless_string

In [4]:
# # The list of grants in the original training data 
# training_data = pd.read_csv('data/processed/training_data/210126/training_data.csv')
# training_data_grants = training_data['Internal ID'].tolist()

In [4]:
# Load the raw grants export (it includes the grant descriptions) and show its row count.
grants_csv_path = 'data/raw/wellcome-grants-awarded-2005-2019.csv'
original_grant_data = pd.read_csv(grants_csv_path)
original_grant_data.shape[0]

16914

In [5]:
# Ensemble results: the 'How has this grant been used before?' column is what
# later tells us whether a grant was in the training or the test split.
ensemble_cols = [
    'Internal ID',
    'How has this grant been used before?',
    'Ensemble predictions - 3 models',
]
ensemble_results = pd.read_csv(
    'data/processed/ensemble/210129_all_ensemble_results.csv'
)[ensemble_cols]
ensemble_results.shape[0]

14613

In [6]:
# Columns holding a grant's free text; they are cleaned and then joined into a single 'text' field.
grant_text_cols = ['Title', 'Description']

### 1. All the data for tagging (source data) for prodigy needs to be in jsonl

In [7]:
# Build the Prodigy source table: one row per grant with an 'id' and a 'text' field.
# Work on a copy so the raw data stays untouched, and blank out missing values.
grant_data = original_grant_data.copy().fillna('')

# Clean each text column element-wise. Series.map is used per column because
# DataFrame.applymap is deprecated in recent pandas releases.
for text_col in grant_text_cols:
    grant_data[text_col] = grant_data[text_col].map(remove_useless_string)

# Filter in one chain and finish with an explicit copy, so the column
# assignments below operate on an independent frame rather than a slice:
#   1. drop grants whose description is the 'Not available' placeholder;
#   2. drop rows with a missing description;
#      NOTE(review): missing values were already replaced by '' above, so this
#      step only removes rows if remove_useless_string returns NaN/None -
#      confirm whether empty descriptions should be excluded as well;
#   3. keep a single row (the first occurrence) per 'Internal ID'.
grant_data = (
    grant_data[grant_data['Description'] != 'Not available']
    .dropna(subset=['Description'])
    .drop_duplicates('Internal ID')
    .copy()
)

# Prodigy text: the cleaned text columns joined with '. ' (title first, then description).
grant_data['text'] = grant_data[grant_text_cols].agg('. '.join, axis=1)
grant_data['id'] = grant_data['Internal ID']

In [8]:
# Number of grants remaining after cleaning and de-duplication.
len(grant_data)

14613

In [9]:
# One {'id': ..., 'text': ...} dict per grant, ready to be written out as JSON lines.
prodigy_fields = ['id', 'text']
result = grant_data.loc[:, prodigy_fields].to_dict(orient='records')

In [10]:
# Number of records that will be written to the source JSONL file.
len(result)

14613

In [11]:
# Write the source records as JSON lines: one JSON object per line.
with open('data/prodigy/grants_data.jsonl', 'w') as jsonl_out:
    jsonl_out.writelines(json.dumps(record) + '\n' for record in result)

### 2. Convert original training data into the correct format for db-in

In [12]:
# Attach the ensemble results to each grant; the left join keeps grants that
# have no matching ensemble row.
grant_data = grant_data.merge(ensemble_results, how='left', on=['Internal ID'])

# Every row gets the same constant score of 0.5.
grant_data['score'] = 0.5

# A prediction of exactly 1.0 is labelled "Tech grant"; anything else
# (including a missing prediction) is labelled "Not tech grant".
is_tech = grant_data['Ensemble predictions - 3 models'].eq(1.0)
grant_data['label'] = is_tech.map({True: "Tech grant", False: "Not tech grant"})

# Split the rows by how each grant was used previously.
previous_use = grant_data['How has this grant been used before?']
training_split_data = grant_data[previous_use == 'Training data'].reset_index()
test_split_data = grant_data[previous_use == 'Test data'].reset_index()

In [13]:
# Class balance of the labels in the training split.
training_split_data['label'].value_counts()

Not tech grant    281
Tech grant        239
Name: label, dtype: int64

In [14]:
# Keep only the fields that are written out and turn each split into a list of
# per-grant dicts.
db_in_fields = ['id', 'text', 'score', 'label']
train_split_formatted = training_split_data[db_in_fields].to_dict(orient='records')
test_split_formatted = test_split_data[db_in_fields].to_dict(orient='records')

In [15]:
# Inspect the first formatted training record as a sanity check.
train_split_formatted[0]

{'id': '219414/Z/19/Z',
 'text': 'Whole Genome Sequencing (WGS) of 450,000 UK Biobank Samples. Large-scale WGS of the UK Biobank cohort to generate and evaluate therapeutic hypotheses regarding targets, biomarkers and pathways implicated in disease',
 'score': 0.5,
 'label': 'Tech grant'}

In [16]:
# Write the formatted training split as JSON lines: one JSON object per line.
with open('data/prodigy/existing_training_data.jsonl', 'w') as jsonl_out:
    jsonl_out.writelines(json.dumps(record) + '\n' for record in train_split_formatted)

In [17]:
# Write the formatted test split as JSON lines: one JSON object per line.
with open('data/prodigy/existing_test_data.jsonl', 'w') as jsonl_out:
    jsonl_out.writelines(json.dumps(record) + '\n' for record in test_split_formatted)