In [1]:
import random
from pathlib import Path
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingClassifier
import sys
import time
import pickle
from datetime import datetime
from math import ceil
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import seaborn as sns
from util import mem_usage
from glob import glob
from copy import deepcopy
import numpy as np
import shutil

pd.options.display.float_format = '{:20,.5f}'.format
sns.set()

SEED = random.seed(0)
DATA = Path('data')
MODELS = Path('models')
EXPERIMENTS = Path('experiments')
TARGETS = ['participants','interventions','outcomes']
SUBSET = 'Train' # 'Test'
N_TRAIN_DOCS = 4500
BASE_MODEL_NAME = 'GBC_POS-PMFT' # GradBoostClassifier model trained on dataset 'raw', with POS tag and 
                                    # PubMed data-trained FastText as features.    
# DOWNSAMPLE = False 

# add required libraries etc
%load_ext jupyternotify

<IPython.core.display.Javascript object>

In [2]:
train_val = pd.read_parquet('data/split/train.parquet')

# the test set is currently withheld. Test using a validation split (see below).

Validation split (KCV currently not implemented, so just a simulation)

In [3]:
%%notify

def hotcode(df):
    
    print(df.columns)
    num_cols = df._get_numeric_data().columns
    
    cols = set(df.columns)
    cat_cols = (set(df.columns) - set(num_cols)) #- set('Word')
    
    print((cat_cols))
    dummies = pd.get_dummies(df[cat_cols])
    
    print('hotcode complete.')
    # assert check that type is numeric for all
    
    return pd.concat([dummies, df[num_cols]], axis=1)
    
pio = {"participants", "interventions", "outcomes"}
features = set(train_val.columns).difference(pio.union({'Word'}))

# print('Word' in features)
# print(train_val.shape)

print('hotcoding categorical columns...')
try:
    train_val = hotcode(train_val)
except ValueError:
    print('No categorical values found in data')

hotcoding categorical columns...
Index(['PM_0', 'PM_1', 'PM_2', 'PM_3', 'PM_4', 'PM_5', 'PM_6', 'PM_7', 'PM_8',
       'PM_9',
       ...
       'POS_LAG_1_VBN', 'POS_LAG_1_VBP', 'POS_LAG_1_VBZ', 'POS_LAG_1_WDT',
       'POS_LAG_1_WP', 'POS_LAG_1_WP$', 'POS_LAG_1_WRB', 'interventions',
       'outcomes', 'participants'],
      dtype='object', length=418)
set()
No categorical values found in data


<IPython.core.display.Javascript object>

# Parameters & Partitioning

The training and validation split is only an example for K-fold crossvalidation to simulate an 80:20 train/validation dataset. See code examples of  https://scikit-learn.org/stable/modules/cross_validation.html and https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html for more information.

In [4]:
BASE_MODEL_NAME = 'GradBoost'
k_folds = 5 # not implemented
time = datetime.now().strftime("%m%d-%H%M")
metrics = ['accuracy', 'precision', 'recall', 'f_score', 'support']

dir_name = BASE_MODEL_NAME  + f'_{time}'
print(f'Folder name set to {dir_name}')

Folder name set to GradBoost_0309-0327


In [5]:
train_idx, val_idx = train_test_split(train_val.index.unique('doc'), 
                                      train_size=1-(1/k_folds)) # default 70:30
print(f'splitting data {len(train_idx)}:{len(val_idx)}')

train_set = train_val.loc[(train_idx, slice(None)),:]
val_set   = train_val.loc[(train_idx, slice(None)),:]

X_train, X_test = train_set[features], val_set[features]
y_train, y_test = train_set[pio], val_set[pio]

# copies = [deepcopy(data) for data in [X_train, y_train, X_test, y_test]]

splitting data 3600:900


In [6]:
# convert labels to string (pd.Categorical?)
y_train, y_test = [pd.concat([y[key].astype('str') for key in y],axis=1) for y in [y_train, y_test]]

print(f"Training model on validation set(s) of size {len(val_set)} with {len(val_set.index.unique('doc'))} documents")
print(f'Prepared dataset has {len(features)} features.')

Training model on validation set(s) of size 975513 with 3600 documents
Prepared dataset has 415 features.


# Model

Train a model for each feature. Can also try a multiclass model, since only 3% of the labels overlap.

In [None]:
%%notify

targets = TARGETS # choose which variables you want to predict
n_estimators = 300
n_runs = 1

# dump params to file

for run in range(n_runs):
    
    # cross-validation parameters should be configured here

    train_weights = (y_train.astype(int) * 0.9) + .1
    test_weights = (y_test.astype(int) * 0.9) + .1
        
    for target in TARGETS:
        
        print(f"Training GradientBoosting model to predict {target} on {len(X_train.index.unique('doc'))} training instances")

        clf = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=1.0, 
                                         verbose=1, max_depth=1, random_state=0)
              
        clf = clf.fit(X_train, y_train[target], sample_weight=train_weights[target].values)
              
        score = clf.score(X_test, y_test[target], sample_weight=test_weights[target].values)
              
        y_pred = clf.predict(X_test)
        y_true = y_test[target].values 
              

        
              
        print(f'Saving data for run {run}, target {target} in {dir_name}')
              
        save_model = True
        if save_model:
            Path(f'models/{dir_name}').mkdir(exist_ok=True)
            model_name = f'run{run}_' + target[0].upper()
            dump(clf, f'models/{dir_name}/{model_name}.joblib')
        
        exp_folder = EXPERIMENTS / dir_name / model_name

        exp_folder.mkdir(exist_ok=True, parents=True)

        feat_imp = pd.DataFrame(clf.feature_importances_, index=X_test.columns)
        train_loss = pd.DataFrame(clf.train_score_)
        feat_imp.sort_index().to_csv(exp_folder /  'feature_importance.csv')
        train_loss.to_csv(exp_folder / 'training_loss.csv')
            
        
        results = precision_recall_fscore_support(y_pred, y_true)
        results = pd.DataFrame.from_dict(results)
        results.index = metrics[1:]
        results.to_csv(exp_folder / 'results.csv')
        
        with (exp_folder / 'accuracy.txt').open('a') as f:
            f.write(f'{accuracy_score(y_pred, y_true)}')
            f.write('\n')
            f.write(f'{features}')
            
        pd.DataFrame([y_pred, y_true], index=['T','P']).to_csv(exp_folder / 'predictions.csv')

print('Done')

Training GradientBoosting model to predict participants on 3600 training instances
      Iter       Train Loss   Remaining Time 
         1           1.3174           59.04m
         2           1.2962           58.68m
         3           1.2815           58.28m
         4           1.2701           57.86m
         5           1.2611           57.57m
         6           1.2519           57.26m
         7           1.2460           57.07m
         8           1.2405           56.84m
         9           1.2353           56.58m
        10           1.2298           56.38m
        20           1.1930           54.38m
        30           1.1720           52.40m
        40           1.1578           50.66m


# Analysis

important aspects still to be implemented:
- parameter tuning w/ grid search (train short models on subsets? try param_grid and GridSearch)
- KCV (in combination with above)
- some testing with different preprocessing and feature sets.
- try importing seaborn and doing some EDA on feature importance data?
- test Random Forest version?
- additional feature extraction and tagging (dep relationships and sentence level features)
- If trees are interesting as a final model, XGBoost > sklearn on most problems.

In [None]:
# drop the least predictive features
    # average the values and drop the lowest?
    # do the same for self-trained if time
    
# save the column names to a json/pickle.

In [None]:
y_test.values

In [None]:
# drop low-relevancy columns from the embedding