In [1]:
import random
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import seaborn as sns


# Render floats in pandas output with thousands separators and five decimals,
# padded to a width of 20 characters.
pd.options.display.float_format = '{:20,.5f}'.format
# Apply seaborn's default plot styling.
sns.set()

# random.seed() returns None, so the seed value is stored first and the generators
# are seeded in separate statements.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)  # scikit-learn uses NumPy's global RNG when random_state is None
DATA = Path('data')
MODELS = Path('models')
EXPERIMENTS = Path('experiments')
TARGETS = ['participants','interventions','outcomes']
SUBSET = 'Train' # 'Test'
N_TRAIN_DOCS = 4500
# GradBoostClassifier model trained on dataset 'raw', with POS tag and
# PubMed data-trained FastText as features.
# NOTE: the parameters cell further down reassigns BASE_MODEL_NAME to 'GradBoost'.
BASE_MODEL_NAME = 'GBC_POS-PMFT'
# DOWNSAMPLE = False 

# add required libraries etc
%load_ext jupyternotify

<IPython.core.display.Javascript object>

In [2]:
# Load the training split. The path is built from the DATA constant so that the data
# directory is configured in one place only.
train_val = pd.read_parquet(DATA / 'split' / 'train.parquet') # attach words to the training set

# the test set is currently withheld. Test using a validation split (see below).

In [3]:
%%notify

def hotcode(df):
    """One-hot encode the non-numeric columns of ``df``.

    Columns that pandas treats as numeric are passed through unchanged. Every other
    column except ``'Word'`` is handed to ``pd.get_dummies``. ``'Word'`` is neither
    encoded nor included in the returned frame.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with any mix of numeric and non-numeric columns.

    Returns
    -------
    pd.DataFrame
        The dummy columns followed by the numeric columns of ``df``.

    Raises
    ------
    ValueError
        If ``df`` has no column to encode (callers use this to keep their
        original frame).
    """
    # NOTE: _get_numeric_data is a private pandas helper; kept so that the set of
    # columns regarded as numeric does not change.
    num_cols = df._get_numeric_data().columns
    numeric = set(num_cols)

    # A list in column order: a set has no stable order and pandas >= 2.0 refuses
    # sets as column indexers.
    cat_cols = [col for col in df.columns if col not in numeric and col != 'Word']

    print(set(cat_cols))
    if not cat_cols:
        # Make the "nothing to encode" case explicit instead of relying on
        # whatever pandas raises for an empty selection.
        raise ValueError('no categorical columns to encode')

    dummies = pd.get_dummies(df[cat_cols])

    print('hotcode complete.')
    # TODO: assert that every resulting column is numeric

    return pd.concat([dummies, df[num_cols]], axis=1)

# print('Word' in features)
# print(train_val.shape)

# Swap train_val for its one-hot encoded version. A ValueError from hotcode is
# taken to mean that there is nothing to encode; train_val then stays as loaded.
print('hotcoding categorical columns...')
try:
    encoded = hotcode(train_val)
except ValueError:
    print('No categorical values found in data')
else:
    train_val = encoded

hotcoding categorical columns...
set()
No categorical values found in data


<IPython.core.display.Javascript object>

# Parameters & Partitioning

The train/validation split below is a single 80:20 hold-out split that stands in for one fold of K-fold cross-validation, which is not implemented yet. See the code examples at https://scikit-learn.org/stable/modules/cross_validation.html and https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html for more information.

In [None]:
# NOTE: this replaces the BASE_MODEL_NAME from the configuration cell; the output
# folder below is named after the value set here.
BASE_MODEL_NAME = 'GradBoost'
k_folds = 5 # not implemented
time = datetime.now().strftime("%m%d-%H%M") # month, day, hour and minute of this run

pio = {"participants", "interventions", "outcomes"}

# Every column that is neither a label nor 'Word' is a feature. The names are kept
# as a list in DataFrame column order: a set has no stable order between kernel
# sessions, and pandas >= 2.0 does not accept a set as a column indexer.
non_features = pio.union({'Word'})
features = [col for col in train_val.columns if col not in non_features]

# 'accuracy' is stored separately; the remaining names label the rows of results.csv.
metrics = ['accuracy', 'precision', 'recall', 'f_score', 'support']

dir_name = BASE_MODEL_NAME  + f'_{time}'
print(f'Folder name set to {dir_name}')

Folder name set to GradBoost_0309-1800


Validation split (KCV currently not implemented, so just a simulation)

In [None]:
# Hold out 1/k_folds of the documents for validation (80:20 when k_folds = 5).
# The split is made on the unique values of the 'doc' index level, so all rows of a
# document end up on the same side.
doc_ids = train_val.index.unique('doc')
train_idx, val_idx = train_test_split(doc_ids,
                                      train_size=1-(1/k_folds),
                                      random_state=0) # fixed seed: same split on every run

print(f'splitting data {len(train_idx)}:{len(val_idx)}')

train_set = train_val.loc[(train_idx, slice(None)),:]
val_set   = train_val.loc[(val_idx, slice(None)),:]

# Columns are selected with lists: pandas >= 2.0 rejects sets as indexers, and a list
# gives the same column order on every run.
feature_cols = [col for col in train_val.columns if col in features]
label_cols = sorted(pio) + ['Word']

X_train, X_test = train_set[feature_cols], val_set[feature_cols]
y_train, y_test = train_set[label_cols], val_set[label_cols]

# copies = [deepcopy(data) for data in [X_train, y_train, X_test, y_test]]

splitting data 3600:900


In [None]:
# Cast every column of both label frames to str (pd.Categorical may be an alternative).
y_train, y_test = (labels.astype('str') for labels in (y_train, y_test))

print(f"Evluating model on validation set(s) of size {len(val_set)} with {len(val_set.index.unique('doc'))} documents")
print(f'Prepared dataset has {len(features)} features.')

# Model

Train one model per target label (participants, interventions, outcomes). A multiclass model could also be tried, since only 3% of the labels overlap.

The model trains `n_estimators` estimators, each of which is a binary decision tree that splits on a single feature (a decision stump, `max_depth=1`). 'Boosting' refers to the fact that the trees are added one after another, each one fitted to correct the errors of the ensemble built so far, so the score gradually improves. The label predicted for each word aggregates the contributions of all estimators. There are many tunable parameters for this model, but for initial testing I've used 300 estimators and 416 features (meaning not all features will be explored; see the feature importance results).

In [None]:
targets = TARGETS # choose which variables you want to predict
# NOTE(review): the text above mentions 300 estimators; 3 looks like a quick
# smoke-test value - confirm before a real run.
n_estimators = 3
n_runs = 1
weight_factor = 2 # weight factor of positive class compared to negative

# The path is built from DATA so that it resolves on POSIX as well as on Windows
# (a literal 'data\\raw\\labels.pkl' only works on Windows).
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
words = pd.read_pickle(DATA / 'raw' / 'labels.pkl')

# Guard against label leakage: neither the word itself nor any target label may be
# part of the feature set.
assert "Word" not in features
assert all(label not in features for label in pio)

In [None]:
%%notify

# dump params to file

# Train, evaluate and store one GradientBoostingClassifier per run and per target.
# For every (run, target) pair the fitted model goes to models/<dir_name>/ and the
# feature importances, training loss, predictions and metrics go to
# experiments/<dir_name>/<model_name>/.

# Per-row sample weights, assuming 0/1 labels: label 1 gets weight 1, label 0 gets
# 1 / weight_factor. They do not depend on the run, so they are computed once.
pio_cols = sorted(pio) # a list, because pandas >= 2.0 rejects a set as column indexer
train_weights = y_train[pio_cols].astype(int) * (1 - 1/weight_factor) + 1 / weight_factor
test_weights = y_test[pio_cols].astype(int) * (1 - 1/weight_factor) + 1 / weight_factor

save_model = True # set to False to skip writing the fitted models to disk

for run in range(n_runs):

    # cross-validation parameters should be configured here

    # Iterate over the user-selected `targets`, not the full TARGETS constant.
    for target in targets:

        print(f"Training GradientBoosting model to predict {target} "
              f"on {len(X_train.index.unique('doc'))} training instances " 
              f"for label {target}")

        clf = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=1.0, 
                                         verbose=1, max_depth=1, random_state=0)

        clf = clf.fit(X_train, y_train[target], sample_weight=train_weights[target].values)

        y_pred = clf.predict(X_test)
        y_true = y_test[target].values

        # Weighted accuracy; the same quantity clf.score() reports, computed from the
        # predictions already made so that the model is not evaluated twice.
        score = accuracy_score(y_true, y_pred, sample_weight=test_weights[target])

        print(f'Saving data for run {run}, target {target} in {dir_name}')

        # Defined outside the `if` because the experiment folder needs it even when
        # the model itself is not saved.
        model_name = f'run{run}_' + target[0].upper()
        if save_model:
            model_dir = MODELS / dir_name
            model_dir.mkdir(parents=True, exist_ok=True) # also creates 'models' if missing
            dump(clf, model_dir / f'{model_name}.joblib')

        exp_folder = EXPERIMENTS / dir_name / model_name
        exp_folder.mkdir(exist_ok=True, parents=True)

        feat_imp = pd.DataFrame(clf.feature_importances_, index=X_test.columns)
        train_loss = pd.DataFrame(clf.train_score_)
        feat_imp.sort_index().to_csv(exp_folder /  'feature_importance.csv')
        train_loss.to_csv(exp_folder / 'training_loss.csv')

        # One row per validation word: the word, its true label (T) and prediction (P).
        pred_results = pd.DataFrame([y_test['Word'].values, y_true, y_pred], 
                                    index=['Word', 'T', 'P']).T
        pred_results.to_csv(exp_folder / 'predictions.csv')

        # precision_recall_fscore_support returns four per-class arrays, in the order
        # of metrics[1:]; each becomes one row of results.csv.
        prfs = precision_recall_fscore_support(y_true, y_pred, sample_weight=test_weights[target])
        results = pd.DataFrame(list(prfs), index=metrics[1:])
        results.to_csv(exp_folder / 'results.csv')

        # Opened in append mode: re-running with the same dir_name adds to the file.
        with (exp_folder / 'accuracy.txt').open('a') as f:
            f.write(f'{score}')
            f.write('\n')
            f.write(f'{features}')

print('Done')

important aspects still to be implemented:
- parameter tuning w/ grid search (train short models on subsets? try param_grid and GridSearch)
- KCV (in combination with above)
- some testing with different preprocessing and feature sets.
- try importing seaborn and doing some EDA on feature importance data?
- test Random Forest version?
- additional feature extraction and tagging (dep relationships and sentence level features)
- If trees are interesting as a final model, XGBoost > sklearn on most problems.