# Word-level classification with a Gradient-Boosting model

NOTE: the script can be copied to run in parallel on multiple cores.

In [1]:
import random
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import seaborn as sns

# Render floats with thousands separators and five decimals (right-aligned in a
# 20-character field) and switch matplotlib to seaborn's default theme.
pd.set_option('display.float_format', '{:20,.5f}'.format)
sns.set()

# Seed value for the stochastic steps of the notebook. random.seed() returns None,
# so the value is stored first and only then used to seed the generators.
SEED = 0
random.seed(SEED)
# scikit-learn draws from NumPy's global RNG whenever random_state is None,
# so seeding it here makes such calls repeatable after a kernel restart.
np.random.seed(SEED)
EXPERIMENTS = Path('experiments')  # root folder for the per-run experiment outputs
SUBSET = 'Train' # 'Test'
BASE_MODEL_NAME = 'GBC_POS-PMFT' # GradBoostClassifier model trained on dataset 'raw', with POS tag and 
                                    # PubMed data-trained FastText as features.    

# add required libraries etc
%load_ext jupyternotify

<IPython.core.display.Javascript object>

In [2]:
# Feature table that is later split into training and validation parts.
# Alternative input: 'data/split/train.parquet'
features_file = 'data/features/km_4-8-12_10x300.parquet'
train_val = pd.read_parquet(features_file)

# the test set is currently withheld (493 documents). Test using a validation split (see below).

In [3]:
%%notify -m "Finished one-hot coding columns"

def hotcode(df):
    """One-hot encode the non-numeric columns of ``df``.

    Every non-numeric column except ``'Word'`` is expanded with
    ``pd.get_dummies``. Numeric columns are passed through unchanged, and a
    non-numeric ``'Word'`` column is carried over as-is instead of being
    dropped from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding numeric and/or categorical columns.

    Returns
    -------
    pandas.DataFrame
        The dummy columns, followed by ``'Word'`` (when it is non-numeric),
        followed by the numeric columns, on the same index as ``df``.

    Raises
    ------
    ValueError
        If ``df`` has no categorical column to encode.
    """
    num_cols = list(df._get_numeric_data().columns)
    numeric = set(num_cols)

    # Column lists are built in frame order: a set is not a valid pandas
    # column indexer and its iteration order is arbitrary.
    non_numeric = [col for col in df.columns if col not in numeric]
    cat_cols = [col for col in non_numeric if col != 'Word']
    word_cols = [col for col in non_numeric if col == 'Word']

    print(set(cat_cols))
    if not cat_cols:
        # Raised explicitly so callers can rely on ValueError for this case.
        raise ValueError('No categorical columns to encode')

    dummies = pd.get_dummies(df[cat_cols])

    print('hotcode complete.')
    # assert check that type is numeric for all

    return pd.concat([dummies, df[word_cols], df[num_cols]], axis=1)

# One-hot encode the loaded frame; when hotcode() raises ValueError the frame
# is kept exactly as it was loaded.
print('hotcoding categorical columns...')
try:
    encoded = hotcode(train_val)
except ValueError:
    print('No categorical values found in data')
else:
    train_val = encoded

hotcoding categorical columns...
set()
No categorical values found in data


<IPython.core.display.Javascript object>

# Parameters & Partitioning

The training/validation split below is only a stand-in for K-fold cross-validation: it simulates a single fold with an 80:20 train/validation ratio. See the code examples at https://scikit-learn.org/stable/modules/cross_validation.html and https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html for more information.

In [4]:
# Label columns, one per annotation type.
pio = {"participants", "interventions", "outcomes"}

# Every column that is neither a label nor 'Word' is treated as a model feature.
features = set(train_val.columns) - (pio | {'Word'})

# Row names for the saved score table; the loop below uses metrics[1:] for it
# and writes accuracy to its own file.
metrics = ['accuracy', 'precision', 'recall', 'f_score', 'support']

# Only used to derive the train/validation ratio of the split.
k_folds = 5 # not implemented

Validation split (KCV currently not implemented, so just a simulation)

In [5]:
# Split on the unique values of index level 'doc', so every row of a document
# lands on the same side of the split.
doc_ids = train_val.index.unique('doc')
train_idx, val_idx = train_test_split(doc_ids,
                                      train_size=1-(1/k_folds),  # 80:20 when k_folds = 5
                                      random_state=SEED)  # SEED comes from the config cell; if it is
                                                          # None the split uses NumPy's global RNG

print(f'splitting data {len(train_idx)}:{len(val_idx)}')

train_set = train_val.loc[(train_idx, slice(None)),:]
val_set   = train_val.loc[(val_idx, slice(None)),:]

# Column lists are built in frame order: a set is not a valid pandas column
# indexer and its iteration order is arbitrary.
label_names = pio.union({'Word'})
missing_labels = label_names.difference(train_val.columns)
if missing_labels:
    raise KeyError(f'columns missing from the data: {sorted(missing_labels)}')
feature_cols = [col for col in train_val.columns if col in features]
label_cols = [col for col in train_val.columns if col in label_names]

X_train, X_test = train_set[feature_cols], val_set[feature_cols]
y_train, y_test = train_set[label_cols], val_set[label_cols]

splitting data 3600:900


In [6]:
# Cast every column of both label frames to string (pd.Categorical?)
y_train = y_train.astype('str')
y_test = y_test.astype('str')

# Report the size of the validation data and the width of the feature set.
print(f"Evluating model on validation set(s) of size {len(val_set)} with {len(val_set.index.unique('doc'))} documents")
print(f'Prepared dataset has {len(features)} features.')

Evluating model on validation set(s) of size 243684 with 900 documents
Prepared dataset has 120 features.


# Model

Train one model per target label (participants, interventions, outcomes). A single multiclass model could also be tried, since only 3% of the labels overlap.

The model trains `n_estimators` estimators, each of which is a binary decision tree that splits on a single feature. 'Boosting' refers to the fact that the trees are fitted one after another, each new tree correcting the errors of the ensemble built so far, so the score gradually improves. The label prediction for each word is aggregated over all estimators. There are many tunable parameters for this model, but for initial testing I've used 300 estimators and 416 features (meaning not all features will be explored; see the feature importance results).

The class weights are set with the parameter `weight_factor`. If `weight_factor=5`, the class weight of the positive label is five times that of the negative label.

It is highly advisable to do a short test run with only a few estimators (e.g. 3) to check that all results are saved correctly before starting a run with a long training time.

In [7]:
# Default folder name: the model name followed by a month-day/hour-minute stamp.
BASE_MODEL_NAME = 'GBM'
time = datetime.now().strftime("%m%d-%H%M")
dir_name = '_'.join((BASE_MODEL_NAME, time))

# Manual override of the folder name. Careful: there is no warning when existing
# data is overwritten. Useful when several copies of the notebook run in
# parallel (multi-processor).
dir_name = 'kM_clusttree_4-8-12'

print(f'Folder name set to {dir_name}')

Folder name set to kM_clusttree_4-8-12


In [8]:
targets = ['participants','interventions','outcomes']  # one model is trained per entry
n_estimators = 300
n_runs = 10
weight_factor = 5 # weight factor of positive class compared to negative

# Built with pathlib so the path resolves on every OS; the previous
# backslash-separated string only works on Windows.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
words = pd.read_pickle(Path('data') / 'raw' / 'labels.pkl')

# Guard against leakage: neither the 'Word' column nor any label column may be
# among the model features.
assert "Word" not in features
assert pio.isdisjoint(features)

# use these overrides for multiple scripts
# targets = ['participants']
# targets = ['interventions']
# targets = ['outcomes']

In [9]:
for run in range(n_runs):
    
    # cross-validation parameters defined here for now. Use sklearn.GridSearchCV for param tuning

    train_weights = y_train[pio].astype(int) * (1 - 1/weight_factor) + 1 / weight_factor
    test_weights = y_test[pio].astype(int) * (1 - 1/weight_factor) + 1 / weight_factor
        
    for target in targets:
        
        %notify -m f"Starting new iteration for {target}"    
            
        print(f"Training GradientBoosting model to predict {target} "
              f"on {len(X_train.index.unique('doc'))} training instances " 
              f"for label {target}")

        clf = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=1.0, 
                                         verbose=1, max_depth=1, random_state=0)
              
        clf = clf.fit(X_train, y_train[target], sample_weight=train_weights[target].values)
              
        score = clf.score(X_test, y_test[target], sample_weight=test_weights[target].values)
              
        y_pred = clf.predict(X_test)
        y_true = y_test[target].values 
              
        print(f'Saving data for run {run}, target {target} in {dir_name}')
              
        model_name = target[0].upper() + f'_wf{weight_factor}_run{run}'
        save_model = True
        
        if save_model:
            Path(f'models/{dir_name}').mkdir(exist_ok=True, parents=True)
            dump(clf, f'models/{dir_name}/{model_name}.joblib')
        
        exp_folder = EXPERIMENTS / dir_name / model_name
        exp_folder.mkdir(exist_ok=True, parents=True)

        feat_imp = pd.DataFrame(clf.feature_importances_, index=X_test.columns)
        train_loss = pd.DataFrame(clf.train_score_)
        feat_imp.sort_index().to_csv(exp_folder /  'feature_importance.csv')
        train_loss.to_csv(exp_folder / 'training_loss.csv')
        
        pred_results = pd.DataFrame([y_test['Word'].values, y_true, y_pred], 
                                    index=['Word', 'T', 'P']).T
        pred_results.to_csv(exp_folder / 'predictions.csv')
              
        results = precision_recall_fscore_support(y_true, y_pred, sample_weight=test_weights[target])
        results = pd.DataFrame.from_dict(results)
        results.index = metrics[1:]
        results.to_csv(exp_folder / 'results.csv')
        
        with (exp_folder / 'accuracy.txt').open('a') as f:
            f.write(f'{accuracy_score(y_true, y_pred, sample_weight=test_weights[target])}')
            f.write('\n')
            f.write(f'{features}')
            f.close()

print('Done')

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict participants on 3600 training instances for label participants
      Iter       Train Loss   Remaining Time 
         1           1.3088            8.25m
         2           1.2746            8.28m
         3           1.2456            8.10m
         4           1.2259            7.96m
         5           1.2128            7.87m
         6           1.2052            7.88m
         7           1.1999            7.91m
         8           1.1941            7.87m
         9           1.1897            7.86m
        10           1.1862            7.85m
        20           1.1607            7.67m
        30           1.1498            7.56m
        40           1.1437            7.18m
        50           1.1398            6.84m
        60           1.1373            6.61m
        70           1.1359            6.88m
        80           1.1348            6.91m
        90           1.1338            6.91m
       100           1.1330           

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict interventions on 3600 training instances for label interventions
      Iter       Train Loss   Remaining Time 
         1           1.2026            9.76m
         2           1.1810            9.13m
         3           1.1613            9.70m
         4           1.1508           10.28m
         5           1.1427           10.30m
         6           1.1373           10.51m
         7           1.1318           10.62m
         8           1.1273           10.69m
         9           1.1223           10.71m
        10           1.1179           10.72m
        20           1.0875           10.10m
        30           1.0748            9.67m
        40           1.0700            9.28m
        50           1.0671            8.96m
        60           1.0653            8.60m
        70           1.0642            8.23m
        80           1.0635            7.86m
        90           1.0630            7.48m
       100           1.0626         

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict outcomes on 3600 training instances for label outcomes
      Iter       Train Loss   Remaining Time 
         1           1.3175           10.53m
         2           1.3035           10.88m
         3           1.2931           10.61m
         4           1.2834           10.79m
         5           1.2759           10.56m
         6           1.2696           10.57m
         7           1.2619           10.47m
         8           1.2556           10.61m
         9           1.2504           10.62m
        10           1.2462           10.58m
        20           1.2207           10.17m
        30           1.2115            9.61m
        40           1.2063            9.12m
        50           1.2023            8.79m
        60           1.1997            8.46m
        70           1.1972            8.11m
        80           1.1957            7.81m
        90           1.1940            7.44m
       100           1.1930            7.06m
 

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict participants on 3600 training instances for label participants
      Iter       Train Loss   Remaining Time 
         1           1.3088            9.78m
         2           1.2746            9.74m
         3           1.2456            9.93m
         4           1.2259           10.01m
         5           1.2128           10.05m
         6           1.2052           10.04m
         7           1.1999            9.96m
         8           1.1941            9.96m
         9           1.1897            9.99m
        10           1.1862            9.92m
        20           1.1607            9.93m
        30           1.1498            9.57m
        40           1.1437            9.24m
        50           1.1398            8.85m
        60           1.1373            8.54m
        70           1.1359            8.18m
        80           1.1348            7.85m
        90           1.1338            7.52m
       100           1.1330           

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict interventions on 3600 training instances for label interventions
      Iter       Train Loss   Remaining Time 
         1           1.2026           10.46m
         2           1.1810           10.24m
         3           1.1613           10.82m
         4           1.1508           10.96m
         5           1.1427           10.43m
         6           1.1373           10.06m
         7           1.1318           10.28m
         8           1.1273           10.30m
         9           1.1223           10.23m
        10           1.1179           10.11m
        20           1.0875            9.68m
        30           1.0748            9.31m
        40           1.0700            9.19m
        50           1.0671            8.86m
        60           1.0653            8.48m
        70           1.0642            8.12m
        80           1.0635            7.72m
        90           1.0630            7.39m
       100           1.0626         

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict outcomes on 3600 training instances for label outcomes
      Iter       Train Loss   Remaining Time 
         1           1.3175            9.89m
         2           1.3035           10.41m
         3           1.2931           11.01m
         4           1.2834           11.23m
         5           1.2759           10.92m
         6           1.2696           10.68m
         7           1.2619           10.56m
         8           1.2556           10.41m
         9           1.2504           10.37m
        10           1.2462           10.41m
        20           1.2207            9.89m
        30           1.2115            9.37m
        40           1.2063            9.03m
        50           1.2023            8.75m
        60           1.1997            8.50m
        70           1.1972            8.16m
        80           1.1957            7.81m
        90           1.1940            7.43m
       100           1.1930            7.08m
 

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict participants on 3600 training instances for label participants
      Iter       Train Loss   Remaining Time 
         1           1.3088           10.73m
         2           1.2746           11.09m
         3           1.2456           11.05m
         4           1.2259           11.04m
         5           1.2128           10.86m
         6           1.2052           10.87m
         7           1.1999           10.85m
         8           1.1941           10.68m
         9           1.1897           10.71m
        10           1.1862           10.71m
        20           1.1607           10.35m
        30           1.1498            9.82m
        40           1.1437            9.38m
        50           1.1398            9.05m
        60           1.1373            8.71m
        70           1.1359            8.36m
        80           1.1348            8.00m
        90           1.1338            7.57m
       100           1.1330           

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict interventions on 3600 training instances for label interventions
      Iter       Train Loss   Remaining Time 
         1           1.2026            9.60m
         2           1.1810           10.52m
         3           1.1613           10.26m
         4           1.1508           10.18m
         5           1.1427           10.04m
         6           1.1373           10.02m
         7           1.1318           10.04m
         8           1.1273            9.79m
         9           1.1223            9.88m
        10           1.1179            9.86m
        20           1.0875            9.75m
        30           1.0748            9.53m
        40           1.0700            9.12m
        50           1.0671            8.84m
        60           1.0653            8.45m
        70           1.0642            8.05m
        80           1.0635            7.69m
        90           1.0630            7.35m
       100           1.0626         

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict outcomes on 3600 training instances for label outcomes
      Iter       Train Loss   Remaining Time 
         1           1.3175            7.88m
         2           1.3035            7.56m
         3           1.2931            7.68m
         4           1.2834            7.76m
         5           1.2759            7.76m
         6           1.2696            7.66m
         7           1.2619            7.72m
         8           1.2556            7.73m
         9           1.2504            7.67m
        10           1.2462            7.69m
        20           1.2207            6.99m
        30           1.2115            6.46m
        40           1.2063            6.06m
        50           1.2023            5.71m
        60           1.1997            5.41m
        70           1.1972            5.14m
        80           1.1957            4.91m
        90           1.1940            4.66m
       100           1.1930            4.42m
 

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict participants on 3600 training instances for label participants
      Iter       Train Loss   Remaining Time 
         1           1.3088            6.68m
         2           1.2746            6.53m
         3           1.2456            6.40m
         4           1.2259            6.32m
         5           1.2128            6.28m
         6           1.2052            6.29m
         7           1.1999            6.25m
         8           1.1941            6.19m
         9           1.1897            6.19m
        10           1.1862            6.16m
        20           1.1607            5.89m
        30           1.1498            5.68m
        40           1.1437            5.45m
        50           1.1398            5.24m
        60           1.1373            5.02m
        70           1.1359            4.81m
        80           1.1348            4.60m
        90           1.1338            4.40m
       100           1.1330           

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict interventions on 3600 training instances for label interventions
      Iter       Train Loss   Remaining Time 
         1           1.2026            6.25m
         2           1.1810            6.22m
         3           1.1613            6.22m
         4           1.1508            6.16m
         5           1.1427            6.12m
         6           1.1373            6.12m
         7           1.1318            6.13m
         8           1.1273            6.10m
         9           1.1223            6.08m
        10           1.1179            6.06m
        20           1.0875            5.86m
        30           1.0748            5.65m
        40           1.0700            5.45m
        50           1.0671            5.25m
        60           1.0653            5.05m
        70           1.0642            4.83m
        80           1.0635            4.62m
        90           1.0630            4.41m
       100           1.0626         

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict outcomes on 3600 training instances for label outcomes
      Iter       Train Loss   Remaining Time 
         1           1.3175            6.98m
         2           1.3035            6.65m
         3           1.2931            6.48m
         4           1.2834            6.37m
         5           1.2759            6.31m
         6           1.2696            6.27m
         7           1.2619            6.23m
         8           1.2556            6.20m
         9           1.2504            6.17m
        10           1.2462            6.14m
        20           1.2207            5.93m
        30           1.2115            5.75m
        40           1.2063            5.51m
        50           1.2023            5.28m
        60           1.1997            5.08m
        70           1.1972            4.87m
        80           1.1957            4.66m
        90           1.1940            4.44m
       100           1.1930            4.22m
 

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict participants on 3600 training instances for label participants
      Iter       Train Loss   Remaining Time 
         1           1.3088            6.84m
         2           1.2746            6.52m
         3           1.2456            6.42m
         4           1.2259            6.35m
         5           1.2128            6.31m
         6           1.2052            6.29m
         7           1.1999            6.25m
         8           1.1941            6.23m
         9           1.1897            6.19m
        10           1.1862            6.17m
        20           1.1607            5.90m
        30           1.1498            5.71m
        40           1.1437            5.48m
        50           1.1398            5.26m
        60           1.1373            5.03m
        70           1.1359            4.83m
        80           1.1348            4.61m
        90           1.1338            4.41m
       100           1.1330           

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict interventions on 3600 training instances for label interventions
      Iter       Train Loss   Remaining Time 
         1           1.2026            6.54m
         2           1.1810            6.47m
         3           1.1613            6.45m
         4           1.1508            6.39m
         5           1.1427            6.36m
         6           1.1373            6.38m
         7           1.1318            6.38m
         8           1.1273            6.35m
         9           1.1223            6.32m
        10           1.1179            6.28m
        20           1.0875            6.03m
        30           1.0748            5.77m
        40           1.0700            5.53m
        50           1.0671            5.34m
        60           1.0653            5.10m
        70           1.0642            4.87m
        80           1.0635            4.65m
        90           1.0630            4.44m
       100           1.0626         

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict outcomes on 3600 training instances for label outcomes
      Iter       Train Loss   Remaining Time 
         1           1.3175            6.73m
         2           1.3035            6.57m
         3           1.2931            6.50m
         4           1.2834            6.44m
         5           1.2759            6.36m
         6           1.2696            6.31m
         7           1.2619            6.26m
         8           1.2556            6.22m
         9           1.2504            6.18m
        10           1.2462            6.16m
        20           1.2207            5.89m
        30           1.2115            5.68m
        40           1.2063            5.46m
        50           1.2023            5.24m
        60           1.1997            5.03m
        70           1.1972            4.82m
        80           1.1957            4.61m
        90           1.1940            4.40m
       100           1.1930            4.18m
 

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict participants on 3600 training instances for label participants
      Iter       Train Loss   Remaining Time 
         1           1.3088            6.27m
         2           1.2746            6.29m
         3           1.2456            6.24m
         4           1.2259            6.19m
         5           1.2128            6.17m
         6           1.2052            6.15m
         7           1.1999            6.11m
         8           1.1941            6.07m
         9           1.1897            6.06m
        10           1.1862            6.03m
        20           1.1607            5.82m
        30           1.1498            5.63m
        40           1.1437            5.42m
        50           1.1398            5.21m
        60           1.1373            5.00m
        70           1.1359            4.80m
        80           1.1348            4.59m
        90           1.1338            4.38m
       100           1.1330           

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict interventions on 3600 training instances for label interventions
      Iter       Train Loss   Remaining Time 
         1           1.2026            6.21m
         2           1.1810            6.44m
         3           1.1613            6.35m
         4           1.1508            6.25m
         5           1.1427            6.18m
         6           1.1373            6.18m
         7           1.1318            6.16m
         8           1.1273            6.12m
         9           1.1223            6.09m
        10           1.1179            6.08m
        20           1.0875            5.89m
        30           1.0748            5.70m
        40           1.0700            5.47m
        50           1.0671            5.25m
        60           1.0653            5.04m
        70           1.0642            4.83m
        80           1.0635            4.62m
        90           1.0630            4.40m
       100           1.0626         

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict outcomes on 3600 training instances for label outcomes
      Iter       Train Loss   Remaining Time 
         1           1.3175            6.51m
         2           1.3035            6.39m
         3           1.2931            6.28m
         4           1.2834            6.27m
         5           1.2759            6.23m
         6           1.2696            6.20m
         7           1.2619            6.15m
         8           1.2556            6.14m
         9           1.2504            6.11m
        10           1.2462            6.07m
        20           1.2207            5.86m
        30           1.2115            5.66m
        40           1.2063            5.45m
        50           1.2023            5.23m
        60           1.1997            5.03m
        70           1.1972            4.84m
        80           1.1957            4.62m
        90           1.1940            4.41m
       100           1.1930            4.20m
 

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict participants on 3600 training instances for label participants
      Iter       Train Loss   Remaining Time 
         1           1.3088            6.38m
         2           1.2746            6.33m
         3           1.2456            6.26m
         4           1.2259            6.21m
         5           1.2128            6.21m
         6           1.2052            6.18m
         7           1.1999            6.14m
         8           1.1941            6.10m
         9           1.1897            6.10m
        10           1.1862            6.08m
        20           1.1607            5.83m
        30           1.1498            5.64m
        40           1.1437            5.42m
        50           1.1398            5.22m
        60           1.1373            5.00m
        70           1.1359            4.79m
        80           1.1348            4.58m
        90           1.1338            4.38m
       100           1.1330           

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict interventions on 3600 training instances for label interventions
      Iter       Train Loss   Remaining Time 
         1           1.2026            6.12m
         2           1.1810            6.26m
         3           1.1613            6.25m
         4           1.1508            6.24m
         5           1.1427            6.21m
         6           1.1373            6.30m
         7           1.1318            6.26m
         8           1.1273            6.21m
         9           1.1223            6.19m
        10           1.1179            6.16m
        20           1.0875            5.89m
        30           1.0748            5.67m
        40           1.0700            5.45m
        50           1.0671            5.24m
        60           1.0653            5.02m
        70           1.0642            4.81m
        80           1.0635            4.60m
        90           1.0630            4.39m
       100           1.0626         

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict outcomes on 3600 training instances for label outcomes
      Iter       Train Loss   Remaining Time 
         1           1.3175            6.51m
         2           1.3035            6.53m
         3           1.2931            6.41m
         4           1.2834            6.32m
         5           1.2759            6.26m
         6           1.2696            6.28m
         7           1.2619            6.24m
         8           1.2556            6.20m
         9           1.2504            6.17m
        10           1.2462            6.15m
        20           1.2207            5.91m
        30           1.2115            5.73m
        40           1.2063            5.50m
        50           1.2023            5.27m
        60           1.1997            5.05m
        70           1.1972            4.84m
        80           1.1957            4.63m
        90           1.1940            4.41m
       100           1.1930            4.19m
 

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict participants on 3600 training instances for label participants
      Iter       Train Loss   Remaining Time 
         1           1.3088            6.39m
         2           1.2746            6.30m
         3           1.2456            6.26m
         4           1.2259            6.21m
         5           1.2128            6.19m
         6           1.2052            6.19m
         7           1.1999            6.15m
         8           1.1941            6.11m
         9           1.1897            6.10m
        10           1.1862            6.08m
        20           1.1607            5.86m
        30           1.1498            5.66m
        40           1.1437            5.46m
        50           1.1398            5.25m
        60           1.1373            5.03m
        70           1.1359            4.81m
        80           1.1348            4.60m
        90           1.1338            4.39m
       100           1.1330           

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict interventions on 3600 training instances for label interventions
      Iter       Train Loss   Remaining Time 
         1           1.2026            6.50m
         2           1.1810            6.38m
         3           1.1613            6.30m
         4           1.1508            6.28m
         5           1.1427            6.27m
         6           1.1373            6.22m
         7           1.1318            6.23m
         8           1.1273            6.20m
         9           1.1223            6.18m
        10           1.1179            6.16m
        20           1.0875            5.93m
        30           1.0748            5.71m
        40           1.0700            5.48m
        50           1.0671            5.26m
        60           1.0653            5.03m
        70           1.0642            4.81m
        80           1.0635            4.60m
        90           1.0630            4.40m
       100           1.0626         

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict outcomes on 3600 training instances for label outcomes
      Iter       Train Loss   Remaining Time 
         1           1.3175            6.43m
         2           1.3035            6.42m
         3           1.2931            6.34m
         4           1.2834            6.37m
         5           1.2759            6.32m
         6           1.2696            6.29m
         7           1.2619            6.25m
         8           1.2556            6.21m
         9           1.2504            6.19m
        10           1.2462            6.16m
        20           1.2207            5.91m
        30           1.2115            5.70m
        40           1.2063            5.49m
        50           1.2023            5.28m
        60           1.1997            5.06m
        70           1.1972            4.85m
        80           1.1957            4.63m
        90           1.1940            4.42m
       100           1.1930            4.21m
 

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict participants on 3600 training instances for label participants
      Iter       Train Loss   Remaining Time 
         1           1.3088            6.30m
         2           1.2746            6.24m
         3           1.2456            6.21m
         4           1.2259            6.29m
         5           1.2128            6.42m
         6           1.2052            6.41m
         7           1.1999            6.35m
         8           1.1941            6.32m
         9           1.1897            6.27m
        10           1.1862            6.21m
        20           1.1607            5.93m
        30           1.1498            5.70m
        40           1.1437            5.47m
        50           1.1398            5.25m
        60           1.1373            5.03m
        70           1.1359            4.81m
        80           1.1348            4.61m
        90           1.1338            4.40m
       100           1.1330           

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict interventions on 3600 training instances for label interventions
      Iter       Train Loss   Remaining Time 
         1           1.2026            6.50m
         2           1.1810            6.30m
         3           1.1613            6.25m
         4           1.1508            6.19m
         5           1.1427            6.19m
         6           1.1373            6.16m
         7           1.1318            6.17m
         8           1.1273            6.18m
         9           1.1223            6.16m
        10           1.1179            6.12m
        20           1.0875            5.86m
        30           1.0748            5.67m
        40           1.0700            5.46m
        50           1.0671            5.25m
        60           1.0653            5.03m
        70           1.0642            4.82m
        80           1.0635            4.62m
        90           1.0630            4.40m
       100           1.0626         

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict outcomes on 3600 training instances for label outcomes
      Iter       Train Loss   Remaining Time 
         1           1.3175            6.61m
         2           1.3035            6.58m
         3           1.2931            6.41m
         4           1.2834            6.35m
         5           1.2759            6.31m
         6           1.2696            6.27m
         7           1.2619            6.22m
         8           1.2556            6.20m
         9           1.2504            6.15m
        10           1.2462            6.12m
        20           1.2207            5.90m
        30           1.2115            5.68m
        40           1.2063            5.46m
        50           1.2023            5.25m
        60           1.1997            5.05m
        70           1.1972            4.84m
        80           1.1957            4.64m
        90           1.1940            4.52m
       100           1.1930            4.30m
 

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict participants on 3600 training instances for label participants
      Iter       Train Loss   Remaining Time 
         1           1.3088            6.24m
         2           1.2746            6.19m
         3           1.2456            6.23m
         4           1.2259            6.20m
         5           1.2128            6.20m
         6           1.2052            6.19m
         7           1.1999            6.19m
         8           1.1941            6.14m
         9           1.1897            6.10m
        10           1.1862            6.08m
        20           1.1607            5.84m
        30           1.1498            5.65m
        40           1.1437            5.44m
        50           1.1398            5.23m
        60           1.1373            5.02m
        70           1.1359            4.81m
        80           1.1348            4.60m
        90           1.1338            4.39m
       100           1.1330           

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict interventions on 3600 training instances for label interventions
      Iter       Train Loss   Remaining Time 
         1           1.2026            6.40m
         2           1.1810            6.27m
         3           1.1613            6.24m
         4           1.1508            6.20m
         5           1.1427            6.21m
         6           1.1373            6.17m
         7           1.1318            6.15m
         8           1.1273            6.12m
         9           1.1223            6.11m
        10           1.1179            6.08m
        20           1.0875            5.86m
        30           1.0748            5.66m
        40           1.0700            5.43m
        50           1.0671            5.22m
        60           1.0653            5.02m
        70           1.0642            4.80m
        80           1.0635            4.59m
        90           1.0630            4.38m
       100           1.0626         

<IPython.core.display.Javascript object>

Training GradientBoosting model to predict outcomes on 3600 training instances for label outcomes
      Iter       Train Loss   Remaining Time 
         1           1.3175            6.42m
         2           1.3035            6.55m
         3           1.2931            6.38m
         4           1.2834            6.35m
         5           1.2759            6.29m
         6           1.2696            6.25m
         7           1.2619            6.20m
         8           1.2556            6.19m
         9           1.2504            6.16m
        10           1.2462            6.13m
        20           1.2207            5.91m
        30           1.2115            5.71m
        40           1.2063            5.50m
        50           1.2023            5.27m
        60           1.1997            5.06m
        70           1.1972            4.85m
        80           1.1957            4.64m
        90           1.1940            4.41m
       100           1.1930            4.20m
 

Important aspects still to be implemented:
- parameter tuning with grid search (train short models on subsets? try a `param_grid` with scikit-learn's `GridSearchCV`)
- K-fold cross-validation (KCV), in combination with the above
- some testing with different preprocessing and feature sets.
- try using seaborn (already imported at the top of the notebook) for some exploratory data analysis (EDA) on the feature importance data?
- test Random Forest version?
- additional feature extraction and tagging (dependency relationships and sentence-level features)
- If trees are interesting as a final model, consider XGBoost, which tends to outperform sklearn's gradient boosting on most problems.