<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [34]:
%%javascript
// ToC script
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [30]:
# Code hider, source: http://chris-said.io/2016/02/13/how-to-make-polished-jupyter-presentations-with-optional-code-visibility/
from IPython.display import HTML

HTML('''
<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }

  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>
''')

In [97]:
%matplotlib inline

from collections import Counter
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from autosklearn.classification import AutoSklearnClassifier
import autosklearn.metrics as auto_metrics
from imblearn.over_sampling import SMOTE
import sklearn.metrics as sk_metrics
from sklearn.metrics import accuracy_score, confusion_matrix

In [19]:
# load models and data
warnings.filterwarnings('ignore')
top_5_model = pickle.load(open("bleen/top_5_best_model_final.automl", "rb"))
top_10_model = pickle.load(open("bleen/top_10_best_model_final.automl", "rb"))

top_5_train = pickle.load(open('../data/top_5_train_features.df', 'rb'))
top_5_test =  pickle.load(open('../data/top_5_test_features.df', 'rb'))

top_5_emc_train = pickle.load(open('../data/top_5_emc_train_features.df', 'rb'))
top_5_emc_test =  pickle.load(open('../data/top_5_emc_test_features.df', 'rb'))

top_10_train = pickle.load(open('../data/train_features.df', 'rb'))
top_10_test =  pickle.load(open('../data/test_features.df', 'rb'))

top_10_emc_train = pickle.load(open('../data/top_10_emc_train_features.df', 'rb'))
top_10_emc_test =  pickle.load(open('../data/top_10_emc_test_features.df', 'rb'))

replace_dict = {
    'contact_type': {
        "work": 0,
        "friend": 1,
        "task": 2,
        "family_live_separate": 3,
        "family_live_together": 4,
#        "other": 5,
        "sig_other": 6
    }
}

contact_types = list(replace_dict['contact_type'].keys())

rand_seed = 2

Could not delete output dir: /tmp/autosklearn_output_13441_3621
Could not delete tmp dir: /tmp/autosklearn_tmp_13441_3621
Could not delete output dir: /tmp/autosklearn_output_5985_2598
Could not delete tmp dir: /tmp/autosklearn_tmp_5985_2598


In [121]:
# utility functions

def print_ensemble(ensemble, latex=False):
    delim = ","
    end = ""
    for weight, pipeline in ensemble:
        if latex:
            print("{} & {} \\\\".format(weight, 
                                        pipeline.configuration['classifier:__choice__']))
        else:
            print("Weight: {}, classifier: {}".format(weight, 
                                                      pipeline.configuration['classifier:__choice__']))

            
def print_metrics(test_y, predictions):
    """
    Micro statistics take global counts of TP, FP, etc
    Macro statistics take per class metrics and averages them (not accounting for class imbalance)
    Weighted statistics weight macro metrics by number of true examples in each class
    """
    print("Accuracy:", accuracy_score(test_y, predictions))
    
    # precision, recall, F1
    metrics = np.zeros((2,3))
    
    # micro
    avgs = ['macro', 'weighted']
    for i, avg in enumerate(avgs):
        metrics[i,0] = sk_metrics.precision_score(test_y, predictions, average=avg)
        metrics[i,1] = sk_metrics.recall_score(test_y, predictions, average=avg)
        metrics[i,2] = sk_metrics.f1_score(test_y, predictions, average=avg)

    metrics_df = pd.DataFrame(metrics, index=avgs, columns=['precision', 'recall', 'F1'])
    display(metrics_df)

# Model description

Models are trained using the out-of-the-box `AutoSklearnClassifier` provided by the `auto-sklearn` module. This is the scikit-learn implementation of the autoML system as described in "efficient and robust automated machine learning."

[Module documentation](https://automl.github.io/auto-sklearn/stable/index.html)

[AutoML paper](https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning)

# Top 5 contacts model

In [119]:
train_data = top_5_train.replace(replace_dict)
test_data = top_5_test.replace(replace_dict)

train_y = train_data['contact_type']
train_X = train_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)
test_y = test_data['contact_type']
test_X = test_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)

print("original shape %s" % Counter(train_y))

sm = SMOTE(random_state=rand_seed)

res_X, res_y = sm.fit_resample(train_X, train_y)

print("resampled shape %s" % Counter(res_y))

original shape Counter({1: 244, 3: 207, 6: 82, 4: 81, 2: 66, 0: 60})
resampled shape Counter({3: 244, 1: 244, 6: 244, 4: 244, 2: 244, 0: 244})


In [106]:
top_5_model.refit(train_X, train_y)

AutoSklearnClassifier(delete_output_folder_after_terminate=True,
           delete_tmp_folder_after_terminate=True,
           disable_evaluator_output=False, ensemble_memory_limit=1024,
           ensemble_nbest=50, ensemble_size=50, exclude_estimators=None,
           exclude_preprocessors=None, get_smac_object_callback=None,
           include_estimators=None, include_preprocessors=None,
           initial_configurations_via_metalearning=25, logging_config=None,
           ml_memory_limit=3072, output_folder=None,
           per_run_time_limit=360, resampling_strategy='holdout',
           resampling_strategy_arguments=None, seed=2, shared_mode=False,
           smac_scenario_args=None, time_left_for_this_task=3600,
           tmp_folder=None)

## Base results

Features:
- top 5 contacts per participant
- communication features as described in `feature_extract.ipynb`, with NaN indicators and imputation

Preprocessing: N/A

In [122]:
predictions = top_5_model.predict(test_X)
print_metrics(test_y, predictions)

# rows are truth, columns are predicted
top_5_confusion_mat = confusion_matrix(test_y, predictions)
top_5_confuse_df = pd.DataFrame(top_5_confusion_mat, index=contact_types, columns=["p_" + x for x in contact_types])
display(top_5_confuse_df)

Accuracy: 0.40476190476190477


Unnamed: 0,precision,recall,F1
macro,0.359986,0.321819,0.327312
weighted,0.381323,0.404762,0.380294


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,0,7,0,7,1,1
friend,1,46,2,19,0,3
task,1,3,11,3,0,1
family_live_separate,0,23,1,17,4,4
family_live_together,0,5,0,8,2,5
sig_other,0,12,0,13,1,9


## Resample results

Features:
- top 5 contacts per participant
- communication features as described in `feature_extract.ipynb`, with NaN indicators and imputation

Preprocessing: 
- SMOTE algorithm ([reference](https://imbalanced-learn.org/en/stable/generated/imblearn.over_sampling.SMOTE.html)) applied to balance training classes


In [109]:
top_5_model_res = pickle.load(open("bleen/top_5_oversample.automl", "rb"))
top_5_model_res.refit(res_X, res_y)

AutoSklearnClassifier(delete_output_folder_after_terminate=True,
           delete_tmp_folder_after_terminate=True,
           disable_evaluator_output=False, ensemble_memory_limit=1024,
           ensemble_nbest=50, ensemble_size=50, exclude_estimators=None,
           exclude_preprocessors=None, get_smac_object_callback=None,
           include_estimators=None, include_preprocessors=None,
           initial_configurations_via_metalearning=25, logging_config=None,
           ml_memory_limit=3072, output_folder=None,
           per_run_time_limit=360, resampling_strategy='holdout',
           resampling_strategy_arguments=None, seed=2, shared_mode=False,
           smac_scenario_args=None, time_left_for_this_task=3600,
           tmp_folder=None)

In [123]:
predictions = top_5_model_res.predict(test_X)
print_metrics(test_y, predictions)

# rows are truth, columns are predicted
top_5_confusion_mat = confusion_matrix(test_y, predictions)
top_5_confuse_df = pd.DataFrame(top_5_confusion_mat, index=contact_types, columns=["p_" + x for x in contact_types])
display(top_5_confuse_df)

Accuracy: 0.41904761904761906


Unnamed: 0,precision,recall,F1
macro,0.43303,0.410395,0.417029
weighted,0.439654,0.419048,0.424036


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,6,3,2,5,0,0
friend,6,35,2,23,2,3
task,1,1,13,3,1,0
family_live_separate,2,17,1,18,7,4
family_live_together,0,2,0,10,4,4
sig_other,1,8,1,9,4,12


## Contact EMA features results

Features:
- top 5 contacts per participant
- communication features as described in `feature_extract.ipynb`, with NaN indicators and imputation
- contact-based EMA responses per participant also included

Preprocessing: 
- SMOTE algorithm ([reference](https://imbalanced-learn.org/en/stable/generated/imblearn.over_sampling.SMOTE.html)) applied to balance training classes

In [124]:
top_5_emc_train_data = top_5_emc_train.replace(replace_dict)
top_5_emc_test_data = top_5_emc_test.replace(replace_dict)

train_y = top_5_emc_train_data['contact_type']
train_X = top_5_emc_train_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)
test_y = top_5_emc_test_data['contact_type']
test_X = top_5_emc_test_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)

print("original shape %s" % Counter(train_y))

sm = SMOTE(random_state=rand_seed)

res_X, res_y = sm.fit_resample(train_X, train_y)

print("resampled shape %s" % Counter(res_y))

original shape Counter({1: 244, 3: 207, 6: 82, 4: 81, 2: 66, 0: 60})
resampled shape Counter({3: 244, 1: 244, 6: 244, 4: 244, 2: 244, 0: 244})


In [23]:
top_5_model_emc = pickle.load(open("top_5_emc.automl", "rb"))
top_5_model_emc.refit(res_X, res_y)

AutoSklearnClassifier(delete_output_folder_after_terminate=True,
           delete_tmp_folder_after_terminate=True,
           disable_evaluator_output=False, ensemble_memory_limit=1024,
           ensemble_nbest=50, ensemble_size=50, exclude_estimators=None,
           exclude_preprocessors=None, get_smac_object_callback=None,
           include_estimators=None, include_preprocessors=None,
           initial_configurations_via_metalearning=25, logging_config=None,
           ml_memory_limit=3072, output_folder=None,
           per_run_time_limit=360, resampling_strategy='holdout',
           resampling_strategy_arguments=None, seed=2, shared_mode=False,
           smac_scenario_args=None, time_left_for_this_task=3600,
           tmp_folder=None)

In [126]:
predictions = top_5_model_emc.predict(test_X)
print_metrics(test_y, predictions)

# rows are truth, columns are predicted
top_5_confusion_mat = confusion_matrix(test_y, predictions)
top_5_confuse_df = pd.DataFrame(top_5_confusion_mat, index=contact_types, columns=["p_" + x for x in contact_types])
display(top_5_confuse_df)

Accuracy: 0.5095238095238095


Unnamed: 0,precision,recall,F1
macro,0.491516,0.499012,0.486929
weighted,0.518949,0.509524,0.508115


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,5,5,6,0,0,0
friend,4,43,2,15,6,1
task,0,1,18,0,0,0
family_live_separate,2,12,0,21,9,5
family_live_together,0,2,0,8,6,4
sig_other,1,4,0,12,4,14


In [79]:
top_5_emc_ensemble = top_5_model_emc.get_models_with_weights()
print_ensemble(top_5_emc_ensemble)

Weight: 0.28, classifier: lda
Weight: 0.24, classifier: xgradient_boosting
Weight: 0.12, classifier: lda
Weight: 0.06, classifier: extra_trees
Weight: 0.06, classifier: extra_trees
Weight: 0.04, classifier: gradient_boosting
Weight: 0.04, classifier: gradient_boosting
Weight: 0.04, classifier: random_forest
Weight: 0.04, classifier: lda
Weight: 0.02, classifier: random_forest
Weight: 0.02, classifier: lda
Weight: 0.02, classifier: gradient_boosting
Weight: 0.02, classifier: lda


# Top 10 contacts model

In [128]:
train_data = top_10_train.replace(replace_dict)
test_data = top_10_test.replace(replace_dict)

train_y = train_data['contact_type']
train_X = train_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)
test_y = test_data['contact_type']
test_X = test_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)

In [115]:
top_10_model.refit(train_X, train_y)

AutoSklearnClassifier(delete_output_folder_after_terminate=True,
           delete_tmp_folder_after_terminate=True,
           disable_evaluator_output=False, ensemble_memory_limit=1024,
           ensemble_nbest=50, ensemble_size=50, exclude_estimators=None,
           exclude_preprocessors=None, get_smac_object_callback=None,
           include_estimators=None, include_preprocessors=None,
           initial_configurations_via_metalearning=25, logging_config=None,
           ml_memory_limit=3072, output_folder=None,
           per_run_time_limit=360, resampling_strategy='holdout',
           resampling_strategy_arguments=None, seed=2, shared_mode=False,
           smac_scenario_args=None, time_left_for_this_task=3600,
           tmp_folder=None)

## Base results

Features:
- top 10 contacts per participant
- communication features as described in `feature_extract.ipynb`, with NaN indicators and imputation

Preprocessing: N/A

In [130]:
predictions = top_10_model.predict(test_X)
print_metrics(test_y, predictions)

# rows are truth, columns are predicted
top_10_confusion_mat = confusion_matrix(test_y, predictions)
top_10_confuse_df = pd.DataFrame(top_10_confusion_mat, index=contact_types, columns=["p_" + x for x in contact_types])
display(top_10_confuse_df)

Accuracy: 0.44285714285714284


Unnamed: 0,precision,recall,F1
macro,0.422541,0.354095,0.351753
weighted,0.436958,0.442857,0.409587


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,5,29,4,2,1,3
friend,2,93,7,25,2,2
task,2,15,44,2,0,0
family_live_separate,0,53,11,32,9,5
family_live_together,0,7,1,13,3,4
sig_other,0,15,4,12,4,9


## Resample results

Features:
- top 10 contacts per participant
- communication features as described in `feature_extract.ipynb`, with NaN indicators and imputation

Preprocessing: 
- SMOTE algorithm ([reference](https://imbalanced-learn.org/en/stable/generated/imblearn.over_sampling.SMOTE.html)) applied to balance training classes


In [132]:
top_10_model_res = pickle.load(open("bleen/top_10_oversample.automl", "rb"))
top_10_model_res.refit(train_X, train_y)

Could not delete output dir: /tmp/autosklearn_output_30749_5904
Could not delete tmp dir: /tmp/autosklearn_tmp_30749_5904


AutoSklearnClassifier(delete_output_folder_after_terminate=True,
           delete_tmp_folder_after_terminate=True,
           disable_evaluator_output=False, ensemble_memory_limit=1024,
           ensemble_nbest=50, ensemble_size=50, exclude_estimators=None,
           exclude_preprocessors=None, get_smac_object_callback=None,
           include_estimators=None, include_preprocessors=None,
           initial_configurations_via_metalearning=25, logging_config=None,
           ml_memory_limit=3072, output_folder=None,
           per_run_time_limit=360, resampling_strategy='holdout',
           resampling_strategy_arguments=None, seed=2, shared_mode=False,
           smac_scenario_args=None, time_left_for_this_task=3600,
           tmp_folder=None)

In [134]:
predictions = top_10_model_res.predict(test_X)
print_metrics(test_y, predictions)

# rows are truth, columns are predicted
top_10_confusion_mat = confusion_matrix(test_y, predictions)
top_10_confuse_df = pd.DataFrame(top_10_confusion_mat, index=contact_types, columns=["p_" + x for x in contact_types])
display(top_10_confuse_df)

Accuracy: 0.4380952380952381


Unnamed: 0,precision,recall,F1
macro,0.415804,0.361868,0.373656
weighted,0.438155,0.438095,0.425892


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,9,21,4,7,1,2
friend,5,81,5,30,8,2
task,4,11,37,10,0,1
family_live_separate,3,41,8,43,10,5
family_live_together,0,9,1,12,4,2
sig_other,0,16,4,10,4,10


## Contact EMA features results

In [136]:
top_10_emc_train_data = top_10_emc_train.replace(replace_dict)
top_10_emc_test_data = top_10_emc_test.replace(replace_dict)

train_y = top_10_emc_train_data['contact_type']
train_X = top_10_emc_train_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)
test_y = top_10_emc_test_data['contact_type']
test_X = top_10_emc_test_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)

print("original shape %s" % Counter(train_y))

sm = SMOTE(random_state=rand_seed)

res_X, res_y = sm.fit_resample(train_X, train_y)

print("resampled shape %s" % Counter(res_y))

original shape Counter({1: 484, 3: 392, 2: 196, 0: 165, 4: 133, 6: 110})
resampled shape Counter({3: 484, 1: 484, 6: 484, 0: 484, 4: 484, 2: 484})


In [29]:
top_10_model_emc = pickle.load(open("top_10_emc.automl", "rb"))
top_10_model_emc.refit(res_X, res_y)

AutoSklearnClassifier(delete_output_folder_after_terminate=True,
           delete_tmp_folder_after_terminate=True,
           disable_evaluator_output=False, ensemble_memory_limit=1024,
           ensemble_nbest=50, ensemble_size=50, exclude_estimators=None,
           exclude_preprocessors=None, get_smac_object_callback=None,
           include_estimators=None, include_preprocessors=None,
           initial_configurations_via_metalearning=25, logging_config=None,
           ml_memory_limit=3072, output_folder=None,
           per_run_time_limit=360, resampling_strategy='holdout',
           resampling_strategy_arguments=None, seed=2, shared_mode=False,
           smac_scenario_args=None, time_left_for_this_task=3600,
           tmp_folder=None)

In [80]:
top_10_ensemble = top_10_model_emc.get_models_with_weights()
print_ensemble(top_10_ensemble)

Weight: 0.32, classifier: lda
Weight: 0.14, classifier: lda
Weight: 0.1, classifier: lda
Weight: 0.06, classifier: random_forest
Weight: 0.04, classifier: gradient_boosting
Weight: 0.04, classifier: gradient_boosting
Weight: 0.04, classifier: libsvm_svc
Weight: 0.04, classifier: gradient_boosting
Weight: 0.04, classifier: libsvm_svc
Weight: 0.04, classifier: libsvm_svc
Weight: 0.02, classifier: gradient_boosting
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: lda
Weight: 0.02, classifier: gradient_boosting
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: decision_tree


In [138]:
predictions = top_10_model_emc.predict(test_X)
print_metrics(test_y, predictions)

# rows are truth, columns are predicted
top_5_confusion_mat = confusion_matrix(test_y, predictions)
top_5_confuse_df = pd.DataFrame(top_5_confusion_mat, index=contact_types, columns=["p_" + x for x in contact_types])
display(top_5_confuse_df)

Accuracy: 0.5


Unnamed: 0,precision,recall,F1
macro,0.460921,0.449162,0.449866
weighted,0.490539,0.5,0.491164


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,13,16,12,3,0,0
friend,13,79,4,27,6,2
task,5,2,53,3,0,0
family_live_separate,8,36,4,44,12,6
family_live_together,0,4,0,13,6,5
sig_other,0,13,1,12,3,15


In [68]:
probabilities = top_10_model_emc.predict_proba(test_X)

In [78]:
top_2_predictions = np.argpartition(probabilities, -2, axis=1)[:, -2:]