<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [34]:
%%javascript
// ToC script: downloads the table-of-contents helper from the URL below and executes it.
// NOTE(review): this runs remote third-party code each time the cell executes and needs
// network access plus a global jQuery `$`; presumably it fills the `#toc` div — confirm.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [30]:
# Code hider, source: http://chris-said.io/2016/02/13/how-to-make-polished-jupyter-presentations-with-optional-code-visibility/
from IPython.display import HTML

# The HTML(...) call is the last expression of the cell, so the notebook renders it.
# The embedded script (which relies on jQuery's `$` being present on the page):
#   * hides every `div.input` element once the document is ready and sets the
#     implicit global `code_shown` to false;
#   * defines code_toggle(), which shows/hides those elements, flips `code_shown`
#     and relabels the button between 'Show Code' and 'Hide Code'.
# The form at the bottom renders the button and calls code_toggle() on submit.
HTML('''
<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }

  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>
''')

In [1]:
%matplotlib inline

from collections import Counter
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from autosklearn.classification import AutoSklearnClassifier
import autosklearn.metrics as auto_metrics
import sklearn.metrics as sk_metrics
from sklearn.metrics import accuracy_score, confusion_matrix

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides warnings that may matter (e.g. deprecations raised by
# the libraries imported above) — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')


  self.re = re.compile( self.reString )


In [7]:
# load models and data

#top_5_age_gender_model = pickle.load(open("quadcorn/rd1/top_5_contact_type_baseline.automl", "rb"))
#top_5_demo_model = pickle.load(open("quadcorn/rd1/top_5_contact_type_baseline.automl", "rb"))
#top_5_all_model = pickle.load(open("quadcorn/rd1/top_5_contact_type_baseline.automl", "rb"))


#top_5_baseline_train = pickle.load(open('../data/top_5_baseline_train_features.df', 'rb'))
#top_5_baseline_test =  pickle.load(open('../data/top_5_baseline_test_features.df', 'rb'))

#top_5_loc_train = pickle.load(open('../data/top_5_loc_train_features.df', 'rb'))
#top_5_loc_test =  pickle.load(open('../data/top_5_loc_test_features.df', 'rb'))

# Contact-type labels, listed in the order of their integer codes (0..5).
contact_types = [
    "work",
    "friend",
    "task",
    "family_live_separate",
    "family_live_together",
    "sig_other",
]

# Nested mapping in the {column: {old_value: new_value}} shape, used to turn the
# string labels of the 'contact_type' column into integer class codes.
replace_dict = {
    'contact_type': {label: code for code, label in enumerate(contact_types)}
}

rand_seed = 2

In [8]:
# utility functions

def print_ensemble(ensemble, latex=False):
    """
    Print one line per member of an ensemble.

    ensemble: iterable of (weight, pipeline) pairs; every pipeline must expose
        a `configuration` mapping containing the key 'classifier:__choice__'.
    latex: when True, each line is a LaTeX table row ("weight & classifier"
        followed by a double backslash); otherwise a plain-text
        "Weight: ..., classifier: ..." line is printed.

    Returns None; an empty ensemble prints nothing.
    """
    # The output format does not change between members, so pick it once.
    row_format = "{} & {} \\\\" if latex else "Weight: {}, classifier: {}"
    for weight, pipeline in ensemble:
        print(row_format.format(weight, pipeline.configuration['classifier:__choice__']))

            
def print_metrics(test_y, predictions):
    """
    Print accuracy, then display averaged precision/recall/F1 and a confusion matrix.

    Macro statistics take per class metrics and average them (not accounting for class imbalance)
    Weighted statistics weight macro metrics by number of true examples in each class
    (Micro statistics, which take global counts of TP, FP, etc, are not reported.)

    test_y: true class labels.
    predictions: predicted class labels, aligned with test_y.

    Both are assumed to hold the integer codes 0..len(contact_types)-1, in the
    same order as `contact_types` — TODO confirm against the label encoding.
    """
    print("Accuracy:", accuracy_score(test_y, predictions))

    # rows: averaging scheme; columns: precision, recall, F1
    avgs = ['macro', 'weighted']
    metrics = np.zeros((len(avgs), 3))
    for i, avg in enumerate(avgs):
        metrics[i, 0] = sk_metrics.precision_score(test_y, predictions, average=avg)
        metrics[i, 1] = sk_metrics.recall_score(test_y, predictions, average=avg)
        metrics[i, 2] = sk_metrics.f1_score(test_y, predictions, average=avg)

    metrics_df = pd.DataFrame(metrics, index=avgs, columns=['precision', 'recall', 'F1'])
    display(metrics_df)

    # Pin the label order explicitly: without `labels`, a class that appears in
    # neither test_y nor predictions is dropped from the matrix, and the shape no
    # longer matches the len(contact_types) row/column names used below.
    class_codes = list(range(len(contact_types)))
    confusion_mat = confusion_matrix(test_y, predictions, labels=class_codes)
    confuse_df = pd.DataFrame(confusion_mat, index=contact_types, columns=["p_" + x for x in contact_types])
    display(confuse_df)

# Model description

Models are trained using the out-of-the-box `AutoSklearnClassifier` provided by the `auto-sklearn` module. This is the scikit-learn-based implementation of the AutoML system described in the paper "Efficient and Robust Automated Machine Learning."

[Module documentation](https://automl.github.io/auto-sklearn/stable/index.html)

[AutoML paper](https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning)

# Baseline Random Forest

## Communication features only

In [16]:
# TODO switch to top 5
# NOTE(review): the paths below already reference top_5 artifacts — confirm whether this TODO is stale.

# Load the pickled model (.automl) and its predictions (.predict); the context
# managers make sure each file handle is closed after unpickling.
# pickle.load can execute arbitrary code: only load artifacts from a trusted source.
with open('rand_forest_top_5_baseline_contact_type.automl', 'rb') as fh:
    rand_forest_top_5_baseline_model = pickle.load(fh)
with open('rand_forest_top_5_baseline_contact_type.predict', 'rb') as fh:
    rand_forest_top_5_baseline_pred = pickle.load(fh)

In [17]:
# Load the pickled train/test feature tables, closing each file once it has been read.
with open('../data/top_5_baseline_train_features.df', 'rb') as fh:
    top_5_baseline_train = pickle.load(fh)
with open('../data/top_5_baseline_test_features.df', 'rb') as fh:
    top_5_baseline_test = pickle.load(fh)

# Apply the replace_dict value mapping (contact_type strings -> integer codes).
train_data = top_5_baseline_train.replace(replace_dict)
test_data = top_5_baseline_test.replace(replace_dict)

# Separate the label column from the rest; 'pid' and 'combined_hash' are dropped
# as well (presumably identifiers rather than features — confirm).
train_y = train_data['contact_type']
train_X = train_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)
test_y = test_data['contact_type']
test_X = test_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)

In [18]:
# List the (weight, pipeline) members of the loaded model, then score its
# stored predictions against the test labels.
top_5_baseline_ensemble = rand_forest_top_5_baseline_model.get_models_with_weights()
print_ensemble(ensemble=top_5_baseline_ensemble)
print()
print_metrics(test_y=test_y, predictions=rand_forest_top_5_baseline_pred)

Weight: 1.0, classifier: random_forest

Accuracy: 0.4


Unnamed: 0,precision,recall,F1
macro,0.315079,0.299408,0.289892
weighted,0.353976,0.4,0.355926


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,0,9,0,7,0,0
friend,0,47,1,22,0,1
task,0,6,10,3,0,0
family_live_separate,0,24,1,20,0,4
family_live_together,0,6,1,10,0,3
sig_other,0,15,1,12,0,7


## Communication + age/sex

In [19]:
# TODO switch to top 5
# NOTE(review): the paths below already reference top_5 artifacts — confirm whether this TODO is stale.

# Load the pickled model (.automl) and its predictions (.predict); the context
# managers make sure each file handle is closed after unpickling.
# pickle.load can execute arbitrary code: only load artifacts from a trusted source.
with open('rand_forest_top_5_age_gender_contact_type.automl', 'rb') as fh:
    rand_forest_top_5_age_gender_model = pickle.load(fh)
with open('rand_forest_top_5_age_gender_contact_type.predict', 'rb') as fh:
    rand_forest_top_5_age_gender_pred = pickle.load(fh)

In [20]:
# Load the pickled train/test feature tables, closing each file once it has been read.
with open('../data/top_5_age_gender_train_features.df', 'rb') as fh:
    top_5_age_gender_train = pickle.load(fh)
with open('../data/top_5_age_gender_test_features.df', 'rb') as fh:
    top_5_age_gender_test = pickle.load(fh)

# Apply the replace_dict value mapping (contact_type strings -> integer codes).
train_data = top_5_age_gender_train.replace(replace_dict)
test_data = top_5_age_gender_test.replace(replace_dict)

# Separate the label column from the rest; 'pid' and 'combined_hash' are dropped
# as well (presumably identifiers rather than features — confirm).
train_y = train_data['contact_type']
train_X = train_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)
test_y = test_data['contact_type']
test_X = test_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)

In [21]:
# List the (weight, pipeline) members of the loaded model, then score its
# stored predictions against the test labels.
top_5_age_gender_ensemble = rand_forest_top_5_age_gender_model.get_models_with_weights()
print_ensemble(ensemble=top_5_age_gender_ensemble)
print()
print_metrics(test_y=test_y, predictions=rand_forest_top_5_age_gender_pred)

Weight: 1.0, classifier: random_forest

Accuracy: 0.44761904761904764


Unnamed: 0,precision,recall,F1
macro,0.435293,0.354305,0.362695
weighted,0.440355,0.447619,0.412347


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,0,9,0,7,0,0
friend,0,51,1,19,0,0
task,0,5,11,3,0,0
family_live_separate,0,23,0,21,1,4
family_live_together,0,7,1,6,4,2
sig_other,0,16,1,8,3,7


## Communication + demographics

In [22]:
# TODO switch to top 5
# NOTE(review): the paths below already reference top_5 artifacts — confirm whether this TODO is stale.

# Load the pickled model (.automl) and its predictions (.predict); the context
# managers make sure each file handle is closed after unpickling.
# pickle.load can execute arbitrary code: only load artifacts from a trusted source.
with open('rand_forest_top_5_demo_contact_type.automl', 'rb') as fh:
    rand_forest_top_5_demo_model = pickle.load(fh)
with open('rand_forest_top_5_demo_contact_type.predict', 'rb') as fh:
    rand_forest_top_5_demo_pred = pickle.load(fh)

In [23]:
# Load the pickled train/test feature tables, closing each file once it has been read.
with open('../data/top_5_demo_train_features.df', 'rb') as fh:
    top_5_demo_train = pickle.load(fh)
with open('../data/top_5_demo_test_features.df', 'rb') as fh:
    top_5_demo_test = pickle.load(fh)

# Apply the replace_dict value mapping (contact_type strings -> integer codes).
train_data = top_5_demo_train.replace(replace_dict)
test_data = top_5_demo_test.replace(replace_dict)

# Separate the label column from the rest; 'pid' and 'combined_hash' are dropped
# as well (presumably identifiers rather than features — confirm).
train_y = train_data['contact_type']
train_X = train_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)
test_y = test_data['contact_type']
test_X = test_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)

In [24]:
# List the (weight, pipeline) members of the loaded model, then score its
# stored predictions against the test labels.
top_5_demo_ensemble = rand_forest_top_5_demo_model.get_models_with_weights()
print_ensemble(ensemble=top_5_demo_ensemble)
print()
print_metrics(test_y=test_y, predictions=rand_forest_top_5_demo_pred)

Weight: 1.0, classifier: random_forest

Accuracy: 0.44761904761904764


Unnamed: 0,precision,recall,F1
macro,0.427381,0.332982,0.329069
weighted,0.445588,0.447619,0.390508


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,0,13,0,3,0,0
friend,0,58,1,12,0,0
task,0,7,11,1,0,0
family_live_separate,0,26,0,19,3,1
family_live_together,0,7,0,10,2,1
sig_other,0,18,1,9,3,4


## Communication + location

In [25]:
# TODO switch to top 5
# NOTE(review): the paths below already reference top_5 artifacts — confirm whether this TODO is stale.

# Load the pickled model (.automl) and its predictions (.predict); the context
# managers make sure each file handle is closed after unpickling.
# pickle.load can execute arbitrary code: only load artifacts from a trusted source.
with open('rand_forest_top_5_loc_contact_type.automl', 'rb') as fh:
    rand_forest_top_5_loc_model = pickle.load(fh)
with open('rand_forest_top_5_loc_contact_type.predict', 'rb') as fh:
    rand_forest_top_5_loc_pred = pickle.load(fh)

In [26]:
# Load the pickled train/test feature tables, closing each file once it has been read.
with open('../data/top_5_loc_train_features.df', 'rb') as fh:
    top_5_loc_train = pickle.load(fh)
with open('../data/top_5_loc_test_features.df', 'rb') as fh:
    top_5_loc_test = pickle.load(fh)

# Apply the replace_dict value mapping (contact_type strings -> integer codes).
train_data = top_5_loc_train.replace(replace_dict)
test_data = top_5_loc_test.replace(replace_dict)

# Separate the label column from the rest; 'pid' and 'combined_hash' are dropped
# as well (presumably identifiers rather than features — confirm).
train_y = train_data['contact_type']
train_X = train_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)
test_y = test_data['contact_type']
test_X = test_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)

In [27]:
# List the (weight, pipeline) members of the loaded model, then score its
# stored predictions against the test labels.
top_5_loc_ensemble = rand_forest_top_5_loc_model.get_models_with_weights()
print_ensemble(ensemble=top_5_loc_ensemble)
print()
print_metrics(test_y=test_y, predictions=rand_forest_top_5_loc_pred)

Weight: 1.0, classifier: random_forest

Accuracy: 0.4523809523809524


Unnamed: 0,precision,recall,F1
macro,0.351493,0.325537,0.317852
weighted,0.392333,0.452381,0.396365


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,0,11,0,5,0,0
friend,0,59,1,10,0,1
task,0,4,10,5,0,0
family_live_separate,0,27,0,18,1,3
family_live_together,0,6,0,9,0,5
sig_other,0,15,1,9,2,8


## All features

In [28]:
# TODO switch to top 5
# NOTE(review): the paths below already reference top_5 artifacts — confirm whether this TODO is stale.

# Load the pickled model (.automl) and its predictions (.predict); the context
# managers make sure each file handle is closed after unpickling.
# pickle.load can execute arbitrary code: only load artifacts from a trusted source.
with open('rand_forest_top_5_all_contact_type.automl', 'rb') as fh:
    rand_forest_top_5_all_model = pickle.load(fh)
with open('rand_forest_top_5_all_contact_type.predict', 'rb') as fh:
    rand_forest_top_5_all_pred = pickle.load(fh)

In [29]:
# Load the pickled train/test feature tables, closing each file once it has been read.
with open('../data/top_5_all_train_features.df', 'rb') as fh:
    top_5_all_train = pickle.load(fh)
with open('../data/top_5_all_test_features.df', 'rb') as fh:
    top_5_all_test = pickle.load(fh)

# Apply the replace_dict value mapping (contact_type strings -> integer codes).
train_data = top_5_all_train.replace(replace_dict)
test_data = top_5_all_test.replace(replace_dict)

# Separate the label column from the rest; 'pid' and 'combined_hash' are dropped
# as well (presumably identifiers rather than features — confirm).
train_y = train_data['contact_type']
train_X = train_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)
test_y = test_data['contact_type']
test_X = test_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)

In [30]:
# List the (weight, pipeline) members of the loaded model, then score its
# stored predictions against the test labels.
top_5_all_ensemble = rand_forest_top_5_all_model.get_models_with_weights()
print_ensemble(ensemble=top_5_all_ensemble)
print()
print_metrics(test_y=test_y, predictions=rand_forest_top_5_all_pred)

Weight: 1.0, classifier: random_forest

Accuracy: 0.43333333333333335


Unnamed: 0,precision,recall,F1
macro,0.356008,0.373314,0.357717
weighted,0.399424,0.433333,0.410575


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,1,10,1,4,0,0
friend,4,47,1,12,2,5
task,0,4,14,1,0,0
family_live_separate,2,17,4,14,4,8
family_live_together,0,3,3,3,3,8
sig_other,0,8,2,7,6,12


# Top 5 contacts model

## Baseline communication features

Features:
- top 5 contacts per participant
- communication features as described in `feature_extract.ipynb`, with NaN indicators and imputation

Preprocessing: N/A

In [13]:
# Load the pickled model (.automl) and its predictions (.predict); the context
# managers make sure each file handle is closed after unpickling.
# pickle.load can execute arbitrary code: only load artifacts from a trusted source.
with open("quadcorn/top_5_contact_type_baseline_long.automl", "rb") as fh:
    top_5_baseline_model = pickle.load(fh)
with open("quadcorn/top_5_contact_type_baseline_long.predict", "rb") as fh:
    top_5_baseline_pred = pickle.load(fh)

In [14]:
# Load the pickled train/test feature tables, closing each file once it has been read.
with open('../data/top_5_baseline_train_features.df', 'rb') as fh:
    top_5_baseline_train = pickle.load(fh)
with open('../data/top_5_baseline_test_features.df', 'rb') as fh:
    top_5_baseline_test = pickle.load(fh)

# Apply the replace_dict value mapping (contact_type strings -> integer codes).
train_data = top_5_baseline_train.replace(replace_dict)
test_data = top_5_baseline_test.replace(replace_dict)

# Separate the label column from the rest; 'pid' and 'combined_hash' are dropped
# as well (presumably identifiers rather than features — confirm).
train_y = train_data['contact_type']
train_X = train_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)
test_y = test_data['contact_type']
test_X = test_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)

In [15]:
# List the (weight, pipeline) members of the loaded model, then score its
# stored predictions against the test labels.
top_5_baseline_ensemble = top_5_baseline_model.get_models_with_weights()
print_ensemble(ensemble=top_5_baseline_ensemble)
print()
print_metrics(test_y=test_y, predictions=top_5_baseline_pred)

Weight: 0.16, classifier: extra_trees
Weight: 0.08, classifier: extra_trees
Weight: 0.06, classifier: extra_trees
Weight: 0.06, classifier: extra_trees
Weight: 0.06, classifier: extra_trees
Weight: 0.04, classifier: extra_trees
Weight: 0.04, classifier: extra_trees
Weight: 0.04, classifier: extra_trees
Weight: 0.04, classifier: extra_trees
Weight: 0.04, classifier: extra_trees
Weight: 0.04, classifier: extra_trees
Weight: 0.04, classifier: extra_trees
Weight: 0.04, classifier: extra_trees
Weight: 0.04, classifier: extra_trees
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: extra_trees

Accuracy: 0.4666666666666667


Unnamed: 0,precision,recall,F1
macro,0.508444,0.353287,0.349512
weighted,0.486977,0.466667,0.4224


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,0,10,0,6,0,0
friend,0,56,1,14,0,0
task,0,4,11,4,0,0
family_live_separate,0,21,0,19,0,9
family_live_together,0,3,1,10,1,5
sig_other,0,13,1,10,0,11


## Communication + age/sex

In [11]:
# Load the pickled model (.automl) and its predictions (.predict); the context
# managers make sure each file handle is closed after unpickling.
# pickle.load can execute arbitrary code: only load artifacts from a trusted source.
with open("results/top_5_contact_type_age_gender_long.automl", "rb") as fh:
    top_5_age_gender_model = pickle.load(fh)
with open("results/top_5_contact_type_age_gender_long.predict", "rb") as fh:
    top_5_age_gender_pred = pickle.load(fh)

In [12]:
# Load the pickled train/test feature tables, closing each file once it has been read.
with open('../data/top_5_age_gender_train_features.df', 'rb') as fh:
    top_5_age_gender_train = pickle.load(fh)
with open('../data/top_5_age_gender_test_features.df', 'rb') as fh:
    top_5_age_gender_test = pickle.load(fh)

# Apply the replace_dict value mapping (contact_type strings -> integer codes).
train_data = top_5_age_gender_train.replace(replace_dict)
test_data = top_5_age_gender_test.replace(replace_dict)

# Separate the label column from the rest; 'pid' and 'combined_hash' are dropped
# as well (presumably identifiers rather than features — confirm).
train_y = train_data['contact_type']
train_X = train_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)
test_y = test_data['contact_type']
test_X = test_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)

In [13]:
# List the (weight, pipeline) members of the loaded model, then score its
# stored predictions against the test labels.
top_5_age_gender_ensemble = top_5_age_gender_model.get_models_with_weights()
print_ensemble(ensemble=top_5_age_gender_ensemble)
print()
print_metrics(test_y=test_y, predictions=top_5_age_gender_pred)

Weight: 0.14, classifier: random_forest
Weight: 0.12, classifier: gradient_boosting
Weight: 0.1, classifier: extra_trees
Weight: 0.08, classifier: liblinear_svc
Weight: 0.06, classifier: extra_trees
Weight: 0.06, classifier: random_forest
Weight: 0.06, classifier: random_forest
Weight: 0.04, classifier: gradient_boosting
Weight: 0.04, classifier: adaboost
Weight: 0.04, classifier: gradient_boosting
Weight: 0.04, classifier: random_forest
Weight: 0.04, classifier: random_forest
Weight: 0.04, classifier: random_forest
Weight: 0.04, classifier: random_forest
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: liblinear_svc
Weight: 0.02, classifier: liblinear_svc
Weight: 0.02, classifier: liblinear_svc
Weight: 0.02, classifier: k_nearest_neighbors

Accuracy: 0.4380952380952381


Unnamed: 0,precision,recall,F1
macro,0.446364,0.332574,0.337835
weighted,0.442495,0.438095,0.400363


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,0,10,0,6,0,0
friend,0,52,1,18,0,0
task,0,5,10,4,0,0
family_live_separate,0,23,0,20,1,5
family_live_together,0,2,1,11,2,4
sig_other,0,15,1,11,0,8


## Communication + demographics

In [18]:
# Load the pickled model (.automl) and its predictions (.predict); the context
# managers make sure each file handle is closed after unpickling.
# pickle.load can execute arbitrary code: only load artifacts from a trusted source.
with open("results/top_5_contact_type_demo_long.automl", "rb") as fh:
    top_5_demo_model = pickle.load(fh)
with open("results/top_5_contact_type_demo_long.predict", "rb") as fh:
    top_5_demo_pred = pickle.load(fh)

In [19]:
# Load the pickled train/test feature tables, closing each file once it has been read.
with open('../data/top_5_demo_train_features.df', 'rb') as fh:
    top_5_demo_train = pickle.load(fh)
with open('../data/top_5_demo_test_features.df', 'rb') as fh:
    top_5_demo_test = pickle.load(fh)

# Apply the replace_dict value mapping (contact_type strings -> integer codes).
train_data = top_5_demo_train.replace(replace_dict)
test_data = top_5_demo_test.replace(replace_dict)

# Separate the label column from the rest; 'pid' and 'combined_hash' are dropped
# as well (presumably identifiers rather than features — confirm).
train_y = train_data['contact_type']
train_X = train_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)
test_y = test_data['contact_type']
test_X = test_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)

In [20]:
# List the (weight, pipeline) members of the loaded model, then score its
# stored predictions against the test labels.
top_5_demo_ensemble = top_5_demo_model.get_models_with_weights()
print_ensemble(ensemble=top_5_demo_ensemble)
print()
print_metrics(test_y=test_y, predictions=top_5_demo_pred)

Weight: 0.78, classifier: adaboost
Weight: 0.04, classifier: gradient_boosting
Weight: 0.04, classifier: random_forest
Weight: 0.04, classifier: liblinear_svc
Weight: 0.02, classifier: gradient_boosting
Weight: 0.02, classifier: gradient_boosting
Weight: 0.02, classifier: gradient_boosting
Weight: 0.02, classifier: gradient_boosting
Weight: 0.02, classifier: gradient_boosting

Accuracy: 0.4666666666666667


Unnamed: 0,precision,recall,F1
macro,0.518572,0.384015,0.403317
weighted,0.485985,0.466667,0.44745


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,2,9,1,4,0,0
friend,1,50,1,18,0,1
task,0,4,11,4,0,0
family_live_separate,0,21,0,18,4,6
family_live_together,0,4,1,9,2,4
sig_other,0,9,1,10,0,15


## Communication + location features

In [31]:
# Load the pickled model (.automl) and its predictions (.predict); the context
# managers make sure each file handle is closed after unpickling.
# pickle.load can execute arbitrary code: only load artifacts from a trusted source.
with open("quadcorn/top_5_contact_type_loc_long.automl", "rb") as fh:
    top_5_loc_model = pickle.load(fh)
with open("quadcorn/top_5_contact_type_loc_long.predict", "rb") as fh:
    top_5_loc_pred = pickle.load(fh)

Could not delete output dir: /tmp/autosklearn_output_785_6235
Could not delete tmp dir: /tmp/autosklearn_tmp_785_6235


In [32]:
# Load the pickled train/test feature tables, closing each file once it has been read.
with open('../data/top_5_loc_train_features.df', 'rb') as fh:
    top_5_loc_train = pickle.load(fh)
with open('../data/top_5_loc_test_features.df', 'rb') as fh:
    top_5_loc_test = pickle.load(fh)

# Apply the replace_dict value mapping (contact_type strings -> integer codes).
train_data = top_5_loc_train.replace(replace_dict)
test_data = top_5_loc_test.replace(replace_dict)

# Separate the label column from the rest; 'pid' and 'combined_hash' are dropped
# as well (presumably identifiers rather than features — confirm).
train_y = train_data['contact_type']
train_X = train_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)
test_y = test_data['contact_type']
test_X = test_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)

In [33]:
# List the (weight, pipeline) members of the loaded model, then score its
# stored predictions against the test labels.
top_5_loc_ensemble = top_5_loc_model.get_models_with_weights()
print_ensemble(ensemble=top_5_loc_ensemble)
print()
print_metrics(test_y=test_y, predictions=top_5_loc_pred)

Weight: 0.22000000000000003, classifier: adaboost
Weight: 0.18000000000000002, classifier: adaboost
Weight: 0.12000000000000001, classifier: xgradient_boosting
Weight: 0.10000000000000002, classifier: adaboost
Weight: 0.08000000000000002, classifier: adaboost
Weight: 0.060000000000000005, classifier: xgradient_boosting
Weight: 0.060000000000000005, classifier: adaboost
Weight: 0.060000000000000005, classifier: xgradient_boosting
Weight: 0.04000000000000001, classifier: xgradient_boosting
Weight: 0.020000000000000004, classifier: extra_trees
Weight: 0.020000000000000004, classifier: xgradient_boosting
Weight: 0.020000000000000004, classifier: xgradient_boosting
Weight: 0.020000000000000004, classifier: xgradient_boosting

Accuracy: 0.44285714285714284


Unnamed: 0,precision,recall,F1
macro,0.391041,0.313059,0.307652
weighted,0.410745,0.442857,0.387295


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,0,11,0,5,0,0
friend,0,55,1,13,0,2
task,0,8,8,3,0,0
family_live_separate,0,22,0,24,0,3
family_live_together,0,6,0,10,1,3
sig_other,0,15,1,12,2,5


## All features

In [34]:
# Load the pickled model (.automl) and its predictions (.predict); the context
# managers make sure each file handle is closed after unpickling.
# pickle.load can execute arbitrary code: only load artifacts from a trusted source.
with open("quadcorn/top_5_contact_type_all_long.automl", "rb") as fh:
    top_5_all_model = pickle.load(fh)
with open("quadcorn/top_5_contact_type_all_long.predict", "rb") as fh:
    top_5_all_pred = pickle.load(fh)

Could not delete output dir: /tmp/autosklearn_output_626_3378
Could not delete tmp dir: /tmp/autosklearn_tmp_626_3378


In [35]:
# Load the pickled train/test feature tables, closing each file once it has been read.
with open('../data/top_5_all_train_features.df', 'rb') as fh:
    top_5_all_train = pickle.load(fh)
with open('../data/top_5_all_test_features.df', 'rb') as fh:
    top_5_all_test = pickle.load(fh)

# Apply the replace_dict value mapping (contact_type strings -> integer codes).
train_data = top_5_all_train.replace(replace_dict)
test_data = top_5_all_test.replace(replace_dict)

# Separate the label column from the rest; 'pid' and 'combined_hash' are dropped
# as well (presumably identifiers rather than features — confirm).
train_y = train_data['contact_type']
train_X = train_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)
test_y = test_data['contact_type']
test_X = test_data.drop(['contact_type', 'pid', 'combined_hash'], axis=1)

In [36]:
# List the (weight, pipeline) members of the loaded model, then score its
# stored predictions against the test labels.
top_5_all_ensemble = top_5_all_model.get_models_with_weights()
print_ensemble(ensemble=top_5_all_ensemble)
print()
print_metrics(test_y=test_y, predictions=top_5_all_pred)

Weight: 0.38, classifier: adaboost
Weight: 0.28, classifier: adaboost
Weight: 0.14, classifier: xgradient_boosting
Weight: 0.04, classifier: gradient_boosting
Weight: 0.04, classifier: k_nearest_neighbors
Weight: 0.02, classifier: gradient_boosting
Weight: 0.02, classifier: gradient_boosting
Weight: 0.02, classifier: gradient_boosting
Weight: 0.02, classifier: liblinear_svc
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: gradient_boosting

Accuracy: 0.49523809523809526


Unnamed: 0,precision,recall,F1
macro,0.472987,0.434205,0.436866
weighted,0.482032,0.495238,0.47665


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,2,11,1,1,0,1
friend,3,51,1,14,0,2
task,0,4,15,0,0,0
family_live_separate,0,18,1,20,7,3
family_live_together,0,1,1,7,5,6
sig_other,0,7,1,9,7,11
