<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [29]:
%%javascript
// ToC script: fetch and run the remote table-of-contents helper.
// NOTE(review): presumably it fills the `#toc` div declared at the top of the
// notebook -- confirm. The script is pulled from a third-party URL at run time,
// so this cell needs network access and executes whatever that URL serves.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [30]:
# Code hider, source: http://chris-said.io/2016/02/13/how-to-make-polished-jupyter-presentations-with-optional-code-visibility/
#
# Renders a "Show Code" / "Hide Code" button. The embedded script hides every
# `div.input` element once the document is ready and toggles their visibility
# each time the button's form is submitted.
#
# The animation duration is passed as the number 500 (milliseconds). jQuery only
# accepts numbers or the keywords 'fast'/'slow'; the previous string '500' was
# not recognised and silently fell back to the 400 ms default.
from IPython.display import HTML

HTML('''
<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide(500);
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show(500);
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }

  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>
''')

In [31]:
%matplotlib inline

import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from autosklearn.classification import AutoSklearnClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix

In [32]:
# load models and data
# NOTE: this suppresses *all* warnings for the rest of the kernel session.
warnings.filterwarnings('ignore')


def _load_pickle(path):
    """Return the object unpickled from `path`, closing the file afterwards.

    SECURITY: unpickling can execute arbitrary code, so only point this at
    files produced by a trusted run of this project.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Pickled models, one per contact-set size.
top_5_model = _load_pickle("bleen/top_5_best_model_final.automl")
top_10_model = _load_pickle("bleen/top_10_best_model_final.automl")

# Pickled train/test feature frames for the top-5 contacts.
top_5_train = _load_pickle('../data/top_5_train_features.df')
top_5_test = _load_pickle('../data/top_5_test_features.df')

# Pickled train/test feature frames used for the top-10 models.
top_10_train = _load_pickle('../data/train_features.df')
top_10_test = _load_pickle('../data/test_features.df')

# Mapping from contact-type name to integer class code, in the nested form
# accepted by DataFrame.replace ({column: {old: new}}).
# "other" is intentionally left unmapped, so code 5 is unused.
replace_dict = {
    'contact_type': {
        "work": 0,
        "friend": 1,
        "task": 2,
        "family_live_separate": 3,
        "family_live_together": 4,
#        "other": 5,
        "sig_other": 6
    }
}

# Class names in insertion order, which here is also ascending code order.
contact_types = list(replace_dict['contact_type'].keys())

# Top 5 contacts model

In [26]:
# Swap the contact-type names for their integer codes (see replace_dict).
train_data = top_5_train.replace(replace_dict)
test_data = top_5_test.replace(replace_dict)

# Target is `contact_type`; features are every remaining column except
# `pid` and `combined_hash` (presumably identifiers -- confirm).
train_X, train_y = (
    train_data.drop(columns=['contact_type', 'pid', 'combined_hash']),
    train_data['contact_type'],
)
test_X, test_y = (
    test_data.drop(columns=['contact_type', 'pid', 'combined_hash']),
    test_data['contact_type'],
)

In [None]:
# Refit the unpickled top-5 model on the top-5 training split prepared above.
# NOTE(review): presumably needed because the pickled auto-sklearn model has to
# be re-fit on the training data before predict() can be used -- confirm.
top_5_model.refit(train_X, train_y)

## Base results

Features:
- top 5 contacts per participant
- communication features as described in `feature_extract.ipynb`, with NaN indicators and imputation

Preprocessing: N/A

In [28]:
# Score the refit top-5 model on the held-out test split.
predictions = top_5_model.predict(test_X)
print("Accuracy:", accuracy_score(test_y, predictions))
print("Balanced accuracy:", balanced_accuracy_score(test_y, predictions))

# Class codes in the same order as `contact_types`. Passing them explicitly
# keeps the matrix len(contact_types) square and ties every row/column to its
# name, even when a class is missing from both test_y and predictions.
label_codes = [replace_dict['contact_type'][name] for name in contact_types]

# confusion_matrix(labels=...) silently drops labels outside the list, so fail
# loudly if the truth contains a value that replace_dict did not map.
unexpected = set(test_y) - set(label_codes)
assert not unexpected, f"unmapped contact_type values in test_y: {unexpected}"

# rows are truth, columns are predicted
top_5_confusion_mat = confusion_matrix(test_y, predictions, labels=label_codes)
top_5_confuse_df = pd.DataFrame(top_5_confusion_mat, index=contact_types, columns=["p_" + x for x in contact_types])
display(top_5_confuse_df)

Accuracy: 0.40476190476190477
Balanced accuracy: 0.32181938750296263


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,0,7,0,7,1,1
friend,1,46,2,19,0,3
task,1,3,11,3,0,1
family_live_separate,0,23,1,17,4,4
family_live_together,0,5,0,8,2,5
sig_other,0,12,0,13,1,9


## Resample results

Features:
- top 5 contacts per participant
- communication features as described in `feature_extract.ipynb`, with NaN indicators and imputation

Preprocessing: 
- SMOTE algorithm ([reference](https://imbalanced-learn.org/en/stable/generated/imblearn.over_sampling.SMOTE.html)) applied to balance training classes


In [None]:
# Load the pickled "oversample" top-5 model; the context manager closes the
# file handle once unpickling is done.
# SECURITY: unpickling can execute arbitrary code -- trusted files only.
with open("bleen/top_5_oversample.automl", "rb") as model_file:
    top_5_model_res = pickle.load(model_file)

# NOTE(review): the refit uses train_X/train_y exactly as prepared for the base
# model; no resampling step is visible in this notebook. Confirm where SMOTE is
# applied (e.g. during the original model search).
top_5_model_res.refit(train_X, train_y)

In [27]:
# Score the refit top-5 oversample model on the held-out test split.
# Note: this reuses (and overwrites) the `predictions`, `top_5_confusion_mat`
# and `top_5_confuse_df` names from the base-results cell.
predictions = top_5_model_res.predict(test_X)
print("Accuracy:", accuracy_score(test_y, predictions))
print("Balanced accuracy:", balanced_accuracy_score(test_y, predictions))

# Class codes in the same order as `contact_types`. Passing them explicitly
# keeps the matrix len(contact_types) square and ties every row/column to its
# name, even when a class is missing from both test_y and predictions.
label_codes = [replace_dict['contact_type'][name] for name in contact_types]

# confusion_matrix(labels=...) silently drops labels outside the list, so fail
# loudly if the truth contains a value that replace_dict did not map.
unexpected = set(test_y) - set(label_codes)
assert not unexpected, f"unmapped contact_type values in test_y: {unexpected}"

# rows are truth, columns are predicted
top_5_confusion_mat = confusion_matrix(test_y, predictions, labels=label_codes)
top_5_confuse_df = pd.DataFrame(top_5_confusion_mat, index=contact_types, columns=["p_" + x for x in contact_types])
display(top_5_confuse_df)

Accuracy: 0.4238095238095238
Balanced accuracy: 0.38041885397598624


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,4,5,1,6,0,0
friend,1,41,2,25,1,1
task,1,3,12,2,0,1
family_live_separate,0,24,1,19,3,2
family_live_together,0,4,0,11,3,2
sig_other,0,11,1,11,2,10


# Top 10 contacts model

In [None]:
# Swap the contact-type names for their integer codes (see replace_dict).
# Note: from here on train_data/test_data and the X/y splits refer to the
# top-10 frames, replacing the top-5 values assigned earlier in the notebook.
train_data = top_10_train.replace(replace_dict)
test_data = top_10_test.replace(replace_dict)

# Target is `contact_type`; features are every remaining column except
# `pid` and `combined_hash` (presumably identifiers -- confirm).
train_X, train_y = (
    train_data.drop(columns=['contact_type', 'pid', 'combined_hash']),
    train_data['contact_type'],
)
test_X, test_y = (
    test_data.drop(columns=['contact_type', 'pid', 'combined_hash']),
    test_data['contact_type'],
)

# Refit the unpickled top-10 model on the top-10 training split.
top_10_model.refit(train_X, train_y)

## Base results

Features:
- top 10 contacts per participant
- communication features as described in `feature_extract.ipynb`, with NaN indicators and imputation

Preprocessing: N/A

In [25]:
# Score the refit top-10 model on the held-out test split.
predictions = top_10_model.predict(test_X)
print("Accuracy:", accuracy_score(test_y, predictions))
print("Balanced accuracy:", balanced_accuracy_score(test_y, predictions))

# Class codes in the same order as `contact_types`. Passing them explicitly
# keeps the matrix len(contact_types) square and ties every row/column to its
# name, even when a class is missing from both test_y and predictions.
label_codes = [replace_dict['contact_type'][name] for name in contact_types]

# confusion_matrix(labels=...) silently drops labels outside the list, so fail
# loudly if the truth contains a value that replace_dict did not map.
unexpected = set(test_y) - set(label_codes)
assert not unexpected, f"unmapped contact_type values in test_y: {unexpected}"

# rows are truth, columns are predicted
top_10_confusion_mat = confusion_matrix(test_y, predictions, labels=label_codes)
top_10_confuse_df = pd.DataFrame(top_10_confusion_mat, index=contact_types, columns=["p_" + x for x in contact_types])
display(top_10_confuse_df)

Accuracy: 0.44285714285714284
Balanced accuracy: 0.35409502146143373


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,5,29,4,2,1,3
friend,2,93,7,25,2,2
task,2,15,44,2,0,0
family_live_separate,0,53,11,32,9,5
family_live_together,0,7,1,13,3,4
sig_other,0,15,4,12,4,9


## Resample results

Features:
- top 10 contacts per participant
- communication features as described in `feature_extract.ipynb`, with NaN indicators and imputation

Preprocessing: 
- SMOTE algorithm ([reference](https://imbalanced-learn.org/en/stable/generated/imblearn.over_sampling.SMOTE.html)) applied to balance training classes


In [None]:
# Load the pickled "oversample" top-10 model; the context manager closes the
# file handle once unpickling is done.
# SECURITY: unpickling can execute arbitrary code -- trusted files only.
with open("bleen/top_10_oversample.automl", "rb") as model_file:
    top_10_model_res = pickle.load(model_file)

# NOTE(review): the refit uses train_X/train_y exactly as prepared for the base
# model; no resampling step is visible in this notebook. Confirm where SMOTE is
# applied (e.g. during the original model search).
top_10_model_res.refit(train_X, train_y)

In [24]:
# Score the refit top-10 oversample model on the held-out test split.
# Note: this reuses (and overwrites) the `predictions`, `top_10_confusion_mat`
# and `top_10_confuse_df` names from the base-results cell.
predictions = top_10_model_res.predict(test_X)
print("Accuracy:", accuracy_score(test_y, predictions))
print("Balanced accuracy:", balanced_accuracy_score(test_y, predictions))

# Class codes in the same order as `contact_types`. Passing them explicitly
# keeps the matrix len(contact_types) square and ties every row/column to its
# name, even when a class is missing from both test_y and predictions.
label_codes = [replace_dict['contact_type'][name] for name in contact_types]

# confusion_matrix(labels=...) silently drops labels outside the list, so fail
# loudly if the truth contains a value that replace_dict did not map.
unexpected = set(test_y) - set(label_codes)
assert not unexpected, f"unmapped contact_type values in test_y: {unexpected}"

# rows are truth, columns are predicted
top_10_confusion_mat = confusion_matrix(test_y, predictions, labels=label_codes)
top_10_confuse_df = pd.DataFrame(top_10_confusion_mat, index=contact_types, columns=["p_" + x for x in contact_types])
display(top_10_confuse_df)

Accuracy: 0.4380952380952381
Balanced accuracy: 0.36186776892883765


Unnamed: 0,p_work,p_friend,p_task,p_family_live_separate,p_family_live_together,p_sig_other
work,9,21,4,7,1,2
friend,5,81,5,30,8,2
task,4,11,37,10,0,1
family_live_separate,3,41,8,43,10,5
family_live_together,0,9,1,12,4,2
sig_other,0,16,4,10,4,10
