<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [4]:
%%javascript
// ToC script: download the remote table-of-contents helper and execute it in this page.
// NOTE(review): presumably the helper fills the #toc element declared at the top of the
// notebook -- confirm. It needs network access and runs third-party code on every load.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [5]:
# Code hider, source: http://chris-said.io/2016/02/13/how-to-make-polished-jupyter-presentations-with-optional-code-visibility/
# Renders a "Show Code" / "Hide Code" button that toggles every `div.input` element.
# Inputs start hidden once the document is ready; `code_shown` is a page-global flag
# shared between the ready handler and `code_toggle`.
from IPython.display import HTML

# The animation duration is passed as the number 500 (milliseconds). A string such as
# '500' is not one of jQuery's named speeds, so jQuery would fall back to its default.
HTML('''
<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide(500);
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show(500);
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }

  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>
''')

In [7]:
%matplotlib inline

from collections import Counter
import os,sys,inspect
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from autosklearn.regression import AutoSklearnRegressor
from sklearn.ensemble import RandomForestRegressor
import autosklearn.metrics as auto_metrics
import sklearn.metrics as sk_metrics
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, mean_squared_error

# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Prepend the parent of the current frame's source directory to sys.path so that the
# project-local `model` package imported below can be resolved.
# NOTE(review): inside a notebook the frame has no real source file, so this presumably
# resolves relative to the kernel's working directory -- confirm the notebook is always
# launched from its own folder.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

# NOTE(review): the star import hides which names come from model_util; `contact_types`
# (used by print_metrics) is presumably one of them -- confirm.
from model.model_util import *

In [8]:
# Target column names. Each entry is read as a column of the train/test frames, and the
# cells below fit or load one model per entry.
predict_targets = [
    'q1_want',
    'q2_talk',
    'q3_loan',
    'q4_closeness'
]

# Seed value for reproducibility.
# NOTE(review): not referenced by any visible cell; get_rand_forest_results passes a
# literal random_state=2 instead.
rand_seed = 2

In [9]:
# utility functions

def print_ensemble(ensemble, latex=False, is_clf=True):
    """
    Print the weight and chosen algorithm of every member of an ensemble.

    ensemble: iterable of (weight, pipeline) pairs; each pipeline must expose a
        `configuration` mapping containing the key "<kind>:__choice__".
    latex: when True, emit each member as a LaTeX table row ("weight & algo \\\\");
        otherwise emit a plain "Weight: ..., classifier: ..." line.
    is_clf: selects the configuration key prefix -- "classifier" when True,
        "regressor" when False.

    Raises KeyError if a pipeline's configuration lacks the expected key.
    """
    # Fixed typo: this used to read 'classifer', so the default is_clf=True path looked
    # up the misspelt key "classifer:__choice__".
    target_predictor = 'classifier' if is_clf else 'regressor'
    choice_key = "{}:__choice__".format(target_predictor)

    # The plain-text label stays "classifier" for both kinds so output is unchanged.
    row_template = "{} & {} \\\\" if latex else "Weight: {}, classifier: {}"
    for weight, pipeline in ensemble:
        print(row_template.format(weight, pipeline.configuration[choice_key]))

            
def print_metrics(test_y, predictions):
    """
    Print accuracy, then display a precision/recall/F1 table and a confusion matrix.

    Two averaging modes are reported:
    - macro: per-class metrics averaged without accounting for class imbalance
    - weighted: per-class metrics weighted by the number of true examples per class

    The confusion matrix is labelled with `contact_types`, which must already be
    defined at module level (rows are true labels, "p_"-prefixed columns are predictions).
    """
    print("Accuracy:", accuracy_score(test_y, predictions))

    averaging_modes = ['macro', 'weighted']
    scorers = (sk_metrics.precision_score, sk_metrics.recall_score, sk_metrics.f1_score)

    # One row per averaging mode, one column per scorer (precision, recall, F1).
    score_table = np.zeros((len(averaging_modes), len(scorers)))
    for row, mode in enumerate(averaging_modes):
        for col, scorer in enumerate(scorers):
            score_table[row, col] = scorer(test_y, predictions, average=mode)

    display(pd.DataFrame(score_table, index=averaging_modes, columns=['precision', 'recall', 'F1']))

    predicted_labels = ["p_" + x for x in contact_types]
    display(pd.DataFrame(confusion_matrix(test_y, predictions),
                         index=contact_types,
                         columns=predicted_labels))

In [10]:
def print_reg_metrics(test_y_dict, pred_dict):
    """
    Display a table with the R^2 and MSE of each target's predictions.

    test_y_dict: mapping from target name to the true values.
    pred_dict: mapping from target name to the predicted values; its keys determine
        which targets are scored and the column order of the table.
    """
    targets = pred_dict.keys()
    scorers = (sk_metrics.r2_score, sk_metrics.mean_squared_error)

    # Row 0 holds R^2, row 1 holds MSE; one column per target.
    score_table = np.zeros((len(scorers), len(targets)))
    for col, target in enumerate(targets):
        actual, predicted = test_y_dict[target], pred_dict[target]
        for row, scorer in enumerate(scorers):
            score_table[row, col] = scorer(actual, predicted)

    display(pd.DataFrame(score_table, index=['R^2', 'MSE'], columns=targets))
    

def get_rand_forest_results(train_data, test_data):
    """
    Fit one random-forest regressor per entry of `predict_targets` and predict on the test set.

    train_data, test_data: frames containing the feature columns plus 'contact_type',
        'pid', 'combined_hash' and every column named in `predict_targets`.

    Returns (actual_dict, pred_dict), both keyed by target name: the true test values
    and the corresponding predictions.
    """
    # Identifier columns and all targets are excluded from the features. The feature
    # matrices do not depend on the target being fit, so build them once up front.
    non_feature_cols = ['contact_type', 'pid', 'combined_hash'] + predict_targets
    train_X = train_data.drop(non_feature_cols, axis=1)
    test_X = test_data.drop(non_feature_cols, axis=1)

    pred_dict = {}
    actual_dict = {}

    for target in predict_targets:
        actual_dict[target] = test_data[target]

        # A fresh estimator per target; the fixed random_state keeps runs repeatable.
        regressor = RandomForestRegressor(random_state=2, n_jobs=2, n_estimators=50, warm_start=True)
        regressor.fit(train_X, train_data[target])
        pred_dict[target] = regressor.predict(test_X)

    return actual_dict, pred_dict

# AutoML Regression

Models are trained using the out-of-the-box `AutoSklearnRegressor` provided by the `auto-sklearn` module. This is the scikit-learn-based implementation of the AutoML system described in "Efficient and Robust Automated Machine Learning."

[Module documentation](https://automl.github.io/auto-sklearn/stable/index.html)

[AutoML paper](https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning)

# Baseline Random Forest

## Communication features only

In [11]:
# Collects the random-forest q4_closeness predictions, one entry per feature set,
# in the order the evaluation cells below are run.
rf_closeness_preds = []

In [13]:
# Load the pickled train/test feature frames; `with` closes each file handle once read.
# NOTE: pickle.load can execute arbitrary code -- only load trusted artifacts.
with open('../data/rd2_features/top_5_baseline_train_features.df', 'rb') as train_file:
    top_5_baseline_train = pickle.load(train_file)
with open('../data/rd2_features/top_5_baseline_test_features.df', 'rb') as test_file:
    top_5_baseline_test = pickle.load(test_file)

actual_dict, pred_dict = get_rand_forest_results(top_5_baseline_train, top_5_baseline_test)
print_reg_metrics(actual_dict, pred_dict)

# Re-running this cell appends a duplicate entry.
rf_closeness_preds.append(pred_dict['q4_closeness'])

Unnamed: 0,q1_want,q2_talk,q3_loan,q4_closeness
R^2,0.061598,0.187434,0.098771,0.219407
MSE,2.262059,3.014037,5.250296,3.068172


## Communication + age/sex

In [15]:
# Load the pickled train/test feature frames; `with` closes each file handle once read.
# NOTE: pickle.load can execute arbitrary code -- only load trusted artifacts.
with open('../data/rd2_features/top_5_age_gender_train_features.df', 'rb') as train_file:
    top_5_age_gender_train = pickle.load(train_file)
with open('../data/rd2_features/top_5_age_gender_test_features.df', 'rb') as test_file:
    top_5_age_gender_test = pickle.load(test_file)

actual_dict, pred_dict = get_rand_forest_results(top_5_age_gender_train, top_5_age_gender_test)
print_reg_metrics(actual_dict, pred_dict)

# Re-running this cell appends a duplicate entry.
rf_closeness_preds.append(pred_dict['q4_closeness'])

Unnamed: 0,q1_want,q2_talk,q3_loan,q4_closeness
R^2,0.05775,0.206931,0.120913,0.230501
MSE,2.271336,2.941715,5.121307,3.024566


## Communication + demographics

In [16]:
# Load the pickled train/test feature frames; `with` closes each file handle once read.
# NOTE: pickle.load can execute arbitrary code -- only load trusted artifacts.
with open('../data/rd2_features/top_5_demo_train_features.df', 'rb') as train_file:
    top_5_demo_train = pickle.load(train_file)
with open('../data/rd2_features/top_5_demo_test_features.df', 'rb') as test_file:
    top_5_demo_test = pickle.load(test_file)

actual_dict, pred_dict = get_rand_forest_results(top_5_demo_train, top_5_demo_test)
print_reg_metrics(actual_dict, pred_dict)

# Re-running this cell appends a duplicate entry.
rf_closeness_preds.append(pred_dict['q4_closeness'])

Unnamed: 0,q1_want,q2_talk,q3_loan,q4_closeness
R^2,0.080359,0.231371,0.108065,0.237374
MSE,2.216835,2.851062,5.196151,2.997551


## Communication + location

In [17]:
# Load the pickled train/test feature frames; `with` closes each file handle once read.
# NOTE: pickle.load can execute arbitrary code -- only load trusted artifacts.
with open('../data/rd2_features/top_5_loc_train_features.df', 'rb') as train_file:
    top_5_loc_train = pickle.load(train_file)
with open('../data/rd2_features/top_5_loc_test_features.df', 'rb') as test_file:
    top_5_loc_test = pickle.load(test_file)

actual_dict, pred_dict = get_rand_forest_results(top_5_loc_train, top_5_loc_test)
print_reg_metrics(actual_dict, pred_dict)

# Re-running this cell appends a duplicate entry.
rf_closeness_preds.append(pred_dict['q4_closeness'])

Unnamed: 0,q1_want,q2_talk,q3_loan,q4_closeness
R^2,0.027505,0.198899,0.063982,0.223408
MSE,2.344242,2.971512,5.452968,3.052445


## All features

In [18]:
# Load the pickled train/test feature frames; `with` closes each file handle once read.
# NOTE: pickle.load can execute arbitrary code -- only load trusted artifacts.
with open('../data/rd2_features/top_5_all_train_features.df', 'rb') as train_file:
    top_5_all_train = pickle.load(train_file)
with open('../data/rd2_features/top_5_all_test_features.df', 'rb') as test_file:
    top_5_all_test = pickle.load(test_file)

actual_dict, pred_dict = get_rand_forest_results(top_5_all_train, top_5_all_test)
print_reg_metrics(actual_dict, pred_dict)

# Re-running this cell appends a duplicate entry.
rf_closeness_preds.append(pred_dict['q4_closeness'])

Unnamed: 0,q1_want,q2_talk,q3_loan,q4_closeness
R^2,0.048212,0.196893,0.091855,0.264235
MSE,2.294326,2.978949,5.290589,2.891975


# Top 5 contacts model

## Baseline communication features

Features:
- top 5 contacts per participant
- communication features as described in `feature_extract.ipynb`, with NaN indicators and imputation

Preprocessing: N/A

In [30]:
# True test values per target, used to score the pre-computed AutoML predictions below.
# NOTE(review): `test_data` is not assigned in any visible cell -- confirm where it
# comes from; a fresh kernel may raise NameError here.
actual_dict = {target: test_data[target] for target in predict_targets}

In [None]:
# Collects AutoML q4_closeness predictions (the AutoML counterpart of rf_closeness_preds).
automl_closeness_preds = []

In [44]:
# Load the fitted AutoML model and its stored test predictions for every target.
# NOTE: pickle.load can execute arbitrary code -- only load trusted artifacts.
baseline_models = {}
baseline_preds = {}

for target in predict_targets:
    # `with` closes each file handle as soon as the object is unpickled.
    with open("quadcorn/top_5_{}_baseline.automl".format(target), "rb") as model_file:
        baseline_models[target] = pickle.load(model_file)
    with open("quadcorn/top_5_{}_baseline.predict".format(target), "rb") as pred_file:
        baseline_preds[target] = pickle.load(pred_file)

print_reg_metrics(actual_dict, baseline_preds)

# Only the closeness ensemble is printed for this feature set.
for target, model in baseline_models.items():
    if target == 'q4_closeness':
        print(target)
        ensemble = model.get_models_with_weights()
        print_ensemble(ensemble, is_clf=False)

# Record the AutoML closeness predictions. This previously appended the stale
# random-forest `pred_dict` to `rf_closeness_preds`, a copy-paste slip from the RF cells.
automl_closeness_preds.append(baseline_preds['q4_closeness'])

Could not delete output dir: /tmp/autosklearn_output_10243_2961
Could not delete tmp dir: /tmp/autosklearn_tmp_10243_2961
Could not delete output dir: /tmp/autosklearn_output_17096_998
Could not delete tmp dir: /tmp/autosklearn_tmp_17096_998
Could not delete output dir: /tmp/autosklearn_output_22516_3175
Could not delete tmp dir: /tmp/autosklearn_tmp_22516_3175


Unnamed: 0,q1_want,q2_talk,q3_loan,q4_closeness
R^2,0.079346,0.154661,0.118709,0.286069
MSE,2.219277,3.135602,5.134146,2.806152


Could not delete output dir: /tmp/autosklearn_output_29045_2469
Could not delete tmp dir: /tmp/autosklearn_tmp_29045_2469


q1_want
Weight: 0.32, classifier: random_forest
Weight: 0.22, classifier: random_forest
Weight: 0.14, classifier: sgd
Weight: 0.12, classifier: decision_tree
Weight: 0.08, classifier: adaboost
Weight: 0.08, classifier: ridge_regression
Weight: 0.04, classifier: sgd
q2_talk
Weight: 0.22, classifier: ridge_regression
Weight: 0.2, classifier: xgradient_boosting
Weight: 0.16, classifier: ard_regression
Weight: 0.14, classifier: adaboost
Weight: 0.12, classifier: xgradient_boosting
Weight: 0.1, classifier: liblinear_svr
Weight: 0.02, classifier: ridge_regression
Weight: 0.02, classifier: ridge_regression
Weight: 0.02, classifier: libsvm_svr
q3_loan
Weight: 0.34, classifier: extra_trees
Weight: 0.24, classifier: sgd
Weight: 0.12, classifier: sgd
Weight: 0.12, classifier: liblinear_svr
Weight: 0.06, classifier: sgd
Weight: 0.04, classifier: adaboost
Weight: 0.04, classifier: liblinear_svr
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: liblinear_svr
q4_closeness
Weight: 0.2800

## Communication + age/sex

In [45]:
# Load the fitted AutoML model and its stored test predictions for every target.
# NOTE: pickle.load can execute arbitrary code -- only load trusted artifacts.
age_gender_models = {}
age_gender_preds = {}

for target in predict_targets:
    # `with` closes each file handle as soon as the object is unpickled.
    with open("quadcorn/top_5_{}_age_gender.automl".format(target), "rb") as model_file:
        age_gender_models[target] = pickle.load(model_file)
    with open("quadcorn/top_5_{}_age_gender.predict".format(target), "rb") as pred_file:
        age_gender_preds[target] = pickle.load(pred_file)

print_reg_metrics(actual_dict, age_gender_preds)

# Show the weighted ensemble members chosen for each target.
for target, model in age_gender_models.items():
    print(target)
    ensemble = model.get_models_with_weights()
    print_ensemble(ensemble, is_clf=False)

Unnamed: 0,q1_want,q2_talk,q3_loan,q4_closeness
R^2,0.10062,0.166397,0.123118,0.253668
MSE,2.167995,3.092068,5.108458,2.933508


q1_want
Weight: 0.24, classifier: extra_trees
Weight: 0.24, classifier: adaboost
Weight: 0.24, classifier: extra_trees
Weight: 0.16, classifier: ard_regression
Weight: 0.1, classifier: random_forest
Weight: 0.02, classifier: extra_trees
q2_talk
Weight: 0.26, classifier: ridge_regression
Weight: 0.26, classifier: liblinear_svr
Weight: 0.12, classifier: extra_trees
Weight: 0.1, classifier: ridge_regression
Weight: 0.08, classifier: extra_trees
Weight: 0.08, classifier: k_nearest_neighbors
Weight: 0.08, classifier: extra_trees
Weight: 0.02, classifier: adaboost
q3_loan
Weight: 0.2, classifier: liblinear_svr
Weight: 0.18, classifier: extra_trees
Weight: 0.14, classifier: k_nearest_neighbors
Weight: 0.12, classifier: adaboost
Weight: 0.12, classifier: ard_regression
Weight: 0.1, classifier: liblinear_svr
Weight: 0.08, classifier: ard_regression
Weight: 0.04, classifier: ard_regression
Weight: 0.02, classifier: extra_trees
q4_closeness
Weight: 0.26, classifier: k_nearest_neighbors
Weight: 0.

## Communication + demographics

In [46]:
# Load the fitted AutoML model and its stored test predictions for every target.
# NOTE: pickle.load can execute arbitrary code -- only load trusted artifacts.
demo_models = {}
demo_preds = {}

for target in predict_targets:
    # `with` closes each file handle as soon as the object is unpickled.
    with open("quadcorn/top_5_{}_demo.automl".format(target), "rb") as model_file:
        demo_models[target] = pickle.load(model_file)
    with open("quadcorn/top_5_{}_demo.predict".format(target), "rb") as pred_file:
        demo_preds[target] = pickle.load(pred_file)

print_reg_metrics(actual_dict, demo_preds)

# Show the weighted ensemble members chosen for each target.
for target, model in demo_models.items():
    print(target)
    ensemble = model.get_models_with_weights()
    print_ensemble(ensemble, is_clf=False)

Unnamed: 0,q1_want,q2_talk,q3_loan,q4_closeness
R^2,0.057559,0.174305,0.117829,0.277606
MSE,2.271794,3.062735,5.139272,2.839417


q1_want
Weight: 0.3, classifier: extra_trees
Weight: 0.3, classifier: ard_regression
Weight: 0.22, classifier: extra_trees
Weight: 0.08, classifier: ard_regression
Weight: 0.04, classifier: extra_trees
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: ard_regression
Weight: 0.02, classifier: liblinear_svr
q2_talk
Weight: 0.26, classifier: ridge_regression
Weight: 0.24, classifier: extra_trees
Weight: 0.2, classifier: sgd
Weight: 0.2, classifier: ard_regression
Weight: 0.06, classifier: random_forest
Weight: 0.04, classifier: extra_trees
q3_loan
Weight: 0.26, classifier: ridge_regression
Weight: 0.18, classifier: adaboost
Weight: 0.16, classifier: liblinear_svr
Weight: 0.1, classifier: adaboost
Weight: 0.08, classifier: liblinear_svr
Weight: 0.06, classifier: k_nearest_neighbors
Weight: 0.06, classifier: adaboost
Weight: 0.04, classifier: decision_tree
Weight: 0.04, classifier: k_nearest_neighbors
Weight: 0.02, classifier: ridge_regression
q4_closeness
Weight: 0.3, classif

## Communication + location features

In [47]:
# Load the fitted AutoML model and its stored test predictions for every target.
# NOTE: pickle.load can execute arbitrary code -- only load trusted artifacts.
loc_models = {}
loc_preds = {}

for target in predict_targets:
    # `with` closes each file handle as soon as the object is unpickled.
    with open("quadcorn/top_5_{}_loc.automl".format(target), "rb") as model_file:
        loc_models[target] = pickle.load(model_file)
    with open("quadcorn/top_5_{}_loc.predict".format(target), "rb") as pred_file:
        loc_preds[target] = pickle.load(pred_file)

print_reg_metrics(actual_dict, loc_preds)

# Show the weighted ensemble members chosen for each target.
for target, model in loc_models.items():
    print(target)
    ensemble = model.get_models_with_weights()
    print_ensemble(ensemble, is_clf=False)

Unnamed: 0,q1_want,q2_talk,q3_loan,q4_closeness
R^2,0.069003,0.186755,0.11608,0.259111
MSE,2.24421,3.016555,5.149463,2.912115


q1_want
Weight: 0.26, classifier: ridge_regression
Weight: 0.24, classifier: extra_trees
Weight: 0.1, classifier: extra_trees
Weight: 0.1, classifier: k_nearest_neighbors
Weight: 0.08, classifier: ard_regression
Weight: 0.08, classifier: ridge_regression
Weight: 0.08, classifier: libsvm_svr
Weight: 0.06, classifier: ridge_regression
q2_talk
Weight: 0.32, classifier: random_forest
Weight: 0.24, classifier: ridge_regression
Weight: 0.14, classifier: ard_regression
Weight: 0.12, classifier: ard_regression
Weight: 0.1, classifier: adaboost
Weight: 0.06, classifier: k_nearest_neighbors
Weight: 0.02, classifier: ard_regression
q3_loan
Weight: 0.26, classifier: sgd
Weight: 0.18, classifier: liblinear_svr
Weight: 0.18, classifier: ard_regression
Weight: 0.1, classifier: adaboost
Weight: 0.08, classifier: adaboost
Weight: 0.06, classifier: ard_regression
Weight: 0.06, classifier: sgd
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: extra_trees
Weight: 0.02, classifier: random_for

## All features

In [48]:
# Load the fitted AutoML model and its stored test predictions for every target.
# NOTE: pickle.load can execute arbitrary code -- only load trusted artifacts.
all_models = {}
all_preds = {}

for target in predict_targets:
    # `with` closes each file handle as soon as the object is unpickled.
    with open("quadcorn/top_5_{}_all.automl".format(target), "rb") as model_file:
        all_models[target] = pickle.load(model_file)
    with open("quadcorn/top_5_{}_all.predict".format(target), "rb") as pred_file:
        all_preds[target] = pickle.load(pred_file)

print_reg_metrics(actual_dict, all_preds)

# Show the weighted ensemble members chosen for each target.
for target, model in all_models.items():
    print(target)
    ensemble = model.get_models_with_weights()
    print_ensemble(ensemble, is_clf=False)

Unnamed: 0,q1_want,q2_talk,q3_loan,q4_closeness
R^2,0.075216,0.161481,0.137403,0.277435
MSE,2.229232,3.110304,5.025236,2.840089


q1_want
Weight: 0.36, classifier: ard_regression
Weight: 0.22, classifier: ard_regression
Weight: 0.14, classifier: extra_trees
Weight: 0.1, classifier: ard_regression
Weight: 0.06, classifier: random_forest
Weight: 0.06, classifier: extra_trees
Weight: 0.04, classifier: random_forest
Weight: 0.02, classifier: k_nearest_neighbors
q2_talk
Weight: 0.2, classifier: extra_trees
Weight: 0.18, classifier: ard_regression
Weight: 0.18, classifier: liblinear_svr
Weight: 0.16, classifier: ard_regression
Weight: 0.1, classifier: libsvm_svr
Weight: 0.08, classifier: liblinear_svr
Weight: 0.08, classifier: xgradient_boosting
Weight: 0.02, classifier: ridge_regression
q3_loan
Weight: 0.18, classifier: liblinear_svr
Weight: 0.16, classifier: ard_regression
Weight: 0.14, classifier: random_forest
Weight: 0.1, classifier: ard_regression
Weight: 0.1, classifier: extra_trees
Weight: 0.06, classifier: ard_regression
Weight: 0.06, classifier: ard_regression
Weight: 0.06, classifier: liblinear_svr
Weight: 0