# Tie Strength Regression

In [1]:
%matplotlib inline

from collections import Counter
import inspect
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold, cross_val_score
import sklearn.metrics as sk_metrics
from sklearn.svm import SVC

# Put the parent of this notebook's directory at the front of sys.path so the
# `model` package imported just below can be resolved.
# NOTE(review): inside a notebook the current frame has no real source file, so
# `currentdir` presumably ends up being the kernel's working directory -- confirm.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

from model.model_util import *


In [2]:
# Code hider, source: http://chris-said.io/2016/02/13/how-to-make-polished-jupyter-presentations-with-optional-code-visibility/
#
# Renders a "Show Code" / "Hide Code" button. The embedded script hides every
# `div.input` element once the document is ready, and `code_toggle()` flips
# their visibility (and the button label) on each form submit.
# NOTE(review): the script uses jQuery-style `$(...)` calls and the `div.input`
# selector, so it presumably only works in front-ends that provide both
# (e.g. the classic Notebook UI) -- confirm before relying on it elsewhere.
from IPython.display import HTML

# The HTML object is the cell's last expression, so the notebook renders it as
# the cell output.
HTML('''
<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }

  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>
''')

In [9]:
def print_reg_metrics(test_y_dict, pred_dict):
    """
    Display a table of R^2 and MSE, one column per entry of ``pred_dict``.

    Parameters
    ----------
    test_y_dict : mapping
        Ground-truth values, looked up with the same keys as ``pred_dict``.
    pred_dict : mapping
        Predicted values; its keys (in iteration order) become the table's
        column labels.

    Returns
    -------
    None
        The table is shown through ``display`` rather than returned.
    """
    model_names = list(pred_dict)
    # Row order here must match the index labels used for the DataFrame below.
    scorers = (sk_metrics.r2_score, sk_metrics.mean_squared_error)

    scores = np.zeros((len(scorers), len(model_names)))
    for col, name in enumerate(model_names):
        y_true = test_y_dict[name]
        y_pred = pred_dict[name]
        for row, scorer in enumerate(scorers):
            scores[row, col] = scorer(y_true, y_pred)

    display(pd.DataFrame(scores, index=['R^2', 'MSE'], columns=model_names))

In [29]:
# load models
features = ['baseline', 'all_orig']


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE(review): unpickling can execute arbitrary code -- only point this at
    # result files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _load_models_and_preds(model_kind, feature_sets):
    """
    Load the saved model and the saved predictions for every feature set.

    Parameters
    ----------
    model_kind : str
        Token used in the result file names ('rf' or 'automl').
    feature_sets : iterable of str
        Feature-set names used in the result file names.

    Returns
    -------
    (list, list)
        Unpickled models and unpickled predictions, both in ``feature_sets`` order.
    """
    models, preds = [], []
    for feat in feature_sets:
        stem = "../model/final_results/tie_str/tie_str_{}_{}_reg".format(feat, model_kind)
        # NOTE(review): the RF model files also carry the ".automl" extension,
        # exactly as in the original paths -- confirm that is the on-disk name.
        models.append(_load_pickle(stem + ".automl"))
        preds.append(_load_pickle(stem + ".predict"))
    return models, preds


# random forest results, one entry per feature set
rf_models, rf_preds = _load_models_and_preds('rf', features)

# autoML results, one entry per feature set
automl_models, automl_preds = _load_models_and_preds('automl', features)

Could not delete output dir: /tmp/autosklearn_output_32489_7925
Could not delete tmp dir: /tmp/autosklearn_tmp_32489_7925
Could not delete output dir: /tmp/autosklearn_output_586_5161
Could not delete tmp dir: /tmp/autosklearn_tmp_586_5161
Could not delete output dir: /tmp/autosklearn_output_31544_4532
Could not delete tmp dir: /tmp/autosklearn_tmp_31544_4532
Could not delete output dir: /tmp/autosklearn_output_31096_5338
Could not delete tmp dir: /tmp/autosklearn_tmp_31096_5338


In [30]:
# load data
# Context managers make sure both file handles are closed after unpickling.
# NOTE(review): unpickling can execute arbitrary code -- only load feature
# files produced by this project.
with open('../data/final_features/all_tie_str_all_train_features.df', 'rb') as train_file:
    train_data = pickle.load(train_file)
with open('../data/final_features/all_tie_str_all_test_features.df', 'rb') as test_file:
    test_data = pickle.load(test_file)

# Every column that is a candidate prediction target; none of them may be used
# as an input feature.
predict_targets = [
    'contact_type',
    'q1_want',
    'q2_talk',
    'q3_loan',
    'q4_closeness',
    'tie_str_score',
    'tie_str_rank',
    'tie_str_class'
]

# Columns removed from the feature matrices: all targets plus 'pid' and
# 'combined_hash' (presumably identifier columns -- confirm).
# 'contact_type' is already part of predict_targets, so it is not repeated here.
non_feature_cols = ['pid', 'combined_hash'] + predict_targets

# The regression target for this notebook is the tie strength score.
train_y = train_data['tie_str_score']
# errors='ignore' tolerates frames that lack some of the listed columns.
train_X = train_data.drop(non_feature_cols, axis=1, errors='ignore')
test_y = test_data['tie_str_score']
test_X = test_data.drop(non_feature_cols, axis=1, errors='ignore')

## Results

In [31]:
# Two separate, initially empty mappings: ground-truth values and predictions.
actual_dict, pred_dict = dict(), dict()

In [32]:
# (display label, prediction list, position within that list) for each model.
# Every model is scored against the same held-out target, test_y.
labelled_predictions = (
    ('baseline RF', rf_preds, 0),
    ('all RF', rf_preds, 1),
    ('baseline AutoML', automl_preds, 0),
    ('all AutoML', automl_preds, 1),
)

for label, preds, position in labelled_predictions:
    actual_dict[label] = test_y
    pred_dict[label] = preds[position]

In [33]:
# Show the R^2 / MSE table with one column per labelled entry in pred_dict.
print_reg_metrics(actual_dict, pred_dict)

Unnamed: 0,baseline RF,all RF,baseline AutoML,all AutoML
R^2,0.292312,0.28144,0.295846,0.282211
MSE,39.569794,40.177694,39.372175,40.134616


## Model analysis