In [1]:
# Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import tree
from sklearn.cross_validation import cross_val_score
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



In [2]:
# Read csv file into a dataframe.
df = pd.read_csv('processed_data/happiness_class_data.csv' , keep_default_na=True, sep=',\s+', delimiter=',', skipinitialspace=True)

In [3]:
# Print the feature types in our dataset.
df.dtypes

happiness_class           float64
social_support            float64
healthy_life_exp_birth    float64
pos_affect                float64
dem_quality               float64
delivery_quality          float64
life_exp_60               float64
infant_mortality          float64
dtype: object

# Decision Tree Classifier
## Train a decision tree classifier model

In [4]:
X = pd.concat([df[['social_support', 'healthy_life_exp_birth', 'pos_affect', 'dem_quality', 'delivery_quality', 'life_exp_60', 'infant_mortality']]], axis=1)
y = df['happiness_class']
X = X.fillna(X.mean())
y = y.fillna(y.mean())
print("Descriptive features:\n", X)
print("\nTarget feature:\n", y)

Descriptive features:
       social_support  healthy_life_exp_birth  pos_affect  dem_quality  \
0           0.450662               49.209663    0.517637    -1.929690   
1           0.552308               49.624432    0.583926    -2.044093   
2           0.539075               50.008961    0.618265    -1.991810   
3           0.521104               50.367298    0.611387    -1.919018   
4           0.520637               50.709263    0.710385    -1.842996   
5           0.483552               51.042980    0.620585    -1.879709   
6           0.525568               51.370525    0.531691    -1.773257   
7           0.528597               51.693527    0.553553    -1.844364   
8           0.559072               52.016529    0.564953    -1.917693   
9           0.490880               52.339527    0.496349    -1.904737   
10          0.833047               67.103607    0.640024     0.048114   
11          0.733152               67.413696    0.647908    -0.033831   
12          0.759434        

In [5]:
# Instantiate sklearn estimator, fit with training set.
# Train a classification tree with max_depth=3 on all data.
DecisionTree = tree.DecisionTreeClassifier(max_depth=3, random_state=1)
DecisionTree.fit(X, y)
print(DecisionTree)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')


In [6]:
# Check the type of this object
type(DecisionTree)

sklearn.tree.tree.DecisionTreeClassifier

## Feature importance from the DTC model

In [7]:
# Compute the importance of each feature based on the trained decision tree classifier
pd.DataFrame({'feature': X.columns, 'importance': DecisionTree.feature_importances_})

Unnamed: 0,feature,importance
0,social_support,0.629885
1,healthy_life_exp_birth,0.140952
2,pos_affect,0.033903
3,dem_quality,0.0
4,delivery_quality,0.050985
5,life_exp_60,0.144276
6,infant_mortality,0.0


In [8]:
with open("happiness.dot", 'w') as f:
    f = export_graphviz(DecisionTree, out_file=f, feature_names=X.columns)

In [9]:
# Alternative to print a tree in text format
import operator

def tree_print(Classifier, X):
    tlevel = _tree_rprint('', Classifier, X.columns, Classifier.classes_)
    print('<',end='')
    for i in range(3*tlevel - 2):
        print('-',end='')
    print('>')
    print('Tree Depth: ',tlevel)

def _tree_rprint(kword, Classifier, features, labels, node_index=0, tlevel_index=0):
    # Note: The DecisionTreeClassifier uses the Tree structure defined in:
    # 		github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx
    #       it is an array based tree implementation:

    # indent the nodes according to their tree level
    for i in range(tlevel_index):
        print('  |',end='')

	#  TODO: the following should use the TREE_LEAF constant defined in _tree.pyx
	#        instead of -1, not quite sure how to get at it from the tree user level
    if Classifier.tree_.children_left[node_index] == -1:  # indicates leaf
        print(kword, end=' ' if kword else '')
        # get the majority label
        count_list = Classifier.tree_.value[node_index, 0]
        max_index, max_value = max(enumerate(count_list), key=operator.itemgetter(1))
        max_label = labels[max_index]
        print(max_label)
        return tlevel_index
    
    else:
        # compute and print node label
        feature = features[Classifier.tree_.feature[node_index]]
        threshold = Classifier.tree_.threshold[node_index]
        print(kword, end=' ' if kword else '')
        print('if {} =< {}: '.format(feature, threshold))
        # recurse down the children
        left_index = Classifier.tree_.children_left[node_index]
        right_index = Classifier.tree_.children_right[node_index]
        ltlevel_index = _tree_rprint('then', Classifier, features, labels, left_index, tlevel_index+1)
        rtlevel_index = _tree_rprint('else', Classifier, features, labels, right_index, tlevel_index+1)
        # return the maximum depth of either one of the children
        return max(ltlevel_index,rtlevel_index)

In [10]:
tree_print(DecisionTree, X)

if social_support =< 0.8618887662887573: 
  |then if life_exp_60 =< 20.75: 
  |  |then if delivery_quality =< 0.8227246403694153: 
  |  |  |then 0.0
  |  |  |else 1.0
  |  |else if pos_affect =< 0.7344286441802979: 
  |  |  |then 0.0
  |  |  |else 1.0
  |else if healthy_life_exp_birth =< 66.58038330078125: 
  |  |then if healthy_life_exp_birth =< 57.084495544433594: 
  |  |  |then 0.0
  |  |  |else 1.0
  |  |else if social_support =< 0.8677008152008057: 
  |  |  |then 1.0
  |  |  |else 1.0
<------->
Tree Depth:  3


## Predict using trained decision tree

In [11]:
X.head()

Unnamed: 0,social_support,healthy_life_exp_birth,pos_affect,dem_quality,delivery_quality,life_exp_60,infant_mortality
0,0.450662,49.209663,0.517637,-1.92969,-1.655084,15.6,70.8
1,0.552308,49.624432,0.583926,-2.044093,-1.635025,15.7,68.2
2,0.539075,50.008961,0.618265,-1.99181,-1.617176,15.7,65.7
3,0.521104,50.367298,0.611387,-1.919018,-1.616221,15.8,63.3
4,0.520637,50.709263,0.710385,-1.842996,-1.404078,15.8,61.0


In [12]:
X_new = pd.DataFrame({'social_support': [0.55],  'healthy_life_exp_birth': [55], 'pos_affect': [0.55], 'dem_quality': [-1], 'delivery_quality': [-1.5], 'life_exp_60': [16], 'infant_mortality': [70]})
X_new.head()

Unnamed: 0,delivery_quality,dem_quality,healthy_life_exp_birth,infant_mortality,life_exp_60,pos_affect,social_support
0,-1.5,-1,55,70,16,0.55,0.55


In [13]:
DecisionTree.predict(X_new)

array([ 1.])

In [14]:
DecisionTree.predict_proba(X_new)

array([[ 0.26923077,  0.73076923]])

## Evaluate model on full dataset

In [15]:
predictions = DecisionTree.predict(X)
df_true_vs_predicted = pd.DataFrame({'ActualHappiness': y, 'PredictedHappiness': predictions})
df_true_vs_predicted

Unnamed: 0,ActualHappiness,PredictedHappiness
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
5,0.0,0.0
6,0.0,0.0
7,0.0,0.0
8,0.0,0.0
9,0.0,0.0


In [16]:
print("Accuracy: ", metrics.accuracy_score(y, predictions))
print("Confusion matrix: \n", metrics.confusion_matrix(y, predictions))
print("Classification report:\n ", metrics.classification_report(y, predictions))

Accuracy:  0.864583333333
Confusion matrix: 
 [[630 105]
 [ 77 532]]
Classification report:
               precision    recall  f1-score   support

        0.0       0.89      0.86      0.87       735
        1.0       0.84      0.87      0.85       609

avg / total       0.87      0.86      0.86      1344



## Evaluate on hold-out data
## Single stratified train/test split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print("Training data:\n", pd.concat([X_train, y_train], axis=1))
print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

Training data:
       social_support  healthy_life_exp_birth  pos_affect  dem_quality  \
312         0.911363               70.303185    0.750774     1.010939   
32          0.889073               66.694588    0.840048     0.251968   
222         0.818844               43.786251    0.591423    -1.334058   
290         0.796392               66.671043    0.606828     0.542446   
518         0.675075               59.177948    0.773841    -0.534819   
774         0.910142               65.589157    0.747760     0.877134   
321         0.956344               70.612503    0.832483     1.238178   
188         0.697164               56.272232    0.774445    -0.714120   
1041        0.708427               42.724106    0.520902    -0.258655   
388         0.940869               70.978622    0.768957     1.479439   
597         0.899034               63.378037    0.644645    -0.569441   
859         0.954921               71.614304    0.817431     1.461678   
293         0.751262               

In [18]:
# Refit the model on the training set only
DecisionTree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [19]:
# Predict on the hold-out test set
predictions_test = DecisionTree.predict(X_test)
df_true_vs_predicted_test = pd.DataFrame({'ActualHappiness': y_test, 'PredictedHappiness': predictions_test})

df_true_vs_predicted_test

Unnamed: 0,ActualHappiness,PredictedHappiness
1197,1.0,0.0
1007,0.0,0.0
251,1.0,1.0
1242,0.0,1.0
1140,1.0,1.0
527,0.0,0.0
77,1.0,1.0
828,0.0,0.0
1105,1.0,1.0
857,1.0,1.0


In [20]:
print("Accuracy: ", metrics.accuracy_score(y_test, predictions_test))
print("Confusion matrix: \n", metrics.confusion_matrix(y_test, predictions_test))
print("Classification report:\n ", metrics.classification_report(y_test, predictions_test))

Accuracy:  0.841584158416
Confusion matrix: 
 [[188  31]
 [ 33 152]]
Classification report:
               precision    recall  f1-score   support

        0.0       0.85      0.86      0.85       219
        1.0       0.83      0.82      0.83       185

avg / total       0.84      0.84      0.84       404



## Cross-validation

In [21]:
# Evaluate the model using 3-fold cross-validation.
# Uses 2 parts of data for training and the last part for testing. 
# This process is repeated 3 times. 
for i in range (1, 10):
    scores_dtc = cross_val_score(DecisionTreeClassifier(max_depth=i, random_state=3), X, y, scoring='accuracy', cv=5)
    print("max_depth = ", i)
    print("scores: ", scores_dtc)
    print("Average score: ", scores_dtc.mean())

max_depth =  1
scores:  [ 0.8401487   0.78066914  0.76208178  0.76208178  0.67164179]
Average score:  0.763324640737
max_depth =  2
scores:  [ 0.86988848  0.7732342   0.82899628  0.79553903  0.70895522]
Average score:  0.795322643289
max_depth =  3
scores:  [ 0.88847584  0.7732342   0.76951673  0.80297398  0.78731343]
Average score:  0.804302835266
max_depth =  4
scores:  [ 0.88847584  0.79553903  0.76951673  0.80297398  0.6641791 ]
Average score:  0.784136936137
max_depth =  5
scores:  [ 0.8401487   0.79553903  0.78066914  0.79553903  0.73134328]
Average score:  0.788647838873
max_depth =  6
scores:  [ 0.84386617  0.78810409  0.79925651  0.75836431  0.75373134]
Average score:  0.78866448427
max_depth =  7
scores:  [ 0.85873606  0.77695167  0.73605948  0.7732342   0.74626866]
Average score:  0.778250013871
max_depth =  8
scores:  [ 0.82156134  0.75464684  0.79925651  0.76579926  0.72014925]
Average score:  0.77228263885
max_depth =  9
scores:  [ 0.82899628  0.73605948  0.79925651  0.73

According the compared results above, it looks that max_depth=3 has the best cross-valadation score. Thus, we will choose max_depth=3 for random forest model training.

# Random Forest
## Train a random forest classifier model

In [22]:
RandomForest = RandomForestClassifier(n_estimators=100, max_depth=3, max_features='auto', oob_score=True, random_state=1)

In [23]:
# Fit model on full dataset
RandomForest.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=1, verbose=0, warm_start=False)

## Feature importance from the RFC model

In [24]:
# Compute the importance of each feature based on the trained random forest classifier
# RF finds different feature importance than the single decision tree
# It finds both Size and Floor to be important features
# A downside is that we cannot interpret the RF with 100 trees as we could by looking at a single decision tree
# So this table is what we can use to interpret the trained model
pd.DataFrame({'feature': X.columns, 'importance':RandomForest.feature_importances_})

Unnamed: 0,feature,importance
0,social_support,0.255337
1,healthy_life_exp_birth,0.175517
2,pos_affect,0.057946
3,dem_quality,0.035078
4,delivery_quality,0.133308
5,life_exp_60,0.224747
6,infant_mortality,0.118067


## Predict using trained random forest model

In [25]:
# Predicted probabilities for all examples. 
# The output is a pair for each example, 
# The first component is the probability of the UNHAPPY class (class 0).
# The second component is the probability of the HAPPY class (class 1).
RandomForest.predict_proba(X)

array([[ 0.95216454,  0.04783546],
       [ 0.95270026,  0.04729974],
       [ 0.95270026,  0.04729974],
       ..., 
       [ 0.95493864,  0.04506136],
       [ 0.94803442,  0.05196558],
       [ 0.94317263,  0.05682737]])

In [26]:
# Predicted class labels for all examples, 
# using the trained model, on in-sample data (same sample used for training and test)
RandomForest_predictions = RandomForest.predict(X)
df_true_vs_rfc_predicted = pd.DataFrame({'ActualClass': y, 'PredictedClass': RandomForest_predictions})
df_true_vs_rfc_predicted

Unnamed: 0,ActualClass,PredictedClass
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
5,0.0,0.0
6,0.0,0.0
7,0.0,0.0
8,0.0,0.0
9,0.0,0.0


In [27]:
print("Accuracy: ", metrics.accuracy_score(y, RandomForest_predictions))
print("Confusion matrix: \n", metrics.confusion_matrix(y, RandomForest_predictions))
print("Classification report:\n ", metrics.classification_report(y, RandomForest_predictions))

Accuracy:  0.871279761905
Confusion matrix: 
 [[664  71]
 [102 507]]
Classification report:
               precision    recall  f1-score   support

        0.0       0.87      0.90      0.88       735
        1.0       0.88      0.83      0.85       609

avg / total       0.87      0.87      0.87      1344



## Evaluate on hold-out data
## Single stratified train/test split

In [28]:
RandomForest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=1, verbose=0, warm_start=False)

In [29]:
# Predicted class labels for all examples, 
# using the trained model, on in-sample data (same sample used for training and test)
RandomForest_predictions_test = RandomForest.predict(X_test)
df_true_vs_rfc_predicted_test = pd.DataFrame({'ActualClass': y_test, 'PredictedClass': RandomForest_predictions_test})
df_true_vs_rfc_predicted_test

Unnamed: 0,ActualClass,PredictedClass
1197,1.0,0.0
1007,0.0,0.0
251,1.0,1.0
1242,0.0,1.0
1140,1.0,1.0
527,0.0,0.0
77,1.0,1.0
828,0.0,0.0
1105,1.0,1.0
857,1.0,1.0


In [30]:
print("Accuracy: ", metrics.accuracy_score(y_test, RandomForest_predictions_test))
print("Confusion matrix: \n", metrics.confusion_matrix(y_test, RandomForest_predictions_test))
print("Classification report:\n ", metrics.classification_report(y_test, RandomForest_predictions_test))

Accuracy:  0.851485148515
Confusion matrix: 
 [[186  33]
 [ 27 158]]
Classification report:
               precision    recall  f1-score   support

        0.0       0.87      0.85      0.86       219
        1.0       0.83      0.85      0.84       185

avg / total       0.85      0.85      0.85       404



## Cross-validation

In [31]:
# Evaluate the model using 3-fold cross-validation.
# Uses 2 parts of data for training and the last part for testing. 
# This process is repeated 3 times. 
scores = cross_val_score(RandomForestClassifier(n_estimators=100, max_depth=3, max_features='auto', oob_score=True, random_state=1), X, y, scoring='accuracy', cv=3)
print(scores)
print(scores.mean())

[ 0.87946429  0.81473214  0.84821429]
0.847470238095
