In [1]:
# Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import tree
from sklearn.cross_validation import cross_val_score
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



In [2]:
# Read csv file into a dataframe.
df = pd.read_csv('processed_data/happiness_class_data.csv' , keep_default_na=True, sep=',\s+', delimiter=',', skipinitialspace=True)

In [3]:
# Print the feature types in our dataset.
df.dtypes

happiness_class           float64
social_support            float64
healthy_life_exp_birth    float64
pos_affect                float64
dem_quality               float64
delivery_quality          float64
life_exp_60               float64
infant_mortality          float64
dtype: object

# Decision Tree Classifier
## Train a decision tree classifier model

In [4]:
X = pd.concat([df[['social_support', 'healthy_life_exp_birth', 'pos_affect', 'dem_quality', 'delivery_quality', 'life_exp_60', 'infant_mortality']]], axis=1)
y = df['happiness_class']
X = X.fillna(X.mean())
y = y.fillna(y.mean())
print("Descriptive features:\n", X)
print("\nTarget feature:\n", y)

Descriptive features:
       social_support  healthy_life_exp_birth  pos_affect  dem_quality  \
0           0.450662               49.209663    0.517637    -1.929690   
1           0.552308               49.624432    0.583926    -2.044093   
2           0.539075               50.008961    0.618265    -1.991810   
3           0.521104               50.367298    0.611387    -1.919018   
4           0.520637               50.709263    0.710385    -1.842996   
5           0.483552               51.042980    0.620585    -1.879709   
6           0.525568               51.370525    0.531691    -1.773257   
7           0.528597               51.693527    0.553553    -1.844364   
8           0.559072               52.016529    0.564953    -1.917693   
9           0.490880               52.339527    0.496349    -1.904737   
10          0.833047               67.103607    0.640024     0.048114   
11          0.733152               67.413696    0.647908    -0.033831   
12          0.759434        

In [11]:
# Instantiate sklearn estimator, fit with training set.
# Train a classification tree with max_depth=3 on all data.
DecisionTree = tree.DecisionTreeClassifier(max_depth=5, random_state=1)
DecisionTree.fit(X, y)
print(DecisionTree)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')


In [12]:
# Check the type of this object
type(DecisionTree)

sklearn.tree.tree.DecisionTreeClassifier

## Feature importance from the DTC model

In [13]:
# Compute the importance of each feature based on the trained decision tree classifier
pd.DataFrame({'feature': X.columns, 'importance': DecisionTree.feature_importances_})

Unnamed: 0,feature,importance
0,social_support,0.552771
1,healthy_life_exp_birth,0.137231
2,pos_affect,0.069865
3,dem_quality,0.021443
4,delivery_quality,0.053155
5,life_exp_60,0.117741
6,infant_mortality,0.047794


In [14]:
with open("happiness.dot", 'w') as f:
    f = export_graphviz(DecisionTree, out_file=f, feature_names=X.columns)

In [15]:
# Alternative to print a tree in text format
import operator

def tree_print(Classifier, X):
    tlevel = _tree_rprint('', Classifier, X.columns, Classifier.classes_)
    print('<',end='')
    for i in range(3*tlevel - 2):
        print('-',end='')
    print('>')
    print('Tree Depth: ',tlevel)

def _tree_rprint(kword, Classifier, features, labels, node_index=0, tlevel_index=0):
    # Note: The DecisionTreeClassifier uses the Tree structure defined in:
    # 		github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx
    #       it is an array based tree implementation:

    # indent the nodes according to their tree level
    for i in range(tlevel_index):
        print('  |',end='')

	#  TODO: the following should use the TREE_LEAF constant defined in _tree.pyx
	#        instead of -1, not quite sure how to get at it from the tree user level
    if Classifier.tree_.children_left[node_index] == -1:  # indicates leaf
        print(kword, end=' ' if kword else '')
        # get the majority label
        count_list = Classifier.tree_.value[node_index, 0]
        max_index, max_value = max(enumerate(count_list), key=operator.itemgetter(1))
        max_label = labels[max_index]
        print(max_label)
        return tlevel_index
    
    else:
        # compute and print node label
        feature = features[Classifier.tree_.feature[node_index]]
        threshold = Classifier.tree_.threshold[node_index]
        print(kword, end=' ' if kword else '')
        print('if {} =< {}: '.format(feature, threshold))
        # recurse down the children
        left_index = Classifier.tree_.children_left[node_index]
        right_index = Classifier.tree_.children_right[node_index]
        ltlevel_index = _tree_rprint('then', Classifier, features, labels, left_index, tlevel_index+1)
        rtlevel_index = _tree_rprint('else', Classifier, features, labels, right_index, tlevel_index+1)
        # return the maximum depth of either one of the children
        return max(ltlevel_index,rtlevel_index)

In [16]:
tree_print(DecisionTree, X)

if social_support =< 0.8618887662887573: 
  |then if life_exp_60 =< 20.75: 
  |  |then if delivery_quality =< 0.8227246403694153: 
  |  |  |then if social_support =< 0.7908226251602173: 
  |  |  |  |then if delivery_quality =< -2.1355767250061035: 
  |  |  |  |  |then 1.0
  |  |  |  |  |else 0.0
  |  |  |  |else if infant_mortality =< 24.427440643310547: 
  |  |  |  |  |then 0.0
  |  |  |  |  |else 0.0
  |  |  |else if healthy_life_exp_birth =< 74.16291046142578: 
  |  |  |  |then 1.0
  |  |  |  |else if social_support =< 0.8516873121261597: 
  |  |  |  |  |then 0.0
  |  |  |  |  |else 1.0
  |  |else if pos_affect =< 0.7344286441802979: 
  |  |  |then if dem_quality =< -0.04730721563100815: 
  |  |  |  |then if social_support =< 0.8142119646072388: 
  |  |  |  |  |then 0.0
  |  |  |  |  |else 1.0
  |  |  |  |else if delivery_quality =< 0.38655751943588257: 
  |  |  |  |  |then 0.0
  |  |  |  |  |else 0.0
  |  |  |else if infant_mortality =< 15.149999618530273: 
  |  |  |  |then if soci

## Predict using trained decision tree

In [11]:
X.head()

Unnamed: 0,social_support,healthy_life_exp_birth,pos_affect,dem_quality,delivery_quality,life_exp_60,infant_mortality
0,0.450662,49.209663,0.517637,-1.92969,-1.655084,15.6,70.8
1,0.552308,49.624432,0.583926,-2.044093,-1.635025,15.7,68.2
2,0.539075,50.008961,0.618265,-1.99181,-1.617176,15.7,65.7
3,0.521104,50.367298,0.611387,-1.919018,-1.616221,15.8,63.3
4,0.520637,50.709263,0.710385,-1.842996,-1.404078,15.8,61.0


In [12]:
X_new = pd.DataFrame({'social_support': [0.55],  'healthy_life_exp_birth': [55], 'pos_affect': [0.55], 'dem_quality': [-1], 'delivery_quality': [-1.5], 'life_exp_60': [16], 'infant_mortality': [70]})
X_new.head()

Unnamed: 0,delivery_quality,dem_quality,healthy_life_exp_birth,infant_mortality,life_exp_60,pos_affect,social_support
0,-1.5,-1,55,70,16,0.55,0.55


In [13]:
DecisionTree.predict(X_new)

array([ 1.])

In [14]:
DecisionTree.predict_proba(X_new)

array([[ 0.,  1.]])

## Evaluate model on full dataset

In [15]:
predictions = DecisionTree.predict(X)
df_true_vs_predicted = pd.DataFrame({'ActualHappiness': y, 'PredictedHappiness': predictions})
df_true_vs_predicted

Unnamed: 0,ActualHappiness,PredictedHappiness
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
5,0.0,0.0
6,0.0,0.0
7,0.0,0.0
8,0.0,0.0
9,0.0,0.0


In [16]:
print("Accuracy: ", metrics.accuracy_score(y, predictions))
print("Confusion matrix: \n", metrics.confusion_matrix(y, predictions))
print("Classification report:\n ", metrics.classification_report(y, predictions))

Accuracy:  0.966517857143
Confusion matrix: 
 [[717  18]
 [ 27 582]]
Classification report:
               precision    recall  f1-score   support

        0.0       0.96      0.98      0.97       735
        1.0       0.97      0.96      0.96       609

avg / total       0.97      0.97      0.97      1344



## Evaluate on hold-out data
## Single stratified train/test split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print("Training data:\n", pd.concat([X_train, y_train], axis=1))
print("\nTest data:\n", pd.concat([X_test, y_test], axis=1))

Training data:
       social_support  healthy_life_exp_birth  pos_affect  dem_quality  \
1077        0.787962               62.479115    0.748686    -0.138286   
716         0.871212               65.731339    0.638737    -0.308209   
156         0.912818               63.501060    0.832505     0.344150   
1104        0.929454               72.533165    0.752165     0.351598   
1105        0.949940               72.667786    0.724312     0.402250   
1019        0.756299               53.861347    0.672881    -0.180600   
374         0.917102               67.029716    0.680431     0.975284   
458         0.826492               63.381977    0.845866    -0.509763   
1239        0.845293               61.393940    0.583073    -0.121418   
625         0.714604               58.689838    0.788452    -0.756333   
1000        0.896151               62.978783    0.710230    -0.941901   
598         0.917989               63.514286    0.643089    -0.555909   
144         0.807705               

In [18]:
# Refit the model on the training set only
DecisionTree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [19]:
# Predict on the hold-out test set
predictions_test = DecisionTree.predict(X_test)
df_true_vs_predicted_test = pd.DataFrame({'ActualHappiness': y_test, 'PredictedHappiness': predictions_test})

df_true_vs_predicted_test

Unnamed: 0,ActualHappiness,PredictedHappiness
817,0.0,0.0
1027,0.0,1.0
723,0.0,0.0
560,1.0,1.0
1017,1.0,1.0
631,1.0,0.0
268,0.0,0.0
875,0.0,0.0
263,0.0,0.0
563,1.0,1.0


In [20]:
print("Accuracy: ", metrics.accuracy_score(y_test, predictions_test))
print("Confusion matrix: \n", metrics.confusion_matrix(y_test, predictions_test))
print("Classification report:\n ", metrics.classification_report(y_test, predictions_test))

Accuracy:  0.846534653465
Confusion matrix: 
 [[199  25]
 [ 37 143]]
Classification report:
               precision    recall  f1-score   support

        0.0       0.84      0.89      0.87       224
        1.0       0.85      0.79      0.82       180

avg / total       0.85      0.85      0.85       404



## Cross-validation

In [21]:
# Evaluate the model using 3-fold cross-validation.
# Uses 2 parts of data for training and the last part for testing. 
# This process is repeated 3 times. 
scores_dtc = cross_val_score(DecisionTreeClassifier(max_depth=3, random_state=1), X, y, scoring='accuracy', cv=3)
print(scores_dtc)
print(scores_dtc.mean())

[ 0.86830357  0.80357143  0.80580357]
0.825892857143


# Random Forest
## Train a random forest classifier model

In [22]:
RandomForest = RandomForestClassifier(n_estimators=100, max_features='auto', oob_score=True, random_state=1)

In [23]:
# Fit model on full dataset
RandomForest.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=1, verbose=0, warm_start=False)

## Feature importance from the RFC model

In [24]:
# Compute the importance of each feature based on the trained random forest classifier
# RF finds different feature importance than the single decision tree
# It finds both Size and Floor to be important features
# A downside is that we cannot interpret the RF with 100 trees as we could by looking at a single decision tree
# So this table is what we can use to interpret the trained model
pd.DataFrame({'feature': X.columns, 'importance':RandomForest.feature_importances_})

Unnamed: 0,feature,importance
0,social_support,0.22403
1,healthy_life_exp_birth,0.164234
2,pos_affect,0.112534
3,dem_quality,0.081569
4,delivery_quality,0.13312
5,life_exp_60,0.167382
6,infant_mortality,0.117133


## Predict using trained random forest model

In [25]:
# Predicted probabilities for all examples. 
# The output is a pair for each example, 
# The first component is the probability of the UNHAPPY class (class 0).
# The second component is the probability of the HAPPY class (class 1).
RandomForest.predict_proba(X)

array([[ 0.99,  0.01],
       [ 1.  ,  0.  ],
       [ 1.  ,  0.  ],
       ..., 
       [ 1.  ,  0.  ],
       [ 1.  ,  0.  ],
       [ 0.98,  0.02]])

In [26]:
# Predicted class labels for all examples, 
# using the trained model, on in-sample data (same sample used for training and test)
RandomForest_predictions = RandomForest.predict(X)
df_true_vs_rfc_predicted = pd.DataFrame({'ActualClass': y, 'PredictedClass': RandomForest_predictions})
df_true_vs_rfc_predicted

Unnamed: 0,ActualClass,PredictedClass
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
5,0.0,0.0
6,0.0,0.0
7,0.0,0.0
8,0.0,0.0
9,0.0,0.0


In [27]:
print("Accuracy: ", metrics.accuracy_score(y, RandomForest_predictions))
print("Confusion matrix: \n", metrics.confusion_matrix(y, RandomForest_predictions))
print("Classification report:\n ", metrics.classification_report(y, RandomForest_predictions))

Accuracy:  1.0
Confusion matrix: 
 [[735   0]
 [  0 609]]
Classification report:
               precision    recall  f1-score   support

        0.0       1.00      1.00      1.00       735
        1.0       1.00      1.00      1.00       609

avg / total       1.00      1.00      1.00      1344



## Evaluate on hold-out data
## Single stratified train/test split

In [28]:
RandomForest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=1, verbose=0, warm_start=False)

In [29]:
# Predicted class labels for all examples, 
# using the trained model, on in-sample data (same sample used for training and test)
RandomForest_predictions_test = RandomForest.predict(X_test)
df_true_vs_rfc_predicted_test = pd.DataFrame({'ActualClass': y_test, 'PredictedClass': RandomForest_predictions_test})
df_true_vs_rfc_predicted_test

Unnamed: 0,ActualClass,PredictedClass
817,0.0,0.0
1027,0.0,0.0
723,0.0,0.0
560,1.0,1.0
1017,1.0,1.0
631,1.0,0.0
268,0.0,0.0
875,0.0,0.0
263,0.0,0.0
563,1.0,1.0


In [30]:
print("Accuracy: ", metrics.accuracy_score(y_test, RandomForest_predictions_test))
print("Confusion matrix: \n", metrics.confusion_matrix(y_test, RandomForest_predictions_test))
print("Classification report:\n ", metrics.classification_report(y_test, RandomForest_predictions_test))

Accuracy:  0.90099009901
Confusion matrix: 
 [[209  15]
 [ 25 155]]
Classification report:
               precision    recall  f1-score   support

        0.0       0.89      0.93      0.91       224
        1.0       0.91      0.86      0.89       180

avg / total       0.90      0.90      0.90       404



## Cross-validation

In [31]:
# Evaluate the model using 3-fold cross-validation.
# Uses 2 parts of data for training and the last part for testing. 
# This process is repeated 3 times. 
scores = cross_val_score(RandomForestClassifier(n_estimators=100, max_features='auto', oob_score=True, random_state=1), X, y, scoring='accuracy', cv=3)
print(scores)
print(scores.mean())

[ 0.87946429  0.82589286  0.78571429]
0.830357142857
