# Tree & RandomForest Model (Jiali Yu, Peng Ye)

In [1]:
# Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import tree
from sklearn.cross_validation import cross_val_score
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



In [2]:
# Read csv file into a dataframe. We use the output dataframe file produce
# by previous model training (Logistic Regression). Thus 'happines_class' has 
# already been classfied with class=0 (happiness_score <= 5.0) and class=1
# (happiness_score > 5.0)
df = pd.read_csv('processed_data/happiness_class_data.csv' , keep_default_na=True, sep=',\s+', delimiter=',', skipinitialspace=True)

In [3]:
df.head()

Unnamed: 0,happiness_class,social_support,healthy_life_exp_birth,pos_affect,dem_quality,delivery_quality,life_exp_60,infant_mortality
0,0.0,0.450662,49.209663,0.517637,-1.92969,-1.655084,15.6,70.8
1,0.0,0.552308,49.624432,0.583926,-2.044093,-1.635025,15.7,68.2
2,0.0,0.539075,50.008961,0.618265,-1.99181,-1.617176,15.7,65.7
3,0.0,0.521104,50.367298,0.611387,-1.919018,-1.616221,15.8,63.3
4,0.0,0.520637,50.709263,0.710385,-1.842996,-1.404078,15.8,61.0


In [4]:
# Print the feature types in our dataset.
df.dtypes

happiness_class           float64
social_support            float64
healthy_life_exp_birth    float64
pos_affect                float64
dem_quality               float64
delivery_quality          float64
life_exp_60               float64
infant_mortality          float64
dtype: object

# Decision Tree Classifier
## Train a decision tree classifier model

In [5]:
# X Descriptive features (6 continous features)
# y Target feature (happiness_class)
X = pd.concat([df[['social_support',
                   'healthy_life_exp_birth',
                   'pos_affect', 'dem_quality',
                   'delivery_quality',
                   'life_exp_60',
                   'infant_mortality']]], axis=1)
y = df['happiness_class']
X = X.fillna(X.mean())
y = y.fillna(y.mean())

In [6]:
# Instantiate sklearn estimator, fit with training set.
# Train a classification tree with max_depth=5 as the init example on all data.
DecisionTree = tree.DecisionTreeClassifier(max_depth=5, random_state=1)
DecisionTree.fit(X, y)
print(DecisionTree)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')


In [7]:
# Check the type of this object
type(DecisionTree)

sklearn.tree.tree.DecisionTreeClassifier

## Feature importance from the DTC model

In [8]:
# Compute the importance of each feature based on the trained decision tree classifier
# We can see that social_support (0.629885)is most important on the trained decision tree classifier 
pd.DataFrame({'feature': X.columns, 'importance': DecisionTree.feature_importances_})

Unnamed: 0,feature,importance
0,social_support,0.552771
1,healthy_life_exp_birth,0.137231
2,pos_affect,0.069865
3,dem_quality,0.021443
4,delivery_quality,0.053155
5,life_exp_60,0.117741
6,infant_mortality,0.047794


In [9]:
with open("happiness.dot", 'w') as f:
    f = export_graphviz(DecisionTree, out_file=f, feature_names=X.columns)

In [10]:
# Alternative to print a tree in text format
import operator

def tree_print(Classifier, X):
    tlevel = _tree_rprint('', Classifier, X.columns, Classifier.classes_)
    print('<',end='')
    for i in range(3*tlevel - 2):
        print('-',end='')
    print('>')
    print('Tree Depth: ',tlevel)

def _tree_rprint(kword, Classifier, features, labels, node_index=0, tlevel_index=0):
    # Note: The DecisionTreeClassifier uses the Tree structure defined in:
    # 		github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx
    #       it is an array based tree implementation:

    # indent the nodes according to their tree level
    for i in range(tlevel_index):
        print('  |',end='')

	#  TODO: the following should use the TREE_LEAF constant defined in _tree.pyx
	#        instead of -1, not quite sure how to get at it from the tree user level
    if Classifier.tree_.children_left[node_index] == -1:  # indicates leaf
        print(kword, end=' ' if kword else '')
        # get the majority label
        count_list = Classifier.tree_.value[node_index, 0]
        max_index, max_value = max(enumerate(count_list), key=operator.itemgetter(1))
        max_label = labels[max_index]
        print(max_label)
        return tlevel_index
    
    else:
        # compute and print node label
        feature = features[Classifier.tree_.feature[node_index]]
        threshold = Classifier.tree_.threshold[node_index]
        print(kword, end=' ' if kword else '')
        print('if {} =< {}: '.format(feature, threshold))
        # recurse down the children
        left_index = Classifier.tree_.children_left[node_index]
        right_index = Classifier.tree_.children_right[node_index]
        ltlevel_index = _tree_rprint('then', Classifier, features, labels, left_index, tlevel_index+1)
        rtlevel_index = _tree_rprint('else', Classifier, features, labels, right_index, tlevel_index+1)
        # return the maximum depth of either one of the children
        return max(ltlevel_index,rtlevel_index)

In [11]:
tree_print(DecisionTree, X)

if social_support =< 0.8618887662887573: 
  |then if life_exp_60 =< 20.75: 
  |  |then if delivery_quality =< 0.8227246403694153: 
  |  |  |then if social_support =< 0.7908226251602173: 
  |  |  |  |then if delivery_quality =< -2.1355767250061035: 
  |  |  |  |  |then 1.0
  |  |  |  |  |else 0.0
  |  |  |  |else if infant_mortality =< 24.427440643310547: 
  |  |  |  |  |then 0.0
  |  |  |  |  |else 0.0
  |  |  |else if healthy_life_exp_birth =< 74.16291046142578: 
  |  |  |  |then 1.0
  |  |  |  |else if social_support =< 0.8516873121261597: 
  |  |  |  |  |then 0.0
  |  |  |  |  |else 1.0
  |  |else if pos_affect =< 0.7344286441802979: 
  |  |  |then if dem_quality =< -0.04730721563100815: 
  |  |  |  |then if social_support =< 0.8142119646072388: 
  |  |  |  |  |then 0.0
  |  |  |  |  |else 1.0
  |  |  |  |else if delivery_quality =< 0.38655751943588257: 
  |  |  |  |  |then 0.0
  |  |  |  |  |else 0.0
  |  |  |else if infant_mortality =< 15.149999618530273: 
  |  |  |  |then if soci

## Predict using trained decision tree

In [12]:
X.head()

Unnamed: 0,social_support,healthy_life_exp_birth,pos_affect,dem_quality,delivery_quality,life_exp_60,infant_mortality
0,0.450662,49.209663,0.517637,-1.92969,-1.655084,15.6,70.8
1,0.552308,49.624432,0.583926,-2.044093,-1.635025,15.7,68.2
2,0.539075,50.008961,0.618265,-1.99181,-1.617176,15.7,65.7
3,0.521104,50.367298,0.611387,-1.919018,-1.616221,15.8,63.3
4,0.520637,50.709263,0.710385,-1.842996,-1.404078,15.8,61.0


In [13]:
X_new = pd.DataFrame({'social_support': [0.55],
                      'healthy_life_exp_birth': [55],
                      'pos_affect': [0.55],
                      'dem_quality': [-1],
                      'delivery_quality': [-1.5],
                      'life_exp_60': [16],
                      'infant_mortality': [70]})
X_new.head()

Unnamed: 0,delivery_quality,dem_quality,healthy_life_exp_birth,infant_mortality,life_exp_60,pos_affect,social_support
0,-1.5,-1,55,70,16,0.55,0.55


In [14]:
DecisionTree.predict(X_new)

array([ 1.])

In [15]:
DecisionTree.predict_proba(X_new)

array([[ 0.,  1.]])

## Evaluate model on full dataset

In [16]:
predictions = DecisionTree.predict(X)
df_true_vs_predicted = pd.DataFrame({'ActualHappiness': y, 'PredictedHappiness': predictions})
#df_true_vs_predicted

In [17]:
print("Accuracy: ", metrics.accuracy_score(y, predictions))
print("Confusion matrix: \n", metrics.confusion_matrix(y, predictions))
print("Classification report:\n ", metrics.classification_report(y, predictions))

Accuracy:  0.894345238095
Confusion matrix: 
 [[686  49]
 [ 93 516]]
Classification report:
               precision    recall  f1-score   support

        0.0       0.88      0.93      0.91       735
        1.0       0.91      0.85      0.88       609

avg / total       0.90      0.89      0.89      1344



## Evaluate on hold-out data
## Single stratified train/test split

In [18]:
# Get train and test data set on 7:3
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [19]:
# Refit the model on the training set only
DecisionTree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [20]:
# Predict on the hold-out test set
predictions_test = DecisionTree.predict(X_test)
df_true_vs_predicted_test = pd.DataFrame({'ActualHappiness': y_test, 'PredictedHappiness': predictions_test})

In [21]:
# Try to get an initialized confusion matrix evaulation with test data set on the prediction
print("Accuracy: ", metrics.accuracy_score(y_test, predictions_test))
print("Confusion matrix: \n", metrics.confusion_matrix(y_test, predictions_test))
print("Classification report:\n ", metrics.classification_report(y_test, predictions_test))

Accuracy:  0.866336633663
Confusion matrix: 
 [[201  26]
 [ 28 149]]
Classification report:
               precision    recall  f1-score   support

        0.0       0.88      0.89      0.88       227
        1.0       0.85      0.84      0.85       177

avg / total       0.87      0.87      0.87       404



## Cross-validation

In [22]:
# Evaluate the model using 10-fold cross-validation.
# Uses 9 parts of data for training and the last part for testing. 
# This process is repeated 10 times. We just check with the average score
for i in range (1, 10):
    scores_dtc = cross_val_score(DecisionTreeClassifier(max_depth=i, random_state=3), X, y, scoring='accuracy', cv=10)
    scores_dtc2 = cross_val_score(DecisionTreeClassifier(max_depth=i, random_state=3), X, y, scoring='f1', cv=10)
    scores_dtc3 = cross_val_score(DecisionTreeClassifier(max_depth=i, random_state=3), X, y, scoring='precision', cv=10)
    scores_dtc4 = cross_val_score(DecisionTreeClassifier(max_depth=i, random_state=3), X, y, scoring='recall', cv=10)
    print("max_depth = ", i)
    print("Average accuracy: ", scores_dtc.mean())
    print("Average f1-score:  {}".format(scores_dtc2.mean()))
    print("Average precision: {}".format(scores_dtc3.mean()))
    print("Average recall:    {}".format(scores_dtc4.mean()))
    print("====="*10)

max_depth =  1
Average accuracy:  0.752074090699
Average f1-score:  0.6648746766386393
Average precision: 0.814874173966426
Average recall:    0.5920765027322404
max_depth =  2
Average accuracy:  0.774362647913
Average f1-score:  0.7008523971488989
Average precision: 0.8140506476401257
Average recall:    0.662568306010929
max_depth =  3
Average accuracy:  0.804292946296
Average f1-score:  0.7855944162308762
Average precision: 0.7932293559178565
Average recall:    0.8013661202185792
max_depth =  4
Average accuracy:  0.805830288823
Average f1-score:  0.7869936812940386
Average precision: 0.7855679339550553
Average recall:    0.801584699453552
max_depth =  5
Average accuracy:  0.82080641072
Average f1-score:  0.8000028931349206
Average precision: 0.8170672294203557
Average recall:    0.796639344262295
max_depth =  6
Average accuracy:  0.787807287705
Average f1-score:  0.7643208854579461
Average precision: 0.7809992323705855
Average recall:    0.7601366120218579
max_depth =  7
Average accu

According the compared results above, it looks that max_depth=5 has the best cross-valadation scores. Thus, we will choose max_depth=5 for random forest model training.

# Random Forest
## Train a random forest classifier model

In [23]:
RandomForest = RandomForestClassifier(n_estimators=100, max_depth=5, max_features='auto', oob_score=True, random_state=1)

In [24]:
# Fit model on full dataset
RandomForest.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=1, verbose=0, warm_start=False)

## Feature importance from the RFC model

In [25]:
# Compute the importance of each feature based on the trained random forest classifier
# RF finds different feature importance than the single decision tree
# It finds both Size and Floor to be important features
# A downside is that we cannot interpret the RF with 100 trees as we could by looking at a single decision tree
# So this table is what we can use to interpret the trained model
pd.DataFrame({'feature': X.columns, 'importance':RandomForest.feature_importances_})

Unnamed: 0,feature,importance
0,social_support,0.247351
1,healthy_life_exp_birth,0.175307
2,pos_affect,0.09984
3,dem_quality,0.054054
4,delivery_quality,0.121725
5,life_exp_60,0.191418
6,infant_mortality,0.110304


## Predict using trained random forest model

In [26]:
# Predicted probabilities for all examples. 
# The output is a pair for each example, 
# The first component is the probability of the UNHAPPY class (class 0).
# The second component is the probability of the HAPPY class (class 1).
RandomForest.predict_proba(X)

array([[ 0.95961162,  0.04038838],
       [ 0.97606331,  0.02393669],
       [ 0.97577205,  0.02422795],
       ..., 
       [ 0.98191252,  0.01808748],
       [ 0.98646919,  0.01353081],
       [ 0.98572764,  0.01427236]])

In [27]:
# Predicted class labels for all examples, 
# using the trained model, on in-sample data (same sample used for training and test)
RandomForest_predictions = RandomForest.predict(X)
df_true_vs_rfc_predicted = pd.DataFrame({'ActualClass': y, 'PredictedClass': RandomForest_predictions})

In [28]:
# Get the initialized confusion matrix evaluation on this model with train data set, the Accuracy is 0.915
print("Accuracy: ", metrics.accuracy_score(y, RandomForest_predictions))
print("Confusion matrix: \n", metrics.confusion_matrix(y, RandomForest_predictions))
print("Classification report:\n ", metrics.classification_report(y, RandomForest_predictions))

Accuracy:  0.915178571429
Confusion matrix: 
 [[684  51]
 [ 63 546]]
Classification report:
               precision    recall  f1-score   support

        0.0       0.92      0.93      0.92       735
        1.0       0.91      0.90      0.91       609

avg / total       0.92      0.92      0.92      1344



## Evaluate on hold-out data
## Single stratified train/test split

In [29]:
RandomForest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=1, verbose=0, warm_start=False)

In [30]:
# Predicted class labels for all examples, 
# using the trained model, on in-sample data (same sample used for training and test)
RandomForest_predictions_test = RandomForest.predict(X_test)
df_true_vs_rfc_predicted_test = pd.DataFrame({'ActualClass': y_test, 'PredictedClass': RandomForest_predictions_test})

In [31]:
# Get an initialized confusion matrix evaulation on this model with test data set. The accuracy is 0.893
print("Accuracy: ", metrics.accuracy_score(y_test, RandomForest_predictions_test))
print("Confusion matrix: \n", metrics.confusion_matrix(y_test, RandomForest_predictions_test))
print("Classification report:\n ", metrics.classification_report(y_test, RandomForest_predictions_test))

Accuracy:  0.908415841584
Confusion matrix: 
 [[213  14]
 [ 23 154]]
Classification report:
               precision    recall  f1-score   support

        0.0       0.90      0.94      0.92       227
        1.0       0.92      0.87      0.89       177

avg / total       0.91      0.91      0.91       404



## Cross-validation

In [32]:
# Evaluate the model using 10-fold cross-validation.
# Uses 9 parts of data for training and the last part for testing. 
# This process is repeated 10 times.
for i in range(2, 10):
    scores = cross_val_score(RandomForestClassifier(n_estimators=100, max_depth=i, max_features='auto', oob_score=True, random_state=1), X, y, scoring='precision', cv=10)
    
    print("max_depth = {}, Average precision: {}".format(i, scores.mean()))

max_depth = 2, Average precision: 0.8404014149557903
max_depth = 3, Average precision: 0.8480553654916323
max_depth = 4, Average precision: 0.8448730685168264
max_depth = 5, Average precision: 0.8363942428778618
max_depth = 6, Average precision: 0.8247230472118581
max_depth = 7, Average precision: 0.8289427573608851
max_depth = 8, Average precision: 0.8166602216766929
max_depth = 9, Average precision: 0.823107368543018


    Conclusion:
    In this section, we used the dataFrame file that was produced by Logistic Regession Model section, in which  'Happiess_class' has already been classified with class=0 (Happiness Score <= 5.0) and class=1 (Happiness Score > 5.0). Thus all input data set have been well prepared and we donot need to shape them any more.
    In terms of DTC (Decision Tree Classifier), by utilizing cross-validation evaluation, we can see that max_depth = 5 is the best choice to get most accuracy on prediction.
    However, with the same evaluation method, in Random Forest Classifier, it looks that when max_depth = 3, the precision score is highest (0.848). Thus choosing max_depth = 3 for Random Forest model traning could provide the better happiness prediction.