# Task 2. Predictive Modeling Using Decision Trees

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from dm_tools import data_prep
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import pydot
from io import StringIO
from sklearn.tree import export_graphviz

# read the organics dataset

df2 = data_prep()
#Dropping all categorical values because decision trees takes just numeric values
#Drop ORGYN because is it represent the same informastion as ORGANICS
#df3 = df2.drop(['ORGANICS'], axis=1) <--- Have placed this in the code

df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 37 columns):
AGE                    22223 non-null float64
ORGANICS               22223 non-null int64
ORGYN                  22223 non-null int64
AFFL                   22223 non-null float64
LTIME                  22223 non-null float64
GENDER_F               22223 non-null uint8
GENDER_M               22223 non-null uint8
GENDER_U               22223 non-null uint8
TV_REG_Border          22223 non-null uint8
TV_REG_C Scotland      22223 non-null uint8
TV_REG_East            22223 non-null uint8
TV_REG_London          22223 non-null uint8
TV_REG_Midlands        22223 non-null uint8
TV_REG_N East          22223 non-null uint8
TV_REG_N Scot          22223 non-null uint8
TV_REG_N West          22223 non-null uint8
TV_REG_S & S East      22223 non-null uint8
TV_REG_S West          22223 non-null uint8
TV_REG_Ulster          22223 non-null uint8
TV_REG_Wales & West    22223 non-null uint8
TV_RE

In [2]:
#hot-encoding, categorical to numbers, placed it in the data_prep function
#df4 = pd.get_dummies(df3)

In [3]:
# target/input split
#Decision tree can only be used with int values
y = df2['ORGYN']
x = df2.drop(['ORGYN'], axis=1)

# setting random state
rs = 10

x_mat = x.as_matrix()
x_train, x_test, y_train, y_test = train_test_split(x_mat, y, test_size=0.3, stratify=y, random_state=rs)

# simple decision tree training
model = DecisionTreeClassifier(random_state=rs)
#model = DecisionTreeClassifier(criterion='gini', random_state=rs, max_depth=2, min_samples_leaf=20)
model.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=10,
            splitter='best')

In [4]:
print("Train accuracy:", model.score(x_train, y_train))
print("Test accuracy:", model.score(x_test, y_test))

Train accuracy: 1.0
Test accuracy: 1.0


In [5]:
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      5015
          1       1.00      1.00      1.00      1652

avg / total       1.00      1.00      1.00      6667



In [6]:
clf = model.fit(x_train, y_train)
treeObj = clf.tree_
print(treeObj.node_count)

dotfile = StringIO()
export_graphviz(model, out_file=dotfile, feature_names=x.columns)
graph = pydot.graph_from_dot_data(dotfile.getvalue())
graph[0].write_png("test_6.png") # saved in the following file - will return True if successful

3


In [15]:
children_left = model.tree_.children_left
children_right = model.tree_.children_right

print(model.tree_)
print(np.count_nonzero((children_left == -1))
print(np.count_nonzero(children_right == -1))


<sklearn.tree._tree.Tree object at 0x105af5ed0>
2
2


In [63]:
# grab feature importances from the model and feature name from the original X
importances = model.feature_importances_
feature_names = x.columns

# sort them out in descending order
indices = np.argsort(importances)
indices = np.flip(indices, axis=0)

# limit to 20 features, you can leave this out to print out everything
indices = indices[:5]

for i in indices:
    print(feature_names[i], ':', importances[i])

AGE : 0.30247520278256285
AFFL : 0.15931629607578177
LTIME : 0.12873886240743665
GENDER_F : 0.04475206326727791
NGROUP_C : 0.026306389394501743


# GridSearchCV

In [64]:
# grid search CV
params = {'criterion': ['gini', 'entropy'],
          'max_depth': range(2, 7),
          'min_samples_leaf': range(20, 60, 10)}

cv = GridSearchCV(param_grid=params, estimator=DecisionTreeClassifier(random_state=rs), cv=10)
cv.fit(x_train, y_train)

print("Train accuracy:", cv.score(x_train, y_train))
print("Test accuracy:", cv.score(x_test, y_test))

# test the best model
y_pred = cv.predict(x_test)
print(classification_report(y_test, y_pred))

# print parameters of the best model
print(cv.best_params_)

Train accuracy: 0.8131267678066341
Test accuracy: 0.8116094195290235
             precision    recall  f1-score   support

          0       0.83      0.94      0.88      5015
          1       0.70      0.41      0.52      1652

avg / total       0.80      0.81      0.79      6667

{'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 20}


In [66]:
model = DecisionTreeClassifier(criterion='entropy', random_state=rs, max_depth=4, min_samples_leaf=20)
model.fit(x_train, y_train)

print("Train accuracy:", model.score(x_train, y_train))
print("Test accuracy:", model.score(x_test, y_test))

Train accuracy: 0.8131267678066341
Test accuracy: 0.8116094195290235


In [41]:
# visualize
dotfile = StringIO()
export_graphviz(model, out_file=dotfile, feature_names=x.columns)
graph = pydot.graph_from_dot_data(dotfile.getvalue())
graph[0].write_png("Case1-task2-rs10.png") # saved in the following file - will return True if successful

In [69]:
# grab feature importances from the model and feature name from the original X
importances = model.feature_importances_
feature_names = x.columns

# sort them out in descending order
indices = np.argsort(importances)
indices = np.flip(indices, axis=0)

# limit to 20 features, you can leave this out to print out everything
indices = indices[:5]

for i in indices:
    print(feature_names[i], ':', importances[i])

AGE : 0.4604840471690019
AFFL : 0.3459657735007613
GENDER_F : 0.1849788387302646
GENDER_M : 0.008571340599972204
TV_REG_London : 0.0


In [70]:
dotfile = StringIO()
export_graphviz(model, out_file=dotfile, feature_names=x.columns)
graph = pydot.graph_from_dot_data(dotfile.getvalue())
graph[0].write_png("best.png") # saved in the following file - will return True if successful