# Task 2. Predictive Modeling Using Decision Trees

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from dm_tools import data_prep
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import pydot
from io import StringIO
from sklearn.tree import export_graphviz

# Load the prepared organics dataset via the project helper.
df2 = data_prep()

# Keep only the numeric columns: the categorical ones are dropped here
# (rather than encoded) before the frame is handed to the decision tree.
categorical_cols = ['GENDER', 'AGEGRP1', 'AGEGRP2', 'TV_REG', 'NGROUP', 'REGION', 'CLASS']
df3 = df2.drop(categorical_cols, axis=1)

# Sanity check: remaining columns, dtypes and non-null counts.
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 6 columns):
AGE         22223 non-null float64
ORGANICS    22223 non-null int64
BILL        22223 non-null float64
ORGYN       22223 non-null int64
AFFL        22223 non-null float64
LTIME       22223 non-null float64
dtypes: float64(4), int64(2)
memory usage: 1.0 MB


In [18]:
# Target / input split: ORGANICS is the class label, every other remaining
# column is used as a model input.
# NOTE(review): ORGYN stays in the inputs and dominates the feature
# importances reported further down; it looks like a flag derived from
# ORGANICS -- confirm this is not target leakage.
y = df3['ORGANICS']
x = df3.drop(['ORGANICS'], axis=1)

# Fixed seed so the split and the fitted tree are reproducible.
rs = 0

# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# .values returns the same NumPy array on both old and new pandas.
x_mat = x.values

# 70/30 split, stratified on the target so both splits keep the same
# class proportions.
x_train, x_test, y_train, y_test = train_test_split(x_mat, y, test_size=0.3, stratify=y, random_state=rs)

# Baseline: a decision tree with default settings (no depth or leaf-size
# limits), used as the reference point for the tuned models.
model = DecisionTreeClassifier(random_state=rs)
model.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [19]:
# Report mean accuracy of the current model on each split; a large gap
# between the two indicates overfitting.
for split_label, features, labels in (("Train", x_train, y_train),
                                      ("Test", x_test, y_test)):
    print(split_label + " accuracy:", model.score(features, labels))

Train accuracy: 0.9956929802005657
Test accuracy: 0.9548522573871306


In [20]:
# Per-class precision / recall / F1 of the current model on the test split.
y_pred = model.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      5015
          1       0.90      0.90      0.90      1388
          2       0.37      0.40      0.38       215
          3       0.45      0.41      0.43        49

avg / total       0.96      0.95      0.96      6667



In [21]:
# Importance scores come from the fitted model; the matching column names
# come from the original input frame `x`.
importances = model.feature_importances_
feature_names = x.columns

# argsort is ascending, so reverse it to rank the most important feature
# first, then keep the top 5 (drop the final slice to print everything).
indices = np.argsort(importances)[::-1][:5]

for name, score in zip(feature_names[indices], importances[indices]):
    print(name, ':', score)

ORGYN : 0.8348832041644922
AFFL : 0.0574525910005876
AGE : 0.04974334620403602
LTIME : 0.029945542285722198
BILL : 0.02797531634516191


# GridSearchCV

In [22]:
# Hyperparameter grid explored by the search: split criterion, maximum
# tree depth (2..6) and minimum samples per leaf (20, 30, 40, 50).
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(2, 7),
    'min_samples_leaf': range(20, 60, 10),
}

# Exhaustive search with 10-fold cross-validation on the training split.
base_tree = DecisionTreeClassifier(random_state=rs)
cv = GridSearchCV(estimator=base_tree, param_grid=params, cv=10)
cv.fit(x_train, y_train)

# Accuracy of the best estimator found by the search on both splits.
print("Train accuracy:", cv.score(x_train, y_train))
print("Test accuracy:", cv.score(x_test, y_test))

# Per-class report for the best estimator on the held-out test split.
y_pred = cv.predict(x_test)
print(classification_report(y_test, y_pred))

# Hyperparameter combination selected by the search.
print(cv.best_params_)

Train accuracy: 0.9685008999742865
Test accuracy: 0.967301634918254
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      5015
          1       0.89      0.98      0.93      1388
          2       0.73      0.17      0.28       215
          3       0.55      0.71      0.62        49

avg / total       0.96      0.97      0.96      6667

{'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 20}


In [23]:
# Refit a tree with hand-picked hyperparameters.
# NOTE(review): these values differ from the best_params_ printed by the
# grid search above (entropy / max_depth 4 / min_samples_leaf 20) --
# confirm the manual choice is intentional and not tuned on the test split.
tuned_params = {'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 40}
model = DecisionTreeClassifier(random_state=rs, **tuned_params)
model.fit(x_train, y_train)

for split_label, features, labels in (("Train", x_train, y_train),
                                      ("Test", x_test, y_test)):
    print(split_label + " accuracy:", model.score(features, labels))

Train accuracy: 0.9689508871175109
Test accuracy: 0.9691015449227539


In [24]:
# Visualise the current tree: export it as Graphviz DOT text into an
# in-memory buffer, parse that with pydot, and render it to a PNG file.
dotfile = StringIO()
export_graphviz(model, out_file=dotfile, feature_names=x.columns)

dot_source = dotfile.getvalue()
graph = pydot.graph_from_dot_data(dot_source)  # a list of parsed graphs

# Left as the final expression so the notebook displays the return value
# (True when the file was written successfully).
tree_graph = graph[0]
tree_graph.write_png("Case1-task2-v2.png")