# Task 2. Predictive Modeling Using Decision Trees

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from dm_tools import data_prep
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import pydot
from io import StringIO
from sklearn.tree import export_graphviz

# Load the prepared organics dataset through the project's data_prep() helper.
df2 = data_prep()

# ORGYN stays in the frame as the modelling target; ORGANICS is removed.
# The original note says the two columns carry the same information, so
# keeping ORGANICS would presumably leak the target -- TODO confirm against
# the data dictionary.
# Categorical columns are NOT dropped here: they remain as object dtype
# (see the info() output) and are one-hot encoded in the next cell.
df3 = df2.drop('ORGANICS', axis=1)

df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 9 columns):
GENDER    22223 non-null object
AGE       22223 non-null float64
TV_REG    21758 non-null object
NGROUP    21549 non-null object
REGION    21758 non-null object
CLASS     22223 non-null object
ORGYN     22223 non-null int64
AFFL      22223 non-null float64
LTIME     22223 non-null float64
dtypes: float64(3), int64(1), object(5)
memory usage: 1.5+ MB


In [3]:
# One-hot encode the categorical (object dtype) columns into 0/1 indicator
# columns; numeric columns pass through unchanged.
# NOTE(review): get_dummies defaults to dummy_na=False, so rows with a missing
# category get all-zero indicators for that feature -- confirm that is the
# intended treatment of the nulls in TV_REG / NGROUP / REGION.
df4 = pd.get_dummies(data=df3)

In [4]:
# Target/input split.
# ORGYN is the target column; every other column of the one-hot encoded frame
# is an input feature. scikit-learn trees need numeric inputs (not only ints),
# which is why the categorical columns were encoded beforehand.
y = df4['ORGYN']
x = df4.drop(['ORGYN'], axis=1)

# Single seed shared by the split and the estimators so runs are reproducible.
rs = 0

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same NumPy array on every pandas version.
x_mat = x.values
# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits.
x_train, x_test, y_train, y_test = train_test_split(x_mat, y, test_size=0.3, stratify=y, random_state=rs)

# Baseline: a decision tree with default (unconstrained) hyperparameters.
model = DecisionTreeClassifier(random_state=rs)
model.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [5]:
# Accuracy on the data the tree was fitted on vs. the held-out split;
# a large gap between the two indicates overfitting.
train_acc = model.score(x_train, y_train)
test_acc = model.score(x_test, y_test)
print("Train accuracy:", train_acc)
print("Test accuracy:", test_acc)

Train accuracy: 0.9974286448958601
Test accuracy: 0.7304634768261586


In [6]:
# Per-class precision / recall / F1 of the current model on the test split.
y_pred = model.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

             precision    recall  f1-score   support

          0       0.82      0.82      0.82      5015
          1       0.46      0.47      0.46      1652

avg / total       0.73      0.73      0.73      6667



In [7]:
# Pair the fitted tree's importance scores with the column names of the
# original input frame (the model itself only saw a bare NumPy matrix).
importances = model.feature_importances_
feature_names = x.columns

# argsort is ascending, so reverse it to get most-important-first, then keep
# only the top 5 entries (remove the final slice to list every feature).
indices = np.argsort(importances)[::-1][:5]

for name, score in zip(feature_names[indices], importances[indices]):
    print(name, ':', score)

AGE : 0.2836303468142094
AFFL : 0.1565937960439934
LTIME : 0.14091070264420394
GENDER_F : 0.04592557415720339
NGROUP_C : 0.024730850805792546


# GridSearchCV

In [8]:
# Exhaustive hyperparameter search for the decision tree, scored with
# 10-fold cross-validation on the training split.
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(2, 7),                # depths 2..6
    'min_samples_leaf': range(20, 60, 10),   # 20, 30, 40, 50
}

base_tree = DecisionTreeClassifier(random_state=rs)
cv = GridSearchCV(estimator=base_tree, param_grid=params, cv=10)
cv.fit(x_train, y_train)

# score()/predict() on the fitted search object use the best parameter
# combination found (GridSearchCV refits it by default).
train_acc = cv.score(x_train, y_train)
test_acc = cv.score(x_test, y_test)
print("Train accuracy:", train_acc)
print("Test accuracy:", test_acc)

# Per-class report for the best model on the held-out split.
y_pred = cv.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

# Winning hyperparameter combination.
print(cv.best_params_)

Train accuracy: 0.815826690665981
Test accuracy: 0.8173091345432728
             precision    recall  f1-score   support

          0       0.85      0.92      0.88      5015
          1       0.68      0.49      0.57      1652

avg / total       0.81      0.82      0.81      6667

{'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 30}


In [9]:
# Refit a single depth-limited tree with hand-picked hyperparameters.
# NOTE(review): these differ from the best_params_ printed by the grid search
# (criterion='entropy', min_samples_leaf=30) -- confirm this is intentional.
tree_settings = dict(criterion='gini', max_depth=6, min_samples_leaf=40)
model = DecisionTreeClassifier(random_state=rs, **tree_settings)
model.fit(x_train, y_train)

train_acc = model.score(x_train, y_train)
test_acc = model.score(x_test, y_test)
print("Train accuracy:", train_acc)
print("Test accuracy:", test_acc)

Train accuracy: 0.81544098740036
Test accuracy: 0.8134093295335233


In [10]:
# Render the current tree as a PNG: export it to Graphviz DOT text in an
# in-memory buffer, parse that with pydot, and write the image to disk.
dotfile = StringIO()
export_graphviz(model, out_file=dotfile, feature_names=x.columns)
dot_source = dotfile.getvalue()
graph = pydot.graph_from_dot_data(dot_source)
# graph_from_dot_data returns a list of graphs; the tree is the first entry.
tree_graph = graph[0]
# write_png returns True on success, which the notebook echoes as cell output.
tree_graph.write_png("Case1-task2-v5.png")