# Implementing a decision tree with scikit-learn

In [1]:
import pandas as pd
import numpy as np

In [2]:
data.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
data['Species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

# Splitting the predictor and target variables

In [4]:
colnames=data.columns.values.tolist()
predictors=colnames[:4]
target=colnames[4]

# Splitting the dataset into train and test variables

In [8]:
data['is_train'] = np.random.uniform(0, 1, len(data)) <= .75
train, test = data[data['is_train']==True], data[data['is_train']==False]

# Creating and fitting a Decision Tree

In [10]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion='entropy',min_samples_split=20, random_state=99)
dt.fit(train[predictors], train[target])

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, presort=False, random_state=99,
            splitter='best')

# Predicting the values using the decision tree

In [11]:
preds=dt.predict(test[predictors])
pd.crosstab(test['Species'],preds,rownames=['Actual'],colnames=['Predictions'])

Predictions,setosa,versicolor,virginica
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,9,0,0
versicolor,0,10,0
virginica,0,3,10


# Visualising the Decision Tree

In [12]:
from sklearn.tree import export_graphviz
with open('E:/dtree2.dot', 'w') as dotfile:
    export_graphviz(dt, out_file = dotfile, feature_names = predictors)
dotfile.close()

In [15]:
from os import system
system("dot -Tpng /E:/dtree2.dot -o /E:/New folder/dtree2.png")


1

# Cross Validating and Pruning the Decision Tree

In [18]:
X=data[predictors]
Y=data[target]
dt1 = DecisionTreeClassifier(criterion='entropy',max_depth=5, min_samples_split=20, random_state=99)
dt1.fit(X,Y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, presort=False, random_state=99,
            splitter='best')

In [19]:
from sklearn.cross_validation import KFold
crossvalidation = KFold(n=X.shape[0], n_folds=10, shuffle=True, random_state=1)



In [20]:
from sklearn.cross_validation import cross_val_score
score = np.mean(cross_val_score(dt1, X, Y, scoring='accuracy', cv=crossvalidation, n_jobs=1))
score

0.93333333333333335

# Feature importance of the tree

In [21]:
dt1.feature_importances_

array([ 0.        ,  0.        ,  0.66869158,  0.33130842])