In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset from CSV; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="../../data/diam.csv")

In [3]:
# Preview the first ten rows to see the raw column values.
data.head(n=10)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,target
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,dewevo
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,dewevo
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,dewevo
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,dewevo
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,dewevo
5,0.24,Very Good,J,VVS2,62.8,57.0,3.94,3.96,2.48,dewevo
6,0.24,Very Good,I,VVS1,62.3,57.0,3.95,3.98,2.47,dewevo
7,0.26,Very Good,H,SI1,61.9,55.0,4.07,4.11,2.53,dewevo
8,0.22,Fair,E,VS2,65.1,61.0,3.87,3.78,2.49,dewevo
9,0.23,Very Good,H,VS1,59.4,61.0,4.0,4.05,2.39,dewevo


In [4]:
# Summarise column dtypes, non-null counts and memory usage of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
carat      53940 non-null float64
cut        53940 non-null object
color      53940 non-null object
clarity    53940 non-null object
depth      53940 non-null float64
table      53940 non-null float64
x          53940 non-null float64
y          53940 non-null float64
z          53940 non-null float64
target     53940 non-null object
dtypes: float64(6), object(4)
memory usage: 4.1+ MB


In [5]:
# Remove the 'color' and 'clarity' columns; they are not used as features below.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op instead of raising KeyError
# because the columns are already gone.
data = data.drop(columns=['color', 'clarity'], errors='ignore')

In [6]:
# Single-row peek to confirm 'color' and 'clarity' are no longer present.
data.head(n=1)

Unnamed: 0,carat,cut,depth,table,x,y,z,target
0,0.23,Ideal,61.5,55.0,3.95,3.98,2.43,dewevo


In [7]:
# Encode the textual cut grade as an integer code (Fair=0 ... Ideal=4).
CUT_GRADE_CODES = {'Fair': 0, 'Good': 1, 'Very Good': 2, 'Premium': 3, 'Ideal': 4}

# Series.map with a plain dict turns every value that is not a key into NaN, so
# re-running the cell on the already-encoded column would silently wipe it.
# Falling back to the value itself makes the cell idempotent: integers produced by
# a previous run pass through unchanged.
data['cut'] = data['cut'].map(lambda grade: CUT_GRADE_CODES.get(grade, grade))

# Fail loudly here if any label was not covered by the mapping (or is missing),
# instead of letting it surface later as an opaque error during model fitting.
assert data['cut'].isin(list(CUT_GRADE_CODES.values())).all(), \
    "unexpected or missing value in 'cut' after encoding"

In [8]:
# Single-row peek to confirm 'cut' now holds its integer code.
data.head(n=1)

Unnamed: 0,carat,cut,depth,table,x,y,z,target
0,0.23,4,61.5,55.0,3.95,3.98,2.43,dewevo


In [9]:
# Labels to predict: the 'target' column.
y = data.loc[:, 'target']

In [10]:
# Feature matrix: every remaining column except the label.
X = data.drop(columns=['target'])

In [11]:
# Hold out 30% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

In [12]:
# (rows, columns) of the training and validation feature matrices.
X_train.shape, X_valid.shape

((37758, 7), (16182, 7))

In [13]:
# Label vector lengths; they should match the row counts of X_train / X_valid.
y_train.shape, y_valid.shape


((37758,), (16182,))

In [14]:
# Baseline decision tree with no depth limit; random_state fixes tie-breaking
# so results are reproducible.
tree = DecisionTreeClassifier(random_state = 17)

In [15]:
# 5-fold cross-validation of the unconstrained tree on the training split only;
# with no explicit `scoring`, the estimator's own score method is used.
cross_val_score(estimator=tree, X=X_train, y=y_train, cv=5)

array([0.93167373, 0.92703919, 0.93604343, 0.93008475, 0.92940397])

In [16]:
# Search grid: candidate tree depths 1 through 10 inclusive.
tree_params = dict(max_depth=np.arange(1, 11))

In [17]:
%%time
tree_grid = GridSearchCV(tree, tree_params, cv = 5, n_jobs=-1)

CPU times: user 28 µs, sys: 2 µs, total: 30 µs
Wall time: 35.8 µs


In [18]:
tree_grid.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [19]:
# The tree refit with the best parameters found by the search.
tree_grid.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best')

In [37]:
# Winning parameter setting and its mean cross-validated score.
tree_grid.best_params_, tree_grid.best_score_

({'max_depth': 3}, 0.9535197838868584)

In [20]:
# Predict on the held-out validation rows; with refit enabled, the search object
# delegates predict() to the refit best estimator.
tree_valid_pred = tree_grid.predict(X_valid)

In [21]:
# Share of validation rows whose predicted label matches the true label.
accuracy_score(y_true=y_valid, y_pred=tree_valid_pred)

0.9535286120380669