In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, roc_auc_score

In [2]:
data = pd.read_csv("winequality-red.csv")

In [3]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [12]:
data.columns.isna()

array([False, False, False, False, False, False, False, False, False,
       False, False, False])

In [13]:
X = data.drop(columns='quality')
y = data.quality

In [14]:
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=355)

### Decision Tree Classification before hyperparameter tuning

In [36]:
dtc = DecisionTreeClassifier()
dtc.fit(x_train,y_train)

DecisionTreeClassifier()

In [37]:
dtc.score(x_train,y_train)

1.0

In [38]:
y_predict = dtc.predict(x_train)

In [39]:
dtc.score(x_test,y_test)

0.6291666666666667

### The process of hyperparameter tuning

In [31]:
grid_param = {
    'criterion': [ 'gini', 'entropy'], 
    'max_depth': range(2,32,1),
    'min_samples_leaf': range(1,10,1),
    'min_samples_split': range(2,10,1),
    'splitter': ['best', 'random']
}

In [37]:
#hyperparameter tuning using GridSearchCV to find the best value of parameters to be used in DecisionTreeClassifier
grid_search = GridSearchCV(estimator=dtc, param_grid=grid_param, cv=5, n_jobs=-1)

In [39]:
grid_search.fit(x_train,y_train)

GridSearchCV(cv=5,
             estimator=DecisionTreeClassifier(criterion='entropy', max_depth=25,
                                              min_samples_leaf=6,
                                              min_samples_split=7,
                                              splitter='random'),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(2, 32),
                         'min_samples_leaf': range(1, 10),
                         'min_samples_split': range(2, 10),
                         'splitter': ['best', 'random']})

In [40]:
grid_search.best_params_ #best parameters

{'criterion': 'gini',
 'max_depth': 21,
 'min_samples_leaf': 7,
 'min_samples_split': 4,
 'splitter': 'random'}

In [41]:
grid_search.best_score_

0.6094610826393337

### Decision Tree Classification after hyperparameter tuning

In [43]:
dtc = DecisionTreeClassifier(criterion='gini', max_depth=21, min_samples_leaf=7, min_samples_split=4, splitter='random')

In [44]:
dtc.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=21, min_samples_leaf=7, min_samples_split=4,
                       splitter='random')

In [45]:
dtc.score(x_test,y_test)

0.6041666666666666

In [46]:
y_predict = dtc.predict(x_test)

In [47]:
conf_mat = confusion_matrix(y_test, y_predict)

In [48]:
conf_mat

array([[  0,   0,   1,   1,   0,   0],
       [  0,   0,  12,   5,   0,   0],
       [  0,   1, 159,  51,   2,   0],
       [  1,   0,  66, 108,  17,   0],
       [  0,   0,   8,  23,  23,   0],
       [  0,   0,   0,   2,   0,   0]], dtype=int64)