In [1]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from joblib import dump, load
import numpy as np
from sklearn.model_selection import train_test_split

### Import Data Set

In [2]:
# Read the feature table and the label table from the CSV files in the working directory.
df_X = pd.read_csv("winequality_white_x_train.csv")
df_y = pd.read_csv("winequality_white_y_train.csv")

# Work with plain NumPy arrays from here on; every later cell uses `x` and `y`.
x, y = df_X.to_numpy(), df_y.to_numpy()

### Selecting Classifier based on cross val score

In [3]:
# Disabled model comparison on the full x/y arrays: each assignment below would
# run 10-fold cross-validation (accuracy) for one candidate classifier and keep
# the mean fold score; the prints then list the four means side by side.
# NOTE(review): presumably commented out to avoid the run time on every
# execution -- confirm, and uncomment to reproduce the comparison.
# cv_score_MLP = (cross_val_score(MLPClassifier(random_state=2, max_iter=1000), x, y, cv=10, scoring="accuracy")).mean()
# cv_score_RF = (cross_val_score(RandomForestClassifier(random_state=1), x, y, cv=10, scoring="accuracy")).mean()
# cv_score_KNN = (cross_val_score(KNeighborsClassifier(n_neighbors=3), x, y, cv=10, scoring="accuracy")).mean()
# cv_score_DCT = (cross_val_score(DecisionTreeClassifier(random_state=1), x, y, cv=10, scoring="accuracy")).mean()

# print("MLP: ", cv_score_MLP)
# print("RF: ", cv_score_RF)
# print("KNN: ", cv_score_KNN)
# print("DCT: ", cv_score_DCT)

### Fit dataset using DecisionTree: dataset will be split into training and test data

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=1)

# Baseline decision tree trained on the 80% training portion only.
clf = DecisionTreeClassifier(random_state=1)
clf.fit(x_train, y_train)

# Evaluate on the held-out rows.
# Precision/recall are undefined for any label that receives no predictions or
# has no true samples in the test split (the recorded run had such labels).
# zero_division=0 reports those entries as 0.0 -- the same value the default
# falls back to -- without emitting an UndefinedMetricWarning per label.
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred, zero_division=0))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.50      0.26      0.34        31
           2       0.55      0.55      0.55       187
           3       0.57      0.59      0.58       290
           4       0.52      0.56      0.54       120
           5       0.40      0.37      0.38        27
           6       0.00      0.00      0.00         0

   micro avg       0.55      0.55      0.55       657
   macro avg       0.36      0.33      0.34       657
weighted avg       0.54      0.55      0.54       657
 samples avg       0.55      0.55      0.55       657



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
# Disabled model comparison restricted to the training split (x_train/y_train):
# each assignment below would run 10-fold cross-validation (accuracy) for one
# candidate classifier and keep the mean fold score; the prints list the means.
# NOTE(review): presumably commented out to avoid the run time on every
# execution -- confirm, and uncomment to reproduce the comparison.
# cv_score_MLP = (cross_val_score(MLPClassifier(random_state=2, max_iter=5000), x_train, y_train, cv=10, scoring="accuracy")).mean()
# cv_score_RF = (cross_val_score(RandomForestClassifier(random_state=1),  x_train, y_train, cv=10, scoring="accuracy")).mean()
# cv_score_KNN = (cross_val_score(KNeighborsClassifier(n_neighbors=3), x_train, y_train, cv=10, scoring="accuracy")).mean()
# cv_score_DCT = (cross_val_score(DecisionTreeClassifier(random_state=1), x_train, y_train, cv=10, scoring="accuracy")).mean()

# print("MLP: ", cv_score_MLP)
# print("RF: ", cv_score_RF)
# print("KNN: ", cv_score_KNN)
# print("DCT: ", cv_score_DCT)

### Test DecisionTree Classifier using the whole dataset and evaluate using cross val score

In [6]:
# Uses the full x / y arrays loaded at the top of the notebook (no hold-out here).

# Untuned decision tree fitted on every available row.
clf = DecisionTreeClassifier(random_state=1)
clf.fit(x, y)

# Mean accuracy over 10 cross-validation folds for an identically configured tree.
# NOTE(review): the name says "best_param", but no tuning has happened yet at
# this point -- this value is the untuned baseline.
cv_score_DCT_best_param = cross_val_score(
    DecisionTreeClassifier(random_state=1),
    x,
    y,
    scoring="accuracy",
    cv=10,
).mean()
cv_score_DCT_best_param

0.5696400770998592

### Hyperparameter Tuning using GridSearchCV

In [7]:
# Candidate settings for the decision tree: split criterion, depth limit
# (1 through 7, or None for no explicit limit) and split-selection strategy.
param_dist = dict(
    criterion=["gini", "entropy"],
    max_depth=[*range(1, 8), None],
    splitter=["best", "random"],
)

# Exhaustive search over every combination, scored with 10-fold
# cross-validation on the full data and run on all available cores.
grid = GridSearchCV(estimator=clf, param_grid=param_dist, n_jobs=-1, cv=10)
grid.fit(x, y)

# Report the winning configuration and its mean cross-validated score.
print("best estimator: ", grid.best_estimator_)
print("best score: ", grid.best_score_)
print("best params: ", grid.best_params_)

best estimator:  DecisionTreeClassifier(random_state=1)
best score:  0.5696400770998592
best params:  {'criterion': 'gini', 'max_depth': None, 'splitter': 'best'}


In [8]:
# Decision tree rebuilt with explicitly spelled-out criterion and depth settings,
# then fitted on the full data.
clf = DecisionTreeClassifier(criterion="gini", max_depth=None, random_state=1)
clf.fit(x, y)

# Mean 10-fold cross-validated accuracy for the same configuration.
cv_score_DCT_best_param = cross_val_score(
    DecisionTreeClassifier(criterion="gini", max_depth=None, random_state=1),
    x,
    y,
    scoring="accuracy",
    cv=10,
).mean()
cv_score_DCT_best_param


0.5696400770998592

### Test KNeighborsClassifier

In [9]:
# k-nearest-neighbours classifier: 2 neighbours, distance-weighted votes,
# ball-tree neighbour search. This is the estimator that gets saved at the
# end of the notebook.
#
# Disabled alternative -- fit on the training split and evaluate on the hold-out:
#   clf_KN.fit(x_train, y_train)
#   y_pred = clf_KN.predict(x_test)
#   print(classification_report(y_test, y_pred))
clf_KN = KNeighborsClassifier(
    n_neighbors=2,
    weights="distance",
    algorithm="ball_tree",
)
clf_KN.fit(x, y)


In [10]:
# Candidate settings for KNN: vote weighting, neighbour count (2 through 20)
# and the neighbour-search algorithm.
param_dist = dict(
    weights=["uniform", "distance"],
    n_neighbors=list(range(2, 21)),
    algorithm=["ball_tree", "kd_tree", "brute"],
)

# Exhaustive search over every combination, scored with 10-fold
# cross-validation on the full data and run on all available cores.
grid = GridSearchCV(estimator=clf_KN, param_grid=param_dist, n_jobs=-1, cv=10)
grid.fit(x, y)

# Report the winning configuration and its mean cross-validated score.
print("best estimator: ", grid.best_estimator_)
print("best score: ", grid.best_score_)
print("best params: ", grid.best_params_)

best estimator:  KNeighborsClassifier(algorithm='ball_tree', n_neighbors=2, weights='distance')
best score:  0.6098691526428942
best params:  {'algorithm': 'ball_tree', 'n_neighbors': 2, 'weights': 'distance'}


In [11]:
# Mean 10-fold cross-validated accuracy for a KNN configured like clf_KN.
cv_score_KNN = cross_val_score(
    KNeighborsClassifier(n_neighbors=2, weights="distance", algorithm="ball_tree"),
    x,
    y,
    scoring="accuracy",
    cv=10,
).mean()
cv_score_KNN

0.6098691526428942

### Dump Model

In [12]:
# Persist the fitted KNN classifier with joblib; the path is relative to the
# notebook's working directory. The call's return value (the list of written
# file names) is left as the cell output.
model_file = "ml_WineQuality_WHITE_v2.joblib"
dump(value=clf_KN, filename=model_file)

['ml_WineQuality_WHITE_v2.joblib']