In [2]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from joblib import dump, load

In [3]:
# Load the white-wine training split: one CSV of features, one CSV of labels.
df_X = pd.read_csv("winequality_white_x_train.csv")
df_y = pd.read_csv("winequality_white_y_train.csv")

# Sanity checks: the two files must describe the same samples, and the label
# file must hold exactly one target column for the flattening below to be valid.
assert len(df_X) == len(df_y), "feature and label files have different row counts"
assert df_y.shape[1] == 1, "expected exactly one label column"

x = df_X.values
# scikit-learn classifiers expect a 1-D target of shape (n_samples,).
# `df_y.values` alone is a column vector of shape (n_samples, 1), which makes
# several estimators emit DataConversionWarning on every fit; flatten it once here.
y = df_y.values.ravel()


In [6]:
# Baseline comparison: score four classifiers with the same 10-fold
# cross-validation (accuracy), then report the mean over the folds for each.
cv_options = dict(cv=10, scoring="accuracy")

mlp_fold_scores = cross_val_score(MLPClassifier(random_state=2, max_iter=1000), x, y, **cv_options)
rf_fold_scores = cross_val_score(RandomForestClassifier(random_state=1), x, y, **cv_options)
knn_fold_scores = cross_val_score(KNeighborsClassifier(n_neighbors=3), x, y, **cv_options)
dct_fold_scores = cross_val_score(DecisionTreeClassifier(random_state=1), x, y, **cv_options)

cv_score_MLP = mlp_fold_scores.mean()
cv_score_RF = rf_fold_scores.mean()
cv_score_KNN = knn_fold_scores.mean()
cv_score_DCT = dct_fold_scores.mean()

# One line per model, in the same order the models were evaluated.
for label, mean_accuracy in (
    ("MLP: ", cv_score_MLP),
    ("RF: ", cv_score_RF),
    ("KNN: ", cv_score_KNN),
    ("DCT: ", cv_score_DCT),
):
    print(label, mean_accuracy)

MLP:  0.39164967751501223
RF:  0.5217788568463193
KNN:  0.5169165616428202
DCT:  0.5696400770998592


### Test DecisionTree Classifier

In [7]:
# Decision tree with default hyper-parameters (seeded for reproducibility),
# trained on the complete training set. This instance is also the estimator
# handed to the grid search in the next cell.
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X=x, y=y)

In [8]:
# Exhaustive search over decision-tree hyper-parameters with 10-fold CV.
# No `scoring` argument is given, so the estimator's own default scorer is used.
param_dist = dict(
    criterion=["gini", "entropy"],
    max_depth=[*range(1, 8), None],  # depths 1..7, plus None for an unbounded tree
    splitter=["best", "random"],
)

# n_jobs=-1 runs the candidate fits in parallel on all available cores.
grid = GridSearchCV(estimator=clf, param_grid=param_dist, cv=10, n_jobs=-1).fit(x, y)

# Report the winning configuration and its mean cross-validated score.
for label, found in (
    ("best estimator: ", grid.best_estimator_),
    ("best score: ", grid.best_score_),
    ("best params: ", grid.best_params_),
):
    print(label, found)

best estimator:  DecisionTreeClassifier(random_state=1)
best score:  0.5696400770998592
best params:  {'criterion': 'gini', 'max_depth': None, 'splitter': 'best'}


In [10]:
# Rebuild the decision tree with the settings reported by the grid search,
# train it on the full training set, and re-check its 10-fold CV accuracy.
best_tree_settings = dict(random_state=1, max_depth=None, criterion="gini")

clf = DecisionTreeClassifier(**best_tree_settings)
clf.fit(x, y)

# Score a fresh, unfitted estimator with the same settings.
tuned_tree_fold_scores = cross_val_score(
    DecisionTreeClassifier(**best_tree_settings),
    x,
    y,
    cv=10,
    scoring="accuracy",
)
cv_score_DCT_best_param = tuned_tree_fold_scores.mean()
cv_score_DCT_best_param


0.5696400770998592

### Test KNeighborsClassifier

In [11]:
# K-nearest-neighbours classifier trained on the complete training set.
# This is the object that is written to disk at the end of the notebook.
knn_settings = dict(n_neighbors=2, algorithm="ball_tree", weights="distance")
clf_KN = KNeighborsClassifier(**knn_settings)
clf_KN.fit(x, y)

In [12]:
# Exhaustive search over KNN hyper-parameters with 10-fold CV.
# No `scoring` argument is given, so the estimator's own default scorer is used.
param_dist = dict(
    weights=["uniform", "distance"],
    n_neighbors=list(range(2, 21)),  # every k from 2 to 20 inclusive
    algorithm=["ball_tree", "kd_tree", "brute"],
)

# n_jobs=-1 runs the candidate fits in parallel on all available cores.
grid = GridSearchCV(estimator=clf_KN, param_grid=param_dist, cv=10, n_jobs=-1).fit(x, y)

# Report the winning configuration and its mean cross-validated score.
for label, found in (
    ("best estimator: ", grid.best_estimator_),
    ("best score: ", grid.best_score_),
    ("best params: ", grid.best_params_),
):
    print(label, found)

best estimator:  KNeighborsClassifier(algorithm='ball_tree', n_neighbors=2, weights='distance')
best score:  0.6098691526428942
best params:  {'algorithm': 'ball_tree', 'n_neighbors': 2, 'weights': 'distance'}


In [13]:
# 10-fold CV accuracy of the tuned KNN configuration, averaged over the folds.
# Note: this rebinds `cv_score_KNN`, replacing the baseline (k=3) value from earlier.
tuned_knn_fold_scores = cross_val_score(
    KNeighborsClassifier(weights="distance", algorithm="ball_tree", n_neighbors=2),
    x,
    y,
    scoring="accuracy",
    cv=10,
)
cv_score_KNN = tuned_knn_fold_scores.mean()
cv_score_KNN

0.6098691526428942

### Dump Model

In [14]:
# Persist the fitted KNN classifier with joblib so it can be reloaded later.
# The call is the cell's last expression, so its return value is displayed.
model_file = "ml_WineQuality_WHITE.joblib"
dump(value=clf_KN, filename=model_file)

['ml_WineQuality_WHITE.joblib']