In [22]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV

In [23]:
#Import the wine dataset
wine_1 = pd.read_csv('winequality-white-v4.csv')
#Check dimension
wine_1.shape

(2337, 7)

In [24]:
#Check for NA for all columns
wine_1.isna().any()

alcohol                 False
density                 False
chlorides                True
total sulfur dioxide    False
citric acid             False
residual sugar          False
quality                 False
dtype: bool

In [25]:
#Check how many NA in chlorides
print(pd.isna(wine_1['chlorides']).sum())

100


In [26]:
#Finding the mean of chlorides
chlorides_mean=wine_1['chlorides'].mean(axis=0, skipna=True)
print(chlorides_mean)

0.04654269110415769


In [27]:
#Replace NA with mean value
wine_1['chlorides'].fillna(value=chlorides_mean, inplace=True)

In [28]:
#Recheck for any NA left
print(pd.isna(wine_1['chlorides']).sum())

0


In [29]:
#Export to csv
wine_1.to_csv('winequality-white-v4-processed.csv', index=False)

In [30]:
#Import the new dataset that was just processed
wine_2 = pd.read_csv('winequality-white-v4-processed.csv')

In [31]:
#Check datatype of the dataset
print(wine_2.dtypes)

alcohol                 float64
density                 float64
chlorides               float64
total sulfur dioxide    float64
citric acid             float64
residual sugar          float64
quality                   int64
dtype: object


In [32]:
#Convert label column to categorical and recheck its datatype
wine_2['quality'] = pd.Categorical(wine_2.quality)
wine_2.dtypes

alcohol                  float64
density                  float64
chlorides                float64
total sulfur dioxide     float64
citric acid              float64
residual sugar           float64
quality                 category
dtype: object

In [33]:
#Describe the dataset
print(wine_2.describe())

           alcohol      density    chlorides  total sulfur dioxide  \
count  2337.000000  2337.000000  2337.000000           2337.000000   
mean     10.395919     0.994204     0.046543            141.193410   
std       1.266020     0.002962     0.022580             42.083747   
min       8.000000     0.987110     0.009000              9.000000   
25%       9.400000     0.991820     0.036000            111.000000   
50%      10.100000     0.993960     0.044000            138.000000   
75%      11.300000     0.996440     0.050000            171.000000   
max      14.200000     1.002410     0.346000            344.000000   

       citric acid  residual sugar  
count  2337.000000     2337.000000  
mean      0.333124        6.525952  
std       0.121458        5.072101  
min       0.000000        0.600000  
25%       0.260000        1.700000  
50%       0.320000        5.400000  
75%       0.390000       10.200000  
max       1.000000       23.500000  


In [34]:
#Slit into features and labels
features = wine_2.drop('quality',axis=1)
labels = wine_2['quality']
#Encode label so that 5 is 0 and 7 is 1
labels = labels.apply(lambda x: 0 if x == 5 else 1)
# Splitting Data
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=42)

In [35]:
print(X_train.shape)
print(X_test.shape)

(1752, 6)
(585, 6)


In [36]:
#Normalise features in dataset
scaler = StandardScaler()
StandardScaler(copy=True, with_mean=True, with_std=True)
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.fit_transform(X_test)
features_norm = scaler.fit_transform(features)

# Neural Network

In [60]:
#Using 10-fold cross validation
kfold = KFold(n_splits=10, shuffle=True, random_state=123)

#Search for best parameters in Neural network
grid_params_lr = {
    'solver': ['lbfgs', 'sgd', 'adam'],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'hidden_layer_sizes': [(5,5), (5,5,5), (5,2)],
    'max_iter': [500,1000]
}

mlp_grid = MLPClassifier() #Train the model
mlp_grid_result = GridSearchCV(mlp_grid, grid_params_lr,cv=kfold).fit(X_train_norm,y_train) #Use Gridsearch on the model
print(mlp_grid_result.best_score_) #Print the highest accuracy of the most optimal model

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

0.8207759740259741


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [68]:
#Print best Parameter
print(mlp_grid_result.best_params_)

{'activation': 'relu', 'hidden_layer_sizes': (5, 5, 5), 'max_iter': 1000, 'solver': 'lbfgs'}


In [70]:
#Using accuracy, check for most optimal learning rate between 0.001, 0.01 and 0.1 based on the model found by GridSearchCV
def printaccuracy():
    learningRates = [0.001, 0.01, 0.1]
    for rate in learningRates:
        # Initialise the algorithm
        mlp = MLPClassifier(hidden_layer_sizes=(5, 5, 5),learning_rate_init=rate, activation = 'relu',
                            solver = 'lbfgs', max_iter=1000)
        mlp.fit(X_train, y_train) # Train the algorithm
        predictions = mlp.predict(X_test) # Make predictions
    
        print("Accuracy =", accuracy_score(y_test, predictions))
    pass #When the pass statement is executed, nothing happens, but you avoid getting an error when empty code is not allowed.

printaccuracy()

Accuracy = 0.7931623931623931
Accuracy = 0.788034188034188
Accuracy = 0.6085470085470085


In [71]:
#At every fold cross-validation, print out accuracy and roc_auc on the test set based on the optimal model found by GridSearchCV
#Because the default learning rate is already 0.001 so no need to define it in the MLPClassifier function
mlp = MLPClassifier(hidden_layer_sizes=(5,5,5),solver="lbfgs", activation = 'relu', max_iter = 1000,verbose=False) # Initialise the algorithm
results = cross_validate(mlp, X=features_norm, y=labels, cv=kfold, scoring=["accuracy","roc_auc"])
for i, score in enumerate(results["test_accuracy"]):
    print(f"Accuracy for the fold no. {i} on the test set: {score}")
for i, score in enumerate(results["test_roc_auc"]):
    print(f"ROC_AUC for the fold no. {i} on the test set: {score}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Accuracy for the fold no. 0 on the test set: 0.8247863247863247
Accuracy for the fold no. 1 on the test set: 0.811965811965812
Accuracy for the fold no. 2 on the test set: 0.8205128205128205
Accuracy for the fold no. 3 on the test set: 0.811965811965812
Accuracy for the fold no. 4 on the test set: 0.8034188034188035
Accuracy for the fold no. 5 on the test set: 0.811965811965812
Accuracy for the fold no. 6 on the test set: 0.8547008547008547
Accuracy for the fold no. 7 on the test set: 0.8197424892703863
Accuracy for the fold no. 8 on the test set: 0.7725321888412017
Accuracy for the fold no. 9 on the test set: 0.7896995708154506
ROC_AUC for the fold no. 0 on the test set: 0.891550925925926
ROC_AUC for the fold no. 1 on the test set: 0.8829984668764626
ROC_AUC for the fold no. 2 on the test set: 0.9010740971357408
ROC_AUC for the fold no. 3 on the test set: 0.8885116462692461
ROC_AUC for the fold no. 4 on the test set: 0.8804272650426497
ROC_AUC for the fold no. 5 on the test set: 0.892

In [72]:
#Get the average accuracy
print("Average accuracy:" ,results["test_accuracy"].mean())
#Get the average roc_auc
print("Average roc_auc:" ,results["test_roc_auc"].mean())

Average accuracy: 0.8121290488243279
Average roc_auc: 0.8877033478652162


In [73]:
#Using the optimal model found above, fit the dataset without 10-fold
mlp.fit(X_train_norm, y_train) # Train the algorithm

MLPClassifier(hidden_layer_sizes=(5, 5, 5), max_iter=1000, solver='lbfgs')

In [74]:
#Use the fitted model to predict the test set
predictions = mlp.predict(X_test_norm) # Make predictions

In [75]:
# Compute performance metrics without 10-fold cross-validation
print("Accuracy =", accuracy_score(y_test, predictions))
print("ROC_AUC =", roc_auc_score(y_test, predictions))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test,predictions))

Accuracy = 0.8102564102564103
ROC_AUC = 0.795029684510083
[[308  48]
 [ 63 166]]
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       356
           1       0.78      0.72      0.75       229

    accuracy                           0.81       585
   macro avg       0.80      0.80      0.80       585
weighted avg       0.81      0.81      0.81       585



# Decision tree

In [25]:
#Search for best parameters
grid_params_lr = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'random_state': [0]
}

# Create Decision Tree classifer object
tree = DecisionTreeClassifier()
#Use GridSearchCV to find optimal parameters for Decision Tree classifer object
tree_grid_result = GridSearchCV(tree, grid_params_lr,cv=kfold).fit(X_train_norm,y_train)
print(tree_grid_result.best_score_)

#Print best Parameter
print(tree_grid_result.best_params_)

0.8230779220779221
{'criterion': 'entropy', 'random_state': 0, 'splitter': 'best'}


In [26]:
# Train Decision Tree Classifer with the optimal parameters found by GridSearchCV
tree_best = DecisionTreeClassifier(criterion = 'entropy', splitter = 'best', random_state = 0)
# Fit the model without 10-fold cross-validation
tree_best = tree_best.fit(X_train_norm,y_train)

#Predict the response for test dataset without 10-fold cross-validation
y_pred_tree = tree_best.predict(X_test_norm)

In [50]:
#Get performance metrics such as accuracy, roc_auc and confusion metrics without 10-fold cross-validation
accuracy = computeAccuracy (y_test, y_pred_tree)
print("Accuracy =", accuracy_score(y_test, y_pred_tree))
print("ROC_AUC =", roc_auc_score(y_test, y_pred_tree))
print(confusion_matrix(y_test, y_pred_tree))
print(classification_report(y_test,y_pred_tree))

Accuracy = 0.7863247863247863
ROC_AUC = 0.7644619989205632
[[308  48]
 [ 77 152]]
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       356
           1       0.76      0.66      0.71       229

    accuracy                           0.79       585
   macro avg       0.78      0.76      0.77       585
weighted avg       0.78      0.79      0.78       585



In [34]:
#10 fold cross-validation using the optimal model found by GridSearchCV
cv = KFold(n_splits=10)
result_tree = cross_validate(tree_best, X=features,y=labels, cv=cv, scoring=["accuracy","roc_auc"])
#At every fold cross-validation, print accuracy and roc_auc score on the test set
for i, score in enumerate(result_tree["test_accuracy"]):
    print(f"Accuracy for the fold no. {i} on the test set: {score}")
for i, score in enumerate(result_tree["test_roc_auc"]):
    print(f"ROC_AUC for the fold no. {i} on the test set: {score}")

Accuracy for the fold no. 0 on the test set: 0.7606837606837606
Accuracy for the fold no. 1 on the test set: 0.7948717948717948
Accuracy for the fold no. 2 on the test set: 0.6410256410256411
Accuracy for the fold no. 3 on the test set: 0.7136752136752137
Accuracy for the fold no. 4 on the test set: 0.8205128205128205
Accuracy for the fold no. 5 on the test set: 0.7606837606837606
Accuracy for the fold no. 6 on the test set: 0.6965811965811965
Accuracy for the fold no. 7 on the test set: 0.6909871244635193
Accuracy for the fold no. 8 on the test set: 0.7725321888412017
Accuracy for the fold no. 9 on the test set: 0.7896995708154506
ROC_AUC for the fold no. 0 on the test set: 0.7340106595602932
ROC_AUC for the fold no. 1 on the test set: 0.7547471974376573
ROC_AUC for the fold no. 2 on the test set: 0.6307692307692307
ROC_AUC for the fold no. 3 on the test set: 0.6840516171726363
ROC_AUC for the fold no. 4 on the test set: 0.7650273224043714
ROC_AUC for the fold no. 5 on the test set: 0

In [44]:
#Get the average accuracy of 10-fold cross-validation
print("Average accuracy:" ,result_tree["test_accuracy"].mean())
#Get the average roc_auc of 10-fold cross-validation
print("Average roc_auc:" ,result_tree["test_roc_auc"].mean())

Average accuracy: 0.7441253072154359
Average roc_auc: 0.7214099564311268


In [36]:
#Graph and print Decision Tree
from sklearn.tree import export_graphviz
import graphviz

#Convert class labels to a list of strings
labels = y_train.astype(str).unique().tolist()

#Export the decision tree in DOT format
dot_data = export_graphviz (tree_best, out_file=None,
                           feature_names=X_train.columns,
                            class_names=labels,
                            filled=True, rounded=True,
                            special_characters=True)

graph= graphviz.Source(dot_data)
graph.render("decision_tree") # Save the tree as a PDF file
graph.view() # Display the tree in a GUI window

'decision_tree.pdf'