In [76]:
# Import required models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path

## Split the Data into Training and Testing Sets

In [59]:
# Read csv files and display the DataFrame
data_white = Path('Resources/white_trans.csv')
df_white = pd.read_csv(data_white, sep = ',')

display(df_white.head())

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,1.846915,0.629961,0.603681,1.488806,0.363424,3.448217,161.0,0.997514,3.16,0.793701,10.5,6
1,1.983192,0.669433,0.661911,2.563697,0.377976,2.843867,135.0,0.999933,3.16,0.724316,9.0,6
2,1.948695,0.724316,0.64633,1.957434,0.344822,2.884499,160.0,0.998448,3.17,0.754784,10.0,5
3,1.948695,0.542884,0.788374,1.062659,0.380295,2.620741,150.0,0.997226,3.23,0.777498,11.2,6
4,1.930979,0.64633,0.654213,2.477125,0.358305,1.817121,41.0,0.998882,3.17,0.730614,10.9,6


In [60]:
# Define a function that maps values from 1-4, 5-6 and 7-10.
def map_wine_quality(value):
    if value >= 1 and value <= 4:
        return "bad"
    elif value >= 5 and value <= 6:
        return "mediocre"
    elif value >= 7 and value <= 10:
        return "delicious"
    else:
        return None

In [61]:
# use the apply method to create a new column based on the existing column
df_white['wine quality'] = df_white['quality'].apply(map_wine_quality)

display(df_white.head())

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine quality
0,1.846915,0.629961,0.603681,1.488806,0.363424,3.448217,161.0,0.997514,3.16,0.793701,10.5,6,mediocre
1,1.983192,0.669433,0.661911,2.563697,0.377976,2.843867,135.0,0.999933,3.16,0.724316,9.0,6,mediocre
2,1.948695,0.724316,0.64633,1.957434,0.344822,2.884499,160.0,0.998448,3.17,0.754784,10.0,5,mediocre
3,1.948695,0.542884,0.788374,1.062659,0.380295,2.620741,150.0,0.997226,3.23,0.777498,11.2,6,mediocre
4,1.930979,0.64633,0.654213,2.477125,0.358305,1.817121,41.0,0.998882,3.17,0.730614,10.9,6,mediocre


In [62]:
# Drop 'quality' column
df_white = df_white.drop(columns='quality')

In [63]:
# Split the data into X and y
X = df_white.drop(columns=["wine quality"])
y = df_white["wine quality"]

In [65]:
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,1.846915,0.629961,0.603681,1.488806,0.363424,3.448217,161.0,0.997514,3.16,0.793701,10.5
1,1.983192,0.669433,0.661911,2.563697,0.377976,2.843867,135.0,0.999933,3.16,0.724316,9.0
2,1.948695,0.724316,0.64633,1.957434,0.344822,2.884499,160.0,0.998448,3.17,0.754784,10.0
3,1.948695,0.542884,0.788374,1.062659,0.380295,2.620741,150.0,0.997226,3.23,0.777498,11.2
4,1.930979,0.64633,0.654213,2.477125,0.358305,1.817121,41.0,0.998882,3.17,0.730614,10.9


In [66]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Create a Descision Tree Model

In [68]:
# Declare a Descision Tree model
decision_tree_model = DecisionTreeClassifier(max_depth=10)

# Fit and save the model using the training model
model = decision_tree_model.fit(X_train, y_train)

In [69]:
# Use the model to make predictions on the testing data
training_predictions = model.predict(X_train)
testing_predictions = decision_tree_model.predict(X_test)

In [70]:
training_accuracy_score = balanced_accuracy_score(y_train, training_predictions)
training_accuracy_score

0.920624283479842

In [71]:
testing_accuracy_score = balanced_accuracy_score(y_test, testing_predictions)
testing_accuracy_score

0.8883624654537744

In [72]:
# Create and save the confusion matrix for the testing data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the testing data
print(training_matrix)

[[2498   22    2]
 [  22 3573  181]
 [ 138  300 2067]]


In [73]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[611   8   3]
 [  5 886  49]
 [ 48 118 473]]


In [74]:
# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

         bad       0.94      0.99      0.96      2522
   delicious       0.92      0.95      0.93      3776
    mediocre       0.92      0.83      0.87      2505

    accuracy                           0.92      8803
   macro avg       0.93      0.92      0.92      8803
weighted avg       0.92      0.92      0.92      8803



In [75]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

         bad       0.92      0.98      0.95       622
   delicious       0.88      0.94      0.91       940
    mediocre       0.90      0.74      0.81       639

    accuracy                           0.90      2201
   macro avg       0.90      0.89      0.89      2201
weighted avg       0.90      0.90      0.89      2201

