In [1]:
# Import required models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path

## Split the Data into Training and Testing Sets

In [3]:
# Read csv file and display the DataFrame
data_red = Path('Resources/red_trans.csv')
df_red = pd.read_csv(data_red, sep = ',')

display(df_red.head())

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,1.948695,0.887904,0.0,1.238562,0.423582,2.22398,3.239612,0.9978,3.51,0.824257,2.110454,5
1,1.983192,0.958284,0.0,1.375069,0.461044,2.924018,4.061548,0.9968,3.2,0.879366,2.139975,5
2,1.983192,0.912581,0.04,1.320006,0.451436,2.466212,3.779763,0.997,3.26,0.866239,2.139975,5
3,2.237378,0.654213,0.56,1.238562,0.421716,2.571282,3.914868,0.998,3.16,0.833955,2.139975,6
4,1.948695,0.887904,0.0,1.238562,0.423582,2.22398,3.239612,0.9978,3.51,0.824257,2.110454,5


In [4]:
# Define a function that maps values from 1-4, 5-6 and 7-10.
def map_wine_quality(value):
    if value >= 1 and value <= 4:
        return "bad"
    elif value >= 5 and value <= 6:
        return "mediocre"
    elif value >= 7 and value <= 10:
        return "delicious"
    else:
        return None

In [5]:
# use the apply method to create a new column based on the existing column
df_red['wine quality'] = df_red['quality'].apply(map_wine_quality)

display(df_red.head())

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine quality
0,1.948695,0.887904,0.0,1.238562,0.423582,2.22398,3.239612,0.9978,3.51,0.824257,2.110454,5,mediocre
1,1.983192,0.958284,0.0,1.375069,0.461044,2.924018,4.061548,0.9968,3.2,0.879366,2.139975,5,mediocre
2,1.983192,0.912581,0.04,1.320006,0.451436,2.466212,3.779763,0.997,3.26,0.866239,2.139975,5,mediocre
3,2.237378,0.654213,0.56,1.238562,0.421716,2.571282,3.914868,0.998,3.16,0.833955,2.139975,6,mediocre
4,1.948695,0.887904,0.0,1.238562,0.423582,2.22398,3.239612,0.9978,3.51,0.824257,2.110454,5,mediocre


In [6]:
# Drop 'quality' column
df_red = df_red.drop(columns='quality')

In [7]:
# Split the data into X and y
X = df_red.drop(columns=["wine quality"])
y = df_red["wine quality"]

In [8]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

## Create a Decision Tree Model

In [9]:
# Declare a Descision Tree model
decision_tree_model = DecisionTreeClassifier(max_depth=10)

# Fit and save the model using the training model
model = decision_tree_model.fit(X_train, y_train)

In [10]:
# Use the model to make predictions on the testing data
training_predictions = model.predict(X_train)
testing_predictions = decision_tree_model.predict(X_test)

In [11]:
# Create and save the balanced accuracy score
training_accuracy_score = balanced_accuracy_score(y_train, training_predictions)

# Print the balance accuracy score for the training data
training_accuracy_score

0.9724216220063342

In [12]:
# Create and save the balanced accuracy score
testing_accuracy_score = balanced_accuracy_score(y_test, testing_predictions)

# Print the balance accuracy score for the testing data
testing_accuracy_score

0.9327152014652015

In [13]:
# Create and save the confusion matrix for the testing data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the testing data
print(training_matrix)

[[968   0   0]
 [  0 975  11]
 [ 44  24 882]]


In [14]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[242   0   0]
 [  0 221   3]
 [ 24  25 211]]


In [15]:
# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

         bad       0.96      1.00      0.98       968
   delicious       0.98      0.99      0.98       986
    mediocre       0.99      0.93      0.96       950

    accuracy                           0.97      2904
   macro avg       0.97      0.97      0.97      2904
weighted avg       0.97      0.97      0.97      2904



In [16]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

         bad       0.91      1.00      0.95       242
   delicious       0.90      0.99      0.94       224
    mediocre       0.99      0.81      0.89       260

    accuracy                           0.93       726
   macro avg       0.93      0.93      0.93       726
weighted avg       0.93      0.93      0.93       726

