In [1]:
# Import required models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path

## Split the Data into Training and Testing Sets
### Step 1: Read the csv data from the resources folder into a DataFrame

In [2]:
# Read csv file and display the DataFrame
data_white = Path('Resources/winequality-white.csv')
df_white = pd.read_csv(data_white, sep = ';')
display(df_white.head())
display(df_white.tail())

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.5,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.9949,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.3,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7
4897,6.0,0.21,0.38,0.8,0.02,22.0,98.0,0.98941,3.26,0.32,11.8,6


In [3]:
# Define a function that maps values from 1-6 to 0 and values from 7-10 to 1
def map_wine_quality(value):
    if value >= 1 and value <= 4:
        return "bad wine"
    elif value >= 5 and value <= 6:
        return "mediocre wine"
    elif value >= 7 and value <= 10:
        return "delicious wine"
    else:
        return None

In [4]:
# use the apply method to create a new column based on the existing column
df_white['binary_quality'] = df_white['quality'].apply(map_wine_quality)
df_white.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,binary_quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,mediocre wine
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,mediocre wine
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,mediocre wine
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,mediocre wine
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,mediocre wine


In [5]:
# Drop 'quality' column and drop duplicates
df_white = df_white.drop(columns='quality')
df_white = df_white.drop_duplicates()

### Step 2: Create labels set for good and bad wines

In [6]:
# Split the data into X and y
X = df_white.drop(columns=["binary_quality"])
y = df_white["binary_quality"]

In [7]:
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
6,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47,9.6


In [60]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Step 3: Scale the data

In [73]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit_transform(X_train)

## Create a Descision Tree Model

In [74]:
# Declare a Descision Tree model
descision_tree_model = DecisionTreeClassifier(max_depth=9)

# Fit and save the model using the training model
model = descision_tree_model.fit(X_train, y_train)

In [75]:
# Use the model to make predictions on the testing data
prediction = model.predict(X_test)

In [76]:
accuracy = balanced_accuracy_score(y_test, prediction)
accuracy

0.44073838111573965

In [77]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, prediction)

# Print the confusion matrix for the testing data
print(test_matrix)

[[  3   1  31]
 [  1  63 111]
 [  7  65 511]]


In [78]:
# Create and save the training classification report
training_report = classification_report(y_test, prediction)

# Print the training classification report
print(training_report)

                precision    recall  f1-score   support

      bad wine       0.27      0.09      0.13        35
delicious wine       0.49      0.36      0.41       175
 mediocre wine       0.78      0.88      0.83       583

      accuracy                           0.73       793
     macro avg       0.51      0.44      0.46       793
  weighted avg       0.70      0.73      0.71       793



## Predict a Decision Tree Model with Resampled Training Data

In [79]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
ros = RandomOverSampler(random_state = 42)

# Fit the original training data to the random_oversampler model
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [80]:
# Count the distinct values of the resampled labels data
Counter(y_resampled)

Counter({'mediocre wine': 2380, 'delicious wine': 2380, 'bad wine': 2380})

In [81]:
# Instantiate the Logistic Regression model
descision_tree_model = DecisionTreeClassifier(max_depth=10)
# Fit the model using the resampled training data
model.fit(X_resampled, y_resampled)

# Make a prediction using the testing data
prediction = model.predict(X_test)
prediction

array(['delicious wine', 'bad wine', 'mediocre wine', 'mediocre wine',
       'delicious wine', 'mediocre wine', 'delicious wine',
       'mediocre wine', 'mediocre wine', 'mediocre wine', 'mediocre wine',
       'mediocre wine', 'mediocre wine', 'mediocre wine', 'mediocre wine',
       'mediocre wine', 'mediocre wine', 'delicious wine',
       'mediocre wine', 'bad wine', 'delicious wine', 'mediocre wine',
       'mediocre wine', 'delicious wine', 'mediocre wine',
       'delicious wine', 'delicious wine', 'mediocre wine',
       'mediocre wine', 'mediocre wine', 'mediocre wine', 'mediocre wine',
       'delicious wine', 'mediocre wine', 'mediocre wine',
       'mediocre wine', 'delicious wine', 'mediocre wine', 'bad wine',
       'mediocre wine', 'delicious wine', 'delicious wine',
       'delicious wine', 'mediocre wine', 'mediocre wine', 'bad wine',
       'delicious wine', 'delicious wine', 'mediocre wine',
       'mediocre wine', 'delicious wine', 'delicious wine',
       'medioc

In [82]:
accuracy_score(y_test, prediction)

0.6431273644388399

In [83]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, prediction)

# Print the confusion matrix for the testing data
print(test_matrix)

[[ 16   3  16]
 [  8 110  57]
 [ 62 137 384]]


In [84]:
# Create and save the testing classification report
testing_report = classification_report(y_test, prediction)

# Print the testing classification report
print(testing_report)

                precision    recall  f1-score   support

      bad wine       0.19      0.46      0.26        35
delicious wine       0.44      0.63      0.52       175
 mediocre wine       0.84      0.66      0.74       583

      accuracy                           0.64       793
     macro avg       0.49      0.58      0.51       793
  weighted avg       0.72      0.64      0.67       793

