In [1]:
# Import required models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path

## Split the Data into Training and Testing Sets
### Step 1: Read the csv data from the resources folder into a DataFrame

In [2]:
# Read csv file and display the DataFrame
data_red = Path('Resources/winequality-red.csv')
df_red = pd.read_csv(data_red, sep = ';')
display(df_red.head())
display(df_red.tail())

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1594,6.2,0.6,0.08,2.0,0.09,32.0,44.0,0.9949,3.45,0.58,10.5,5
1595,5.9,0.55,0.1,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.51,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5
1598,6.0,0.31,0.47,3.6,0.067,18.0,42.0,0.99549,3.39,0.66,11.0,6


In [3]:
# Define a function that maps values from 1-6 to 0 and values from 7-10 to 1
def map_wine_quality(value):
    if value >= 1 and value <= 4:
        return "bad wine"
    elif value >= 5 and value <= 6:
        return "mediocre wine"
    elif value >= 7 and value <= 10:
        return "delicious wine"
    else:
        return None

In [4]:
# use the apply method to create a new column based on the existing column
df_red['binary_quality'] = df_red['quality'].apply(map_wine_quality)
df_red.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,binary_quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,mediocre wine
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,mediocre wine
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,mediocre wine
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,mediocre wine
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,mediocre wine


In [5]:
# Drop 'quality' column and drop duplicates
df_red = df_red.drop(columns='quality')
df_red = df_red.drop_duplicates()

### Step 2: Create labels set for good and bad wines

In [6]:
# Split the data into X and y
X = df_red.drop(columns=["binary_quality"])
y = df_red["binary_quality"]

In [7]:
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4


In [8]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

### Step 3: Scale the data

In [23]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit_transform(X_train)

## Create a Decision Tree Model

In [24]:
# Declare a Descision Tree model
decision_tree_model = DecisionTreeClassifier(max_depth=10)

# Fit and save the model using the training model
model = decision_tree_model.fit(X_train, y_train)

In [25]:
# Use the model to make predictions on the testing data
prediction = model.predict(X_test)

In [26]:
accuracy = balanced_accuracy_score(y_test, prediction)
accuracy

0.4987660834139336

In [27]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, prediction)

# Print the confusion matrix for the testing data
print(test_matrix)

[[  2   1  12]
 [  1  16  17]
 [  8  16 199]]


In [28]:
# Create and save the training classification report
training_report = classification_report(y_test, prediction)

# Print the training classification report
print(training_report)

                precision    recall  f1-score   support

      bad wine       0.18      0.13      0.15        15
delicious wine       0.48      0.47      0.48        34
 mediocre wine       0.87      0.89      0.88       223

      accuracy                           0.80       272
     macro avg       0.51      0.50      0.50       272
  weighted avg       0.79      0.80      0.79       272



## Predict a Desicion Tree Model with Resampled Training Data

In [29]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
ros = RandomOverSampler(random_state = 42)

# Fit the original training data to the random_oversampler model
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [30]:
# Count the distinct values of the resampled labels data
Counter(y_resampled)

Counter({'delicious wine': 889, 'mediocre wine': 889, 'bad wine': 889})

In [31]:
# Instantiate the Logistic Regression model
descision_tree_model = DecisionTreeClassifier()
# Fit the model using the resampled training data
model.fit(X_resampled, y_resampled)

# Make a prediction using the testing data
prediction = model.predict(X_test)
prediction

array(['bad wine', 'mediocre wine', 'delicious wine', 'bad wine',
       'bad wine', 'delicious wine', 'mediocre wine', 'mediocre wine',
       'mediocre wine', 'bad wine', 'delicious wine', 'mediocre wine',
       'mediocre wine', 'mediocre wine', 'mediocre wine', 'mediocre wine',
       'delicious wine', 'mediocre wine', 'mediocre wine',
       'mediocre wine', 'mediocre wine', 'delicious wine',
       'mediocre wine', 'mediocre wine', 'mediocre wine', 'mediocre wine',
       'mediocre wine', 'mediocre wine', 'mediocre wine', 'mediocre wine',
       'mediocre wine', 'mediocre wine', 'mediocre wine', 'mediocre wine',
       'bad wine', 'delicious wine', 'mediocre wine', 'delicious wine',
       'mediocre wine', 'delicious wine', 'bad wine', 'delicious wine',
       'mediocre wine', 'mediocre wine', 'mediocre wine', 'mediocre wine',
       'mediocre wine', 'mediocre wine', 'mediocre wine',
       'delicious wine', 'mediocre wine', 'mediocre wine',
       'delicious wine', 'bad wine', '

In [32]:
accuracy_score(y_test, prediction)

0.7242647058823529

In [33]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, prediction)

# Print the confusion matrix for the testing data
print(test_matrix)

[[  4   2   9]
 [  0  26   8]
 [ 12  44 167]]


In [34]:
# Create and save the testing classification report
testing_report = classification_report(y_test, prediction)

# Print the testing classification report
print(testing_report)

                precision    recall  f1-score   support

      bad wine       0.25      0.27      0.26        15
delicious wine       0.36      0.76      0.49        34
 mediocre wine       0.91      0.75      0.82       223

      accuracy                           0.72       272
     macro avg       0.51      0.59      0.52       272
  weighted avg       0.80      0.72      0.75       272

