## FIRST APPROACH - MAX-DEPTH OF THE DECISION TREE CLASSIFIER IS NOT OPTIMAL

In [110]:
# Import Libraries
import pandas as pd
from sklearn import tree
from sklearn import metrics

In [111]:
# Load data
df = pd.read_csv("winequality-red.csv")

In [112]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [113]:
# Describe data
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1599.0,8.319637,1.741096,4.6,7.1,7.9,9.2,15.9
volatile acidity,1599.0,0.527821,0.17906,0.12,0.39,0.52,0.64,1.58
citric acid,1599.0,0.270976,0.194801,0.0,0.09,0.26,0.42,1.0
residual sugar,1599.0,2.538806,1.409928,0.9,1.9,2.2,2.6,15.5
chlorides,1599.0,0.087467,0.047065,0.012,0.07,0.079,0.09,0.611
free sulfur dioxide,1599.0,15.874922,10.460157,1.0,7.0,14.0,21.0,72.0
total sulfur dioxide,1599.0,46.467792,32.895324,6.0,22.0,38.0,62.0,289.0
density,1599.0,0.996747,0.001887,0.99007,0.9956,0.99675,0.997835,1.00369
pH,1599.0,3.311113,0.154386,2.74,3.21,3.31,3.4,4.01
sulphates,1599.0,0.658149,0.169507,0.33,0.55,0.62,0.73,2.0


In [114]:
# Datatypes in the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [115]:
#Missing values
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [116]:
df['quality'].value_counts()
# This datasetconsists of only six types of quality values.
# We will thus map all quality values from 0 to 5

quality
5    681
6    638
7    199
4     53
8     18
3     10
Name: count, dtype: int64

In [117]:
# a mapping dictionary that maps the quality values from 0 to 5
quality_mapping = {
    3: 0,
    4: 1,
    5: 2,
    6: 3,
    7: 4,
    8: 5
}

In [118]:
# use the map function of pandas with any dictionary to convert the values in a given
# column to values in the dictionary
df.loc['quality'] = df['quality'].map(quality_mapping)

In [119]:
df['quality']

0          5.0
1          5.0
2          5.0
3          6.0
4          5.0
          ... 
1595       6.0
1596       6.0
1597       5.0
1598       6.0
quality    NaN
Name: quality, Length: 1600, dtype: float64

In [120]:
# Drop missing values
df.dropna(axis=0, inplace=True)

In [121]:
df.shape

(1599, 12)

In [122]:
# Data splitting
# Shuffle the data before splitting to prevent any bias that might occur from the original ordering of the data
# frac=1 means the entire DataFrame will be shuffled. Each row has an equal chance of being selected.
df = df.sample(frac=1).reset_index(drop=True)

# Top 1000 rows to be used for training
df_train = df.head(1000)

# Bottom 599 rows to be used for testing/validation
df_test = df.tail(599)

In [123]:
# We will now train a decision tree model on the training set.
# Initialize decision tree classifier with a max_depth of 3
clf = tree.DecisionTreeClassifier(max_depth=3)
# choose the columns you want to train the model on (features)
cols = ['fixed acidity', 
 'volatile acidity', 
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']

In [124]:
# Train the model on the provided features and mapped quality from before
clf.fit(df_train[cols], df_train.quality)

In [128]:
# Testing the accuracy of the model on the training set and test set
# Generate predictions on the training set
train_predictions = clf.predict(df_train[cols])

# Generate predictions on the testing set
test_predictions = clf.predict(df_test[cols])

# Calculate the accuracy of predictions on the training data set
train_accuracy = metrics.accuracy_score(df_train.quality, train_predictions)

# Calculate the accuracy of predictions on the test data set
test_accuracy = metrics.accuracy_score(df_test.quality, test_predictions)
print(f"Train and Test acuracy with a max depth of 3: {train_accuracy}, {test_accuracy}")

Train and Test acuracy with a max depth of 3: 0.618, 0.5459098497495827


In [129]:
# Increase max_depth to 7
clf = tree.DecisionTreeClassifier(max_depth=7)

# Train the model again
clf.fit(df_train[cols], df_train.quality)

In [130]:
# Calculate Accuracy when max depth is 7
# Testing the accuracy of the model on the training set and test set
# Generate predictions on the training set
train_predictions = clf.predict(df_train[cols])

# Generate predictions on the testing set
test_predictions = clf.predict(df_test[cols])

# Calculate the accuracy of predictions on the training data set
train_accuracy = metrics.accuracy_score(df_train.quality, train_predictions)

# Calculate the accuracy of predictions on the test data set
test_accuracy = metrics.accuracy_score(df_test.quality, test_predictions)
print(f"Train and Test acuracy with a max depth of 3: {train_accuracy}, {test_accuracy}")

Train and Test acuracy with a max depth of 3: 0.777, 0.5442404006677797
