In [20]:
import os
import numpy as np
import pandas as pd
import csv

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score


from sklearn.tree import DecisionTreeClassifier

In [4]:
os.chdir('C:\\Users\\An-94\\desktop\\ucr\\CS235\\project') #use os library to find the correct directory

In [11]:
#this is the full dataset
#wine quality marked at or above 7 are labeled 1 and those below are labeled 0 on the above_average column
red_wine_full = pd.read_csv("winequality-red-undelimited-preprocessed_2-thanzinN.csv") 
red_wine_full.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,above_average
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0


In [12]:
#import the dataset as pandas dataframe; exclude the wine quality column since I am designating a seperate column of binary values as the target variable

red_wine = pd.read_csv("winequality-red-undelimited-preprocessed_2-thanzinN.csv", 
                           usecols = ['fixed acidity','volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'above_average'])
red_wine.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,above_average
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0


In [14]:
x = red_wine.loc[:,red_wine.columns != "above_average"] #dataframe of feature variables
y = red_wine["above_average"] #dataframe of the target variable 

## I want to then split these dataframes into the testing and training data
## 20 percent of data reserved for testing and the rest for training

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.20)

In [16]:
y.value_counts() #find the counts of classes in the target variable

0    1382
1     217
Name: above_average, dtype: int64

## As we can see, this is a highly imbalanced dataset as one class has many more values than the other.
## An imblanced dataset will be highly biased towards the majority class. Therefore I will employ an oversampling technique where I oversample the minority class to reduce this bias

In [33]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42) #a function to automatically oversample the overrepresented class
x_train, y_train = sm.fit_resample(x_train, y_train)



#rus = RandomUnderSampler(random_state=42)


## I also want to undersample the majority class to reduce bias further

In [34]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
x_train, y_train = rus.fit_resample(x_train, y_train)

## Now lets train the decision tree classifier

In [35]:
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)

## Now I will utilize the testing data to test the prediction of the classifier

In [36]:
y_predict = clf.predict(x_test)

In [37]:
accuracy_score(y_test, y_predict)

0.834375

## The classifier has an accuracy score of 84 percent

In [38]:
precision_score(y_test, y_predict)

0.421875

## a precision score of 44 percent

In [39]:
recall_score(y_test, y_predict)

0.627906976744186

## a recall score of 63 percent

In [40]:
f1_score(y_test, y_predict)

0.5046728971962616

## and a f1 score of 51 percent

## Can we do better? How about using utlilizing cross validation instead of simple train test split?

In [43]:
from sklearn.model_selection import StratifiedKFold


In [74]:
def score_model(model, cv= None):
    if cv == None:
        cv = StratifiedKFold(n_splits= 10, random_state= None)
        sm = SMOTE(random_state= None)
        rus = RandomUnderSampler(random_state= None)
        
        r_scores = [] #recall scores
        p_scores = [] #precision scores
        f_scores = [] #f1 scores

    for train_fold_index, val_fold_index in cv.split(x_train, y_train):
        # Get the training data
        x_train_fold, y_train_fold = x_train.iloc[train_fold_index], y_train[train_fold_index]
        # Get the validation data
        x_val_fold, y_val_fold = x_train.iloc[val_fold_index], y_train[val_fold_index]

        # oversample only the data in the training section
        x_train_fold_os, y_train_fold_os = sm.fit_resample(x_train_fold, y_train_fold)
        # undersample only the data in the training section
        x_train_fold_os, y_train_fold_os = rus.fit_resample(x_train_fold, y_train_fold)
        
        # Fit the model on the over and undersampled training data
        model_obj = model.fit(x_train_fold_os, y_train_fold_os)
        # Score the model on the validation data
        r_score = recall_score(y_val_fold, model_obj.predict(x_val_fold))
        r_scores.append(r_score)
        # same thing on the precision and f1 score
        p_score = precision_score(y_val_fold, model_obj.predict(x_val_fold))
        p_scores.append(p_score)
        
        f_score = f1_score(y_val_fold, model_obj.predict(x_val_fold))
        f_scores.append(f_score)
        
        avg_r_scores = sum(r_scores)/len(r_scores) #find the averages of the 10 folds for each performance metric 
        avg_p_scores = sum(p_scores)/len(p_scores)
        avg_f_scores = sum(f_scores)/len(f_scores)
        

    return print("The recall is " + str(sum(r_scores)/len(r_scores)), 
                 "The precision is " + str(sum(p_scores)/len(p_scores)), 
                 "The f1 is " + str(sum(f_scores)/len(f_scores)))
        

In [75]:
score_model(clf)

The recall is 0.8995167895167896 The precision is 0.8904800841635805 The f1 is 0.8946559205806762


## We can see that using stratified 10 fold cross validation to partition the data into training and test sets, we were able to achieve much higher performance scores of 90 percent, 89 percent, and 89 percent for the recall, precision, and f1 respectively