In [85]:
import os
import numpy as np
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import metrics

#import our libraries

In [86]:
os.chdir('C:\\Users\\An-94\\desktop\\ucr\\CS235\\project') #use os library to find the correct directory

In [87]:
red_wine = pd.read_csv("winequality-red-undelimited-preprocessed_2.csv", 
                           usecols = ['fixed acidity','volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'above_average'])

#import the dataset as pandas dataframe; exclude the wine quality column since we are designating a seperate column of binary
#values 1 and 0 where 1 represents the wines whos quality is equal to or above 7

In [42]:
red_wine.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,above_average
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0


In [88]:
x = red_wine.loc[:,red_wine.columns != "above_average"]
y = red_wine["above_average"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.20)

In [None]:
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)

In [None]:
y_predict = clf.predict(x_test)

In [None]:
recall = cross_val_score(clf, x_train, y_train, cv = 10, scoring = 'recall')
np.mean(recall)

In [None]:
np.std(recall) 

In [None]:
stats.sem(recall) #recall sem

In [None]:
cross_val_score(clf, x_train, y_train, cv = 10, scoring = 'precision')
precision = cross_val_score(clf, x_train, y_train, cv = 10, scoring = 'precision')
np.mean(precision)

In [None]:
np.std(precision) 

In [None]:
stats.sem(precision)

In [None]:
#------------------------------------------------------------------

In [None]:
#IMPLEMENTATION 

In [89]:
def partition(data, feature, boundary): #create a function the partition's data based on where it falls on the boundary condition
    true_rows = data[data[feature] >= boundary ]
    false_rows = data[data[feature] < boundary ]
    
    return true_rows, false_rows

In [90]:
def gini(data, target): #the gini impurity for calculating the impurity of parent, and child nodes

    counts = data[target].value_counts()
    impurity = 1
    
    for c in counts:
        prob_of_label = c/float(len(data))
        impurity -= prob_of_label**2
        
    return impurity

In [91]:
def info_gain(left, right, current_uncertainty, target): #the function to calculate information gain

    p = float(len(left))/(len(left)+len(right))
    split_uncertainty = p*gini(left, target) + (1-p)*gini(right, target)
    ig = current_uncertainty - split_uncertainty
    
    return ig

In [51]:
def best_split(data, target):#pick the best feature and boundary combination that produces the highest information gain 
    #initialize the best gain, feature, and boundary variables

    best_gain = 0
    best_feature = None
    best_boundary = None
    current_uncertainty = gini(data, target)
    features = data.columns.drop(target)

    for f in features: #interate over features
        if best_feature == None:
            best_feature = f
            
        boundaries = list(set(data[f]))

        for b in boundaries: #interate over boundaries
            if best_boundary == None:
                best_boundary = b
                
            tr, fr = partition(data, f, b) #output the true rows and false rows at feature and boundary combination
            
            if (len(tr) == 0) or (len(fr) == 0): #if there are no true rows or false rows continue
                continue

            gain = info_gain(tr, fr, current_uncertainty, target) #fnd the information gain at the current impurity level

            if gain >= best_gain: #if the current information gain is greater than the best gain then make that the new best gain
                #(along with its feature and boundary)
                best_gain, best_feature, best_boundary = gain, f, b

    return best_gain, best_feature, best_boundary

In [92]:
def make_node(data, target, split_level, min_leaf, max_depth):
    #create leaf nodes or decision nodes based on whether there was information gain
    gain, feature, boundary = best_split(data, target)
    node = {
            'info_gain': gain, 
            'size': len(data), 
            'feature': feature, 
            'boundary': boundary
           }
    # create a node variable with keys info_gain, size, feature, and boundary
    if (gain == 0) or (len(data) <= min_leaf) or (split_level > max_depth):
        node['node_type'] = 'leaf' #if all conditions(no information gain, length less than min for leaf, and less than max depth)
        # are met then create a node type called leaf
        prediction = {} #create a dictionary called prediction
        counts = data[target].value_counts() #get the counts of the classes
        prediction = 0
        prediction_chance = 0
        for i in counts.index: #for rows in the counts index which consists of two classes
            prediction_chance_i = counts[i]/float(len(data)) #find the general probability of the class at that index
            if prediction_chance_i > prediction_chance: #if the general prob is greater than the current prob 
                prediction_chance = prediction_chance_i #assign the new prob to prediction chance
                prediction = i #and make row equal to the prediction
        node['prediction'] = prediction #after that assign the prediction row class to column prediction
        node['prediction_chance'] = prediction_chance #and assign the new probability to the prediction chance column 
        
    elif gain > 0:
        node['node_type'] = 'branch' #if the information gain is greater than zero then create a node type called a branch
        
    return node

In [95]:
def get_metrics(pred, target='above_average'):
    
    correct_preds = pred[pred[target]==pred['predictions']] #true positive or true negative
    correct = len(correct_preds)  # # of correct predictions
    correct_pos_preds = correct_preds[correct_preds[target]== 1] #true positives
    correct_positive = len(correct_pos_preds) # # of true positives
    positive_actual = len(pred[pred[target]== 1]) #counts where real value is positive
    positive_pred = len(pred[pred['predictions']== 1]) #counts where predicted value is positive
    total = len(pred) #total number of predictions
    acc = correct/total #percent of predictions that were correct
    precision = correct_positive/positive_pred  #percentage of how often a classifier is right about the prediction
    recall = correct_positive/positive_actual #percentage of how often a classifier successfully predict the real value
    f1 = 2*precision*recall/(precision+recall) #harmonic mean of precision and recall
    print('accuracy:',round(acc, 4),', precision:',round(precision, 4),', recall:',round(recall, 4),', f1:',round(f1, 4))
    
    return acc, precision, recall, f1

In [72]:
best_split(red_wine, 'above_average')

(0.03431894532028967, 'alcohol', 11.6)

In [73]:
make_node(red_wine, 'above_average', 0, 5, 7)

{'info_gain': 0.03431894532028967,
 'size': 1599,
 'feature': 'alcohol',
 'boundary': 11.6,
 'node_type': 'branch'}

In [61]:
def dtree(data = {}, decision_tree = {}, 
          split_level = 0, max_depth = 7, min_leaf = 5, 
          target = 'above_average'):
    
    if split_level == 0: #if the depth is at the root node than continue
        root = make_node(data, target, split_level, min_leaf, max_depth) #create a node and set it to the root variable
        root['node_type'] = 'root' #create a column in root variable called 'node type' and assign string 'root' to it
        decision_tree[split_level] = [(data, root)] #the decision tree is a dictionary where every level consists of data and 
        #node; the data is the partitioned data for each node
        
    decision_tree[split_level + 1] = [] #add 1 to the split level after assignment of data and node to the root
    keep_splitting = 0 
    
    
    for d, n in decision_tree[split_level]: #d is the interator for the data and n is the interator for the nodes
        
        if (n['node_type'] != 'leaf'): #if the node is not a leaf it is a decision node
            
            gain, feature, boundary = best_split(d, target) #assign the best gain, feature and boundary at best split
            tr, fr = partition(d, feature, boundary) #assign true rows(rows greater than or equal to that particular boundary) 
            #and false rows(rows less than boundary) at that split

            right_branch = make_node(tr, target, split_level + 1, min_leaf, max_depth) #create the right branch of the tree with
            #true rows and add one to split level
            n['right_child_feature'] = right_branch['feature'] #for the node create a column right child feature and assign
            # the feature variable for the left branches
            n['right_child_boundary'] = right_branch['boundary'] #for the node create a column right child boundary and assign
            # the boundary condition for the right branches
            decision_tree[split_level + 1].append((tr, right_branch)) #append to the decision tree the true rows and the left
            #branch and add one to the split level or depth
            if (right_branch['node_type'] == 'branch'): #if the node is a left branch keep splitting
                keep_splitting = 1

            left_branch = make_node(fr, target, split_level + 1, min_leaf, max_depth)  #create the left branch of the tree with
            #false rows and add one to split level
            n['left_child_feature'] = left_branch['feature'] #for the node create a column right child feature and assign
            # the feature variable for the right branches
            n['left_child_boundary'] = left_branch['boundary'] #for the node create a column right child boundary and assign
            # the boundary condition for the right branches
            decision_tree[split_level + 1].append((fr, left_branch)) #append to the decision tree the true rows and the right
            #branch and add one to the split level
            if (left_branch['node_type'] == 'branch'):  #if the node type is a right branch keep splitting
                keep_splitting = 1
    
    split_level += 1 #after the left branches and right branches are created move to the next depth level
    #print('tree level', split_level, 'complete')
    
    if (keep_splitting == 1): #a recursive function that output the results of the decision tree at each split level
        decision_tree, split_level = dtree(decision_tree = decision_tree, split_level = split_level)
        
    return decision_tree, split_level

In [96]:
dt, sl = dtree(data=train_df.copy(), target='above_average')

In [77]:
a = dtree(red_wine)

In [97]:
def dtree_predict(data, tree, target = 'quality_class'):

    predictions = {}
            
    for i in range(len(data)): #iterate over the length of the data
        
        x = data.iloc[i] #assign the data at each i row to x
        
        for s in range(len(tree)):  #interate over the split levels of the tree dictionary
            
            if (s == 0): #if the level is at the root node
                node = tree[s][0][1] # the first index represent the whole tree, the second represent the # of branches of the tree
                #, and the last represents a single node at the split level in a branch
                current_feature = node['feature'] #set the current feature and boundary
                current_boundary = node['boundary']
                
            for n in range(len(tree[s])):  #for node in each length split levels
                
                node = tree[s][n][1]
                
                if ((node['feature'] == current_feature) and (node['boundary'] == current_boundary)):
                    
                    if (node['node_type']!='leaf'):   #if the node is not a leaf 
                        
                        if (x[current_feature] >= current_boundary): #if the row in the current feature is greater than 
                            # or euqal to the boundary, append the features and boundaries from the right child feature and boundary columns
                            next_feature = node['right_child_feature']
                            next_boundary = node['right_child_boundary']

                        elif (x[current_feature] < current_boundary): #if the row in the current feature is less than the boundary,
                            #append the features and boundaries from the left child feature and boundary columns
                            next_feature = node['left_child_feature']
                            next_boundary = node['left_child_boundary']
                            
                    if (node['node_type']=='leaf'):
                        prediction = node['prediction']  #if the node type is a leaf make a prediction
        
            current_feature = next_feature #new feature and boundary of the previous depth becomes the current feature and boundary
            # of the current depth
            current_boundary = next_boundary
        
        predictions[i] = prediction
        
    data['predictions'] = pd.Series(predictions) #generate a series of predictions and append to the predictions column in the 
    #dataframe

    return data

In [98]:
dtpreds = dtree_predict(test_df.copy(), dt, target = 'above_average')

In [100]:
from sklearn.model_selection import StratifiedKFold 

skf = StratifiedKFold(n_splits=10) #function for stratified 10 fold
skf.get_n_splits(x.copy(), y.copy()) # returns the number of splitting iterations in the cross-validator
print(skf)

StratifiedKFold(n_splits=10, random_state=None, shuffle=False)


In [101]:
fold = 1
skf_df = {} 

for train_index, test_index in skf.split(x, y): #split into k folds cross validation
    
    train_df = pd.concat([x.iloc[train_index], y.iloc[train_index]], axis=1).reset_index(drop=True)
    test_df = pd.concat([x.iloc[test_index], y.iloc[test_index]], axis=1).reset_index(drop=True)
    skf_df[fold] = [train_df, test_df]
    fold+=1

#skf_df[fold #][0=train, 1=test]

In [102]:
dt_cv_results_minority = [] #initilize the lists the will be result of the cross validation

for fold in skf_df: #iterate over folds in the dictionary skf_df in which the indexes of the x and y cols and train and testsets are stored
    kf_train_df = skf_df[fold][0] #iterated over the selected train dataframes and assign to a variable
    kf_test_df = skf_df[fold][1] #iterated over the selected test dataframes and assign to a variable
    
    dt, sl = dtree(data=kf_train_df, target='above_average') #put the training dataset into the tree function
    tpred = dtree_predict(kf_test_df, dt, target = 'above_average') #take the prediction results and testing data labels and 
    #put them in a list 

    fold_results_minority = [get_metrics(pred=tpred, target='above_average')] #figure out the performance 
    #given the predicted and test values
    dt_cv_results_minority.append(fold_results_minority) #append each fold results into a list

accuracy: 0.8812 , precision: 0.625 , recall: 0.2381 , f1: 0.3448
accuracy: 0.8438 , precision: 0.3 , recall: 0.1429 , f1: 0.1935
accuracy: 0.8375 , precision: 0.4286 , recall: 0.5455 , f1: 0.48
accuracy: 0.8438 , precision: 0.4286 , recall: 0.4091 , f1: 0.4186
accuracy: 0.925 , precision: 0.8125 , recall: 0.5909 , f1: 0.6842
accuracy: 0.8125 , precision: 0.1667 , recall: 0.0909 , f1: 0.1176
accuracy: 0.8187 , precision: 0.3704 , recall: 0.4545 , f1: 0.4082
accuracy: 0.8313 , precision: 0.3913 , recall: 0.4091 , f1: 0.4
accuracy: 0.4562 , precision: 0.0988 , recall: 0.3636 , f1: 0.1553
accuracy: 0.8491 , precision: 0.2 , recall: 0.0476 , f1: 0.0769


In [103]:
#process the output to a more usable and appealing form

dt_cv_array = np.asarray(dt_cv_results_minority)
dt_cv_array = np.mean(dt_cv_array, axis = 1)
dt_cv_df = pd.DataFrame(dt_cv_array)
dt_cv_df

per_report_df = dt_cv_df.rename(columns={ 0 : "accuracy",  1 : "precision",  2 : "recall", 3 : "f1"})
per_report_mean  = per_report_df.mean(axis = 0) #calculate the mean of the k folds for the performance scores 
print(per_report_df)
print(per_report_mean)

   accuracy  precision    recall        f1
0  0.881250   0.625000  0.238095  0.344828
1  0.843750   0.300000  0.142857  0.193548
2  0.837500   0.428571  0.545455  0.480000
3  0.843750   0.428571  0.409091  0.418605
4  0.925000   0.812500  0.590909  0.684211
5  0.812500   0.166667  0.090909  0.117647
6  0.818750   0.370370  0.454545  0.408163
7  0.831250   0.391304  0.409091  0.400000
8  0.456250   0.098765  0.363636  0.155340
9  0.849057   0.200000  0.047619  0.076923
accuracy     0.809906
precision    0.382175
recall       0.329221
f1           0.327926
dtype: float64


In [110]:
from scipy import stats

stats.sem(per_report_df) #standard error of the mean


array([0.04062494, 0.06793598, 0.06000132, 0.0601174 ])

In [104]:
per_report_df #results of k folds 

Unnamed: 0,accuracy,precision,recall,f1
0,0.88125,0.625,0.238095,0.344828
1,0.84375,0.3,0.142857,0.193548
2,0.8375,0.428571,0.545455,0.48
3,0.84375,0.428571,0.409091,0.418605
4,0.925,0.8125,0.590909,0.684211
5,0.8125,0.166667,0.090909,0.117647
6,0.81875,0.37037,0.454545,0.408163
7,0.83125,0.391304,0.409091,0.4
8,0.45625,0.098765,0.363636,0.15534
9,0.849057,0.2,0.047619,0.076923


In [105]:
per_report_mean #mean of the performance scores

accuracy     0.809906
precision    0.382175
recall       0.329221
f1           0.327926
dtype: float64

In [107]:
per_report_stdev = per_report_df.std(axis = 0) #standard deviation of the performance
per_report_stdev

accuracy     0.128467
precision    0.214832
recall       0.189741
f1           0.190108
dtype: float64

In [None]:
#.87 accuracy for the off the shelf + or - 0.018
#.81 accuracy for the custom + or - 0.13

In [108]:
from sklearn.metrics import confusion_matrix

confusion_matrix(tpred['above_average'], tpred['predictions']) #confusion matrix for comparing the test data with predictions

array([[134,   4],
       [ 20,   1]], dtype=int64)