In [4]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [5]:
col_names = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulfates', 'alcohol', 'quality']
# load dataset
red = pd.read_csv('winequality-red.csv', delimiter = ',', header=None, names=col_names)

In [6]:
red.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulfates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [7]:
#split dataset in features and target variable
feature_cols = ['fixed acidity', 'volatile acidity', 'citric acid', 'chlorides','total sulfur dioxide','density','sulfates', 'alcohol']
X = red[feature_cols] # Features
y = red.quality # Target variable

In [8]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

In [9]:
def gini(X_train, y_train): 
    # Creating the classifier object 
    clf_gini = DecisionTreeClassifier(criterion = "gini", 
    random_state = 100,max_depth=10, min_samples_leaf=10) 
  
    # Performing training 
    clf_gini.fit(X_train, y_train) 
    return clf_gini

In [37]:
def entropy(X_train, y_train):
    
    # Decision tree with entropy 
    clf_entropy = DecisionTreeClassifier( 
            criterion = "entropy", random_state = 100, 
            max_depth = 10, min_samples_leaf = 10) 
  
    #Performing training 
    clf_entropy.fit(X_train, y_train) 
    return clf_entropy

In [38]:
def prediction(clf_model, X_test):
    # Predicton on test with giniIndex 
    y_pred = clf_model.predict(X_test) 
    print("Predicted values:") 
    print(y_pred)
    return y_pred

In [39]:
clf_gini = gini(X_train, y_train) 
clf_entropy = entropy(X_train, y_train) 

In [40]:
# Prediction using gini 
y_pred_gini = prediction(clf_gini, X_test) 
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_gini))

Predicted values:
[5 6 7 6 6 7 6 5 6 5 6 5 6 7 6 5 6 6 5 5 5 6 5 6 5 6 7 6 5 6 5 7 5 6 5 6 5
 6 6 5 6 5 5 5 5 6 5 6 6 6 5 5 7 5 5 6 5 6 6 5 6 5 6 5 5 6 6 5 6 7 5 6 5 6
 5 5 6 5 5 6 7 5 5 6 6 5 5 7 5 6 5 6 5 7 6 6 7 5 5 5 5 6 6 5 5 6 5 6 5 5 7
 5 6 6 5 7 5 5 5 6 5 6 5 6 5 6 6 6 6 6 5 6 7 5 5 6 6 7 5 6 5 5 6 6 5 5 6 5
 6 6 5 7 5 6 6 6 6 6 5 5 7 6 5 5 6 6 6 5 5 5 5 5 5 5 6 5 6 5 5 6 6 6 7 5 6
 6 7 5 6 5 5 7 6 5 5 5 6 5 6 5 6 5 6 5 7 6 5 5 6 6 5 7 6 6 5 5 5 5 5 6 5 5
 6 5 6 5 5 6 7 5 6 6 6 6 5 5 7 5 5 5 6 5 6 6 5 6 6 5 5 6 5 5 6 6 6 5 5 5 6
 6 6 5 6 5 5 6 6 5 6 6 5 6 5 7 5 5 5 5 5 6 6 7 5 5 6 5 6 5 5 5 6 6 6 5 7 6
 5 7 5 5 5 6 5 6 5 6 5 6 6 5 5 6 6 5 6 6 5 5 6 7 6 6 7 5 5 5 5 7 6 6 7 5 5
 6 6 5 6 6 5 5 5 5 5 5 6 7 6 5 5 5 7 7 6 7 5 5 6 7 6 5 5 6 5 6 5 5 5 5 6 5
 5 6 5 5 5 6 6 7 5 7 6 5 6 5 6 6 5 6 6 5 5 5 6 6 5 6 5 6 5 5 6 5 6 6 5 6 5
 5 5 6 5 5 6 5 6 6 6 7 6 6 6 6 6 6 5 5 7 6 5 7 5 5 5 5 5 5 6 6 5 6 7 5 6 7
 6 5 5 5 5 5 5 5 5 5 6 5 6 5 7 5 5 5 5 7 6 6 5 5 6 7 6 5 5 7 5 6 6 6 6 7]
Accuracy

In [41]:
# Prediction using entropy
y_pred_entropy = prediction(clf_entropy, X_test) 
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_entropy))

Predicted values:
[5 6 6 6 6 6 6 5 4 5 6 5 5 7 5 6 6 5 5 5 6 6 5 6 6 6 6 5 6 6 5 7 6 7 5 6 5
 6 6 5 6 5 5 4 5 6 5 5 5 6 5 5 6 5 6 6 6 6 6 5 7 5 5 5 5 6 7 6 6 6 5 6 6 5
 5 5 6 5 5 6 5 5 5 6 6 5 5 7 5 6 5 6 5 7 6 5 6 5 5 5 5 5 5 5 5 5 5 6 5 5 7
 5 6 5 5 7 5 5 5 5 5 6 5 5 6 6 5 5 6 6 5 5 5 5 6 6 5 6 5 6 5 5 6 5 6 6 6 5
 5 6 5 7 6 6 6 6 6 6 5 5 6 6 5 5 6 6 5 5 5 6 6 5 5 5 6 5 6 5 5 5 5 6 6 5 5
 6 6 5 6 5 6 6 6 5 5 5 6 5 5 5 6 5 6 5 7 6 5 5 6 6 5 6 5 7 5 6 5 5 6 6 5 5
 5 5 6 5 5 6 7 5 5 6 6 6 5 5 6 5 6 5 6 5 6 6 6 7 5 5 5 5 5 5 7 5 6 5 5 5 6
 5 6 5 6 6 6 5 5 5 5 6 5 5 5 7 5 5 5 5 5 5 6 6 5 5 5 5 6 5 5 6 6 6 6 6 7 7
 5 6 5 5 4 6 5 5 5 5 5 6 5 5 5 6 5 6 5 5 5 5 6 6 5 6 7 5 5 6 5 7 6 6 7 5 5
 6 5 5 6 7 5 6 5 5 5 4 6 7 6 5 6 5 6 7 6 6 6 5 7 7 5 5 6 5 5 6 5 5 5 6 5 5
 5 6 5 5 5 6 7 7 6 6 6 5 6 6 6 7 5 6 5 5 5 5 6 6 5 6 5 6 5 6 5 5 5 6 5 6 6
 4 5 6 4 5 5 5 5 6 5 6 5 5 6 6 6 5 6 6 6 5 5 7 5 5 5 5 5 5 5 5 5 6 5 5 6 7
 5 5 6 5 6 5 5 5 7 6 5 6 6 5 6 5 5 5 6 5 5 6 5 6 6 7 5 6 6 6 5 6 6 5 6 6]
Accuracy

In [42]:
#Confusion matrix for entropy
y_te = pd.Series(y_test)
y_pe = pd.Series(y_pred_entropy)
pd.crosstab(y_te, y_pe, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,4,5,6,7,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,0,2,2,0,4
5,0,39,29,2,70
6,0,30,17,5,52
7,1,7,5,1,14
8,0,1,0,0,1
All,1,79,53,8,141


In [43]:
#Confusion matrix for Gini Index
y_tgi = pd.Series(y_test)
y_pgi = pd.Series(y_pred_gini)
pd.crosstab(y_tgi, y_pgi, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,5,6,7,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,1,3,0,4
5,38,28,4,70
6,24,19,9,52
7,5,8,1,14
8,1,0,0,1
All,69,58,14,141
