In [2]:
import pandas as pd
import numpy as np
import random

SEED = 1408
random.seed(SEED)
np.random.seed(SEED)

In [3]:
train = pd.read_csv('data/wine.csv')

print('Train shape:', train.shape)
train.head()

Train shape: (178, 14)


Unnamed: 0,Cultivar,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


### Get correlation

In [4]:
train.corr()

Unnamed: 0,Cultivar,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
Cultivar,1.0,-0.328222,0.437776,-0.049643,0.517859,-0.209179,-0.719163,-0.847498,0.489109,-0.49913,0.265668,-0.617369,-0.78823,-0.633717
Alcohol,-0.328222,1.0,0.094397,0.211545,-0.310235,0.270798,0.289101,0.236815,-0.155929,0.136698,0.546364,-0.071747,0.072343,0.64372
Malic acid,0.437776,0.094397,1.0,0.164045,0.2885,-0.054575,-0.335167,-0.411007,0.292977,-0.220746,0.248985,-0.561296,-0.36871,-0.192011
Ash,-0.049643,0.211545,0.164045,1.0,0.443367,0.286587,0.12898,0.115077,0.18623,0.009652,0.258887,-0.074667,0.003911,0.223626
Alcalinity of ash,0.517859,-0.310235,0.2885,0.443367,1.0,-0.083333,-0.321113,-0.35137,0.361922,-0.197327,0.018732,-0.273955,-0.276769,-0.440597
Magnesium,-0.209179,0.270798,-0.054575,0.286587,-0.083333,1.0,0.214401,0.195784,-0.256294,0.236441,0.19995,0.055398,0.066004,0.393351
Total phenols,-0.719163,0.289101,-0.335167,0.12898,-0.321113,0.214401,1.0,0.864564,-0.449935,0.612413,-0.055136,0.433681,0.699949,0.498115
Flavanoids,-0.847498,0.236815,-0.411007,0.115077,-0.35137,0.195784,0.864564,1.0,-0.5379,0.652692,-0.172379,0.543479,0.787194,0.494193
Nonflavanoid phenols,0.489109,-0.155929,0.292977,0.18623,0.361922,-0.256294,-0.449935,-0.5379,1.0,-0.365845,0.139057,-0.26264,-0.50327,-0.311385
Proanthocyanins,-0.49913,0.136698,-0.220746,0.009652,-0.197327,0.236441,0.612413,0.652692,-0.365845,1.0,-0.02525,0.295544,0.519067,0.330417


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix

X = train.loc[:, train.columns != 'Cultivar']
y = train['Cultivar']

In [14]:
clf = LogisticRegression(max_iter=100, solver='lbfgs', n_jobs=-1, multi_class='auto')
clf.fit(X, y)
print('Confusion matrix:', confusion_matrix(y, clf.predict(X)))
print('Classification report for test data:')
print(classification_report(y, clf.predict(X)))

Confusion matrix: [[56  3  0]
 [ 0 69  2]
 [ 0  1 47]]
Classification report for test data:
              precision    recall  f1-score   support

           1       1.00      0.95      0.97        59
           2       0.95      0.97      0.96        71
           3       0.96      0.98      0.97        48

   micro avg       0.97      0.97      0.97       178
   macro avg       0.97      0.97      0.97       178
weighted avg       0.97      0.97      0.97       178



In [11]:
coefficients = pd.concat([pd.DataFrame(X.columns),pd.DataFrame(np.transpose(clf.coef_))], axis = 1)
coefficients.columns = ['feature', 'cultivar1', 'cultivar2', 'cultivar3']
print(coefficients)

                         feature  cultivar1  cultivar2  cultivar3
0                        Alcohol  -0.577782   0.935582  -0.321107
1                     Malic acid   0.833828  -1.302433   0.696126
2                            Ash   1.045972  -0.415574   0.046326
3              Alcalinity of ash  -0.597846   0.234376   0.122564
4                      Magnesium  -0.024850   0.006722   0.019377
5                  Total phenols   0.167294   0.236705  -0.761681
6                     Flavanoids   1.384629   0.429217  -1.878588
7           Nonflavanoid phenols   0.067346   0.156137  -0.031422
8                Proanthocyanins  -0.451441   0.423758  -0.743205
9                Color intensity  -0.133532  -1.868699   1.075389
10                           Hue  -0.138724   0.538145  -0.441925
11  OD280/OD315 of diluted wines   0.910259   0.270694  -1.221468
12                       Proline   0.015641  -0.013383   0.000175


In [12]:
clf = KNeighborsClassifier(n_jobs=-1)
clf.fit(X, y)
print('Confusion matrix:', confusion_matrix(y, clf.predict(X)))
print('Classification report for test data:')
print(classification_report(y, clf.predict(X)))

Confusion matrix: [[53  1  5]
 [ 6 53 12]
 [ 2 12 34]]
Classification report for test data:
              precision    recall  f1-score   support

           1       0.87      0.90      0.88        59
           2       0.80      0.75      0.77        71
           3       0.67      0.71      0.69        48

   micro avg       0.79      0.79      0.79       178
   macro avg       0.78      0.78      0.78       178
weighted avg       0.79      0.79      0.79       178



In [13]:
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
clf.fit(X, y)
print('Confusion matrix:', confusion_matrix(y, clf.predict(X)))
print('Classification report for test data:')
print(classification_report(y, clf.predict(X)))

Confusion matrix: [[59  0  0]
 [ 0 71  0]
 [ 0  0 48]]
Classification report for test data:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        59
           2       1.00      1.00      1.00        71
           3       1.00      1.00      1.00        48

   micro avg       1.00      1.00      1.00       178
   macro avg       1.00      1.00      1.00       178
weighted avg       1.00      1.00      1.00       178

