In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
from sklearn.datasets import *

In [3]:
# Load the wine dataset; load_wine is brought in by the sklearn.datasets import.
# The returned object exposes data, target, feature_names and DESCR.
wine = load_wine()

In [4]:
# Print the dataset's bundled description text
print(wine['DESCR'])

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [5]:
# Feature matrix: one row per wine sample, one column per attribute
X = wine.data
X

array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]])

In [6]:
# Class labels (0, 1 or 2), one per wine sample
y = wine.target
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [7]:
# Hold-out split helper from scikit-learn
from sklearn.model_selection import train_test_split

# Partition the samples into 80% training / 20% test data.
# The fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    wine.data,
    wine.target,
    test_size=0.2,
    random_state=109,
)

In [8]:
# Tabular view of the dataset: one named column per feature ...
wine_df = pd.DataFrame(data=wine['data'], columns=wine['feature_names'])
# ... plus the class label as the last column
wine_df['class'] = wine['target']

In [9]:
# Preview the first five rows of the frame
wine_df.head(n=5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,class
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [10]:
# Modeling:
# Baseline K-nearest-neighbours model with K=3

# KNN classifier implementation from scikit-learn
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): the features are used unscaled; since KNN is distance based,
# large-magnitude columns such as proline likely dominate - consider scaling.

# Build the classifier and train it on the training split in one step
# (fit returns the fitted estimator itself)
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predicted class labels for the held-out test samples
y_pred = knn.predict(X_test)

In [11]:
# Display the predicted test-set class labels currently held in y_pred
y_pred

array([0, 2, 0, 2, 0, 1, 0, 0, 1, 0, 1, 0, 2, 0, 0, 2, 1, 2, 0, 2, 2, 2,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 2, 2])

In [12]:
# Model evaluation for the predictions currently held in y_pred
# (the model fitted earlier uses n_neighbors=3)

# scikit-learn's metrics module provides the accuracy calculation
from sklearn import metrics

# Accuracy: fraction of test samples whose predicted label matches the true label
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 0.6111111111111112


In [13]:
# Sweep k = 1..20: fit one KNN classifier per k and print its test-set accuracy.
# NOTE(review): k is compared on the same test split that is later used for the
# final evaluation, so the best score is optimistic; consider cross-validation
# on the training data instead.

# Imports are loop-invariant, so they are done once before the loop
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

for k in range(1, 21):
    # Create and train a KNN classifier with the current number of neighbours
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Predict the response for the test dataset
    y_pred = knn.predict(X_test)

    # Model accuracy: how often is the classifier correct?
    print('accuracy for k= ' , k , 'is:',metrics.accuracy_score(y_test, y_pred))

# After the loop, knn and y_pred hold the last model tried (k=20).

accuracy for k=  1 is: 0.6111111111111112
accuracy for k=  2 is: 0.6111111111111112
accuracy for k=  3 is: 0.6111111111111112
accuracy for k=  4 is: 0.6111111111111112
accuracy for k=  5 is: 0.6111111111111112
accuracy for k=  6 is: 0.6388888888888888
accuracy for k=  7 is: 0.6111111111111112
accuracy for k=  8 is: 0.6666666666666666
accuracy for k=  9 is: 0.6666666666666666
accuracy for k=  10 is: 0.6666666666666666
accuracy for k=  11 is: 0.6666666666666666
accuracy for k=  12 is: 0.6944444444444444
accuracy for k=  13 is: 0.6944444444444444
accuracy for k=  14 is: 0.6944444444444444
accuracy for k=  15 is: 0.6666666666666666
accuracy for k=  16 is: 0.6388888888888888
accuracy for k=  17 is: 0.6666666666666666
accuracy for k=  18 is: 0.6388888888888888
accuracy for k=  19 is: 0.6666666666666666
accuracy for k=  20 is: 0.6666666666666666


In [14]:
# Refit with k=12 - presumably chosen because it is the smallest k that reached
# the highest accuracy in the sweep output; confirm.
# NOTE(review): that choice was made on the test split, so the scores reported
# for this model are optimistic.
from sklearn.neighbors import KNeighborsClassifier

# Build the classifier and train it on the training split in one step
# (fit returns the fitted estimator itself)
knn = KNeighborsClassifier(n_neighbors=12).fit(X_train, y_train)

# Predicted class labels for the held-out test samples
y_pred = knn.predict(X_test)

In [15]:
# Display the predicted test-set class labels currently held in y_pred
y_pred

array([0, 2, 2, 1, 0, 1, 0, 0, 1, 0, 1, 0, 2, 2, 0, 2, 2, 0, 0, 2, 2, 1,
       0, 2, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 2])

In [17]:
# classification_report is imported here as well; a later cell uses it
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the current predictions:
# rows are the true classes, columns the predicted classes
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[12,  0,  1],
       [ 2,  9,  5],
       [ 0,  3,  4]], dtype=int64)

In [18]:
# Per-class precision, recall, F1 and support for the current predictions
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.86      0.92      0.89        13
           1       0.75      0.56      0.64        16
           2       0.40      0.57      0.47         7

   micro avg       0.69      0.69      0.69        36
   macro avg       0.67      0.69      0.67        36
weighted avg       0.72      0.69      0.70        36

