In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the wine dataset object; later cells read its .data, .target,
# .feature_names and .DESCR attributes.
wine = load_wine()

In [3]:
# Feature matrix and class labels taken from the loaded dataset.
X, y = wine.data, wine.target

In [4]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

(178, 13)

In [5]:
# Display the full target vector (one class label per sample).
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [6]:
# Feature names shipped with the dataset, in column order.
wine.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

In [7]:
# Split the dataset into training and test sets

In [8]:
# Hold out part of the data for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=10,
)

In [9]:
# Print the description text bundled with the dataset.
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0.98  3.88    2.29  0.63
    Fl

In [10]:
# Baseline: k-NN with default settings on the raw (unscaled) features.
model = KNeighborsClassifier().fit(X_train, y_train)

# Accuracy on the data the model was fitted on vs. the held-out split.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print('Train Score:', train_accuracy)
print('Test Score:', test_accuracy)

Train Score: 0.8270676691729323
Test Score: 0.6888888888888889


In [11]:
# Same k-NN model, this time on standardized features.
# The scaler is fitted on the training split only and then applied to both
# splits, so the test rows do not influence the scaling parameters.
sc = StandardScaler().fit(X_train)
X_train_new = sc.transform(X_train)
X_test_new = sc.transform(X_test)

model = KNeighborsClassifier().fit(X_train_new, y_train)
train_accuracy = model.score(X_train_new, y_train)
test_accuracy = model.score(X_test_new, y_test)
print("Training score: ", train_accuracy)
print("Testing score: ", test_accuracy)

Training score:  0.9699248120300752
Testing score:  0.9111111111111111


# ANOVA test (analysis of variance): computes an F-score for every feature, measuring how strongly it is related to the target
> The ANOVA F-test is used when the target is categorical (classification) and the features are numerical.

> Works with classification targets.

In [12]:
from sklearn.feature_selection import f_classif

In [13]:
# f_classif returns two per-feature arrays: the ANOVA F-statistic (higher
# means the feature separates the classes better) and the matching p-value.
svalue, pvalue = f_classif(X, y)

In [14]:
# Print the feature names on one line so column indices can be matched to
# names when selecting columns by position.
print(wine.feature_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [15]:
# Drop the least informative feature, 'magnesium' (column index 4), and
# repeat the scaled k-NN experiment on the remaining 12 columns.
dropped = {4}
kept_columns = [i for i in range(X.shape[1]) if i not in dropped]
X_new = X[:, kept_columns]

X_train, X_test, y_train, y_test = train_test_split(X_new, y, random_state=10)

# Fit the scaler on the training split only, then apply it to both splits.
sc = StandardScaler().fit(X_train)
X_train_new = sc.transform(X_train)
X_test_new = sc.transform(X_test)

model = KNeighborsClassifier().fit(X_train_new, y_train)
train_accuracy = model.score(X_train_new, y_train)
test_accuracy = model.score(X_test_new, y_test)
print('Training score', train_accuracy)
print('Testing score', test_accuracy)

Training score 0.9849624060150376
Testing score 0.9333333333333333


In [16]:
# Drop 'ash' (column index 2) in addition to 'magnesium' (column index 4)
# and repeat the scaled k-NN experiment on the remaining 11 columns.
dropped = (2, 4)
kept_columns = [i for i in range(X.shape[1]) if i not in dropped]
X_new = X[:, kept_columns]

X_train, X_test, y_train, y_test = train_test_split(X_new, y, random_state=10)

# Fit the scaler on the training split only, then apply it to both splits.
sc = StandardScaler().fit(X_train)
X_train_new = sc.transform(X_train)
X_test_new = sc.transform(X_test)

model = KNeighborsClassifier().fit(X_train_new, y_train)
train_accuracy = model.score(X_train_new, y_train)
test_accuracy = model.score(X_test_new, y_test)
print('Training pred', train_accuracy)
print('Testing pred', test_accuracy)

Training pred 0.9774436090225563
Testing pred 0.9111111111111111


In [17]:
# Back to the 12-column subset without 'magnesium' (column index 4), this
# time with n_neighbors=4 instead of the default.
kept_columns = list(range(4)) + list(range(5, 13))
X_new = X[:, kept_columns]

X_train, X_test, y_train, y_test = train_test_split(X_new, y, random_state=10)

# Fit the scaler on the training split only, then apply it to both splits.
sc = StandardScaler().fit(X_train)
X_train_new = sc.transform(X_train)
X_test_new = sc.transform(X_test)

model = KNeighborsClassifier(n_neighbors=4).fit(X_train_new, y_train)
train_accuracy = model.score(X_train_new, y_train)
test_accuracy = model.score(X_test_new, y_test)
print('Training pred', train_accuracy)
print('Testing pred', test_accuracy)

Training pred 0.9699248120300752
Testing pred 0.9333333333333333


In [25]:
# using cross validation score for accuracy
from sklearn.model_selection import cross_val_score

# The following cell reads the standardized X_new, so keep producing it here.
X_new = sc.fit_transform(X_new)

# Scale inside the pipeline: each CV fold then fits its own StandardScaler on
# that fold's training rows only. Standardizing the whole matrix up front lets
# the validation rows influence the mean/std used for scaling (data leakage)
# and makes the CV estimate optimistic. Because standardization is an affine
# per-column transform, re-standardizing the already-scaled X_new per fold
# gives the same result as per-fold scaling of the raw columns.
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=4))
cross_val_score(model, X_new, y, cv=5).mean()

0.9441269841269841

In [19]:
# 5-fold CV for k-NN with n_neighbors=5.
# The scaler lives inside the pipeline so every fold fits it on its own
# training rows only; this avoids leaking validation rows into the scaling
# statistics and gives the same answer whether X_new is raw or has already
# been standardized.
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
cross_val_score(model, X_new, y, cv=5).mean()

0.9663492063492063

In [20]:
# readyment
from sklearn.feature_selection import SelectKBest
skb=SelectKBest(score_func=f_classif,k=10)
X_new=skb.fit_transform(X,y)

In [21]:
# Check the shape of the selected-feature matrix (k=10 columns were requested).
X_new.shape

(178, 10)

In [22]:
# Per-feature scores computed by the fitted selector (score_func=f_classif),
# one value per column of X.
skb.scores_

array([135.07762424,  36.94342496,  13.3129012 ,  35.77163741,
        12.42958434,  93.73300962, 233.92587268,  27.57541715,
        30.27138317, 120.66401844, 101.31679539, 189.97232058,
       207.9203739 ])

In [30]:
# 5-fold CV for k-NN (n_neighbors=5) on the current X_new.
# k-NN is distance based, so the features must be standardized. Doing it in
# the pipeline makes this cell independent of whether an earlier cell happened
# to scale X_new (execution order), and fits the scaler on each fold's
# training rows only.
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
cross_val_score(model, X_new, y, cv=5).mean()

0.9552380952380952

In [34]:
# Using Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Standardize inside the pipeline: the result no longer depends on whether an
# earlier cell already scaled X_new, and during cross-validation the scaler is
# fitted on each fold's training rows only.
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_new, y)

# NOTE: 'Accuracy' is measured on the same rows the model was fitted on
# (training accuracy); the cross-validated score is the out-of-sample estimate.
print('Accuracy :', model.score(X_new, y))
print('Cv score :', cross_val_score(model, X_new, y, cv=5).mean())

Accuracy : 0.9887640449438202
Cv score : 0.9552380952380952
