In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV

In [5]:
# Load the whitespace-delimited E. coli data file. It has no header row, so the
# columns get integer labels. The separator is written as a raw string so that
# `\s` reaches the regex engine verbatim instead of being parsed as an invalid
# Python string escape (which triggers a SyntaxWarning on recent Python versions).
data = pd.read_csv('ecoli.data', header=None, sep=r'\s+')
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,AAT_ECOLI,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,ACEA_ECOLI,0.07,0.4,0.48,0.5,0.54,0.35,0.44,cp
2,ACEK_ECOLI,0.56,0.4,0.48,0.5,0.49,0.37,0.46,cp
3,ACKA_ECOLI,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,ADI_ECOLI,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp


In [6]:
# Column 0 is the sequence name and the last column is the class label, so the
# features are everything in between. `y` is kept as a one-column DataFrame
# (note the `-1:` slice) and is flattened later.
X, y = data.iloc[:, 1:-1], data.iloc[:, -1:]

In [10]:
# Flatten the one-column label frame into a 1-D array for inspection.
np.asarray(y).ravel()

array(['cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp',
       'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp',
       'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp',
       'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp',
       'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp',
       'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp',
       'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp',
       'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp',
       'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp',
       'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp',
       'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp',
       'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp',
       'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp', 'cp',
       'im', 'im', 'im', 'im', 'im', 'im', 'im', 'i

In [11]:
# Encode the string class labels (last column of `data`) as integers.
# The labels are read from `data` rather than from the current value of `y`,
# which keeps this cell idempotent: re-running it always fits the encoder on
# the original label strings, so `encoder.classes_` keeps the real class names
# instead of being refit on already-encoded integers.
encoder = LabelEncoder()
y = encoder.fit_transform(data.iloc[:, -1])

In [12]:
# Display the integer-encoded labels produced by the LabelEncoder.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       3, 3, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

In [13]:
# Hold out a test split using the default proportions; the fixed seed keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
)

In [14]:
# (rows, columns) of the training feature matrix.
X_train.shape

(252, 7)

In [15]:
# Baseline: 3-nearest-neighbours on the unscaled features.
# `fit` returns the estimator itself, so construction and fitting are chained.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
# Score on the held-out split.
knn.score(X_test, y_test)

0.8333333333333334

In [17]:
# Two-step pipeline: standardise the features, then classify with a
# default-configured KNN.
scaling_step = ('scaler', StandardScaler())
classifier_step = ('clf', KNeighborsClassifier())
pip = Pipeline(steps=[scaling_step, classifier_step])

In [18]:
# Fit the scaler + KNN pipeline on the training split.
pip.fit(X_train, y_train)

In [19]:
# Predicted (integer-encoded) class for every sample in the test split.
y_pred = pip.predict(X_test)
y_pred

array([0, 1, 6, 7, 1, 0, 7, 0, 7, 0, 0, 4, 4, 1, 0, 6, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 7, 1, 0, 1, 0, 0, 0, 6, 0, 1, 0, 4, 7, 0, 1, 0, 1, 7, 0,
       7, 1, 1, 7, 1, 0, 1, 7, 0, 0, 1, 1, 4, 0, 4, 1, 0, 0, 1, 1, 0, 0,
       4, 7, 7, 0, 7, 1, 4, 0, 1, 0, 0, 0, 1, 7, 0, 1, 1, 0])

In [20]:
# Score the fitted pipeline on the held-out test split.
pip.score(X_test, y_test)

0.8690476190476191

In [2]:
# MasoudKaviani.ir
# Dataset: https://archive.ics.uci.edu/ml/datasets/Ecoli