# **Welcome to the tutorial 6**

## Data Preparation
* Numpy array
* Pandas Dataframe
* Scipy for Statistics
*Sklearn for Machine Learning

In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn

## Get the Data

In [None]:
#download the data
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

--2021-10-28 11:17:14--  https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4551 (4.4K) [application/x-httpd-php]
Saving to: ‘iris.data’


2021-10-28 11:17:15 (125 MB/s) - ‘iris.data’ saved [4551/4551]



## Preprocessing

In [4]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
features = ['sepal length', 'sepal width', 'petal length', 'petal width']
df = pd.read_csv('iris.data', names=['sepal length','sepal width','petal length','petal width','target'])
# Separating out the features
x_raw = df.loc[:, features].values
# Separating out the target
y = df.loc[:,['target']].values
# Standardizing the features
x = MinMaxScaler().fit_transform(x_raw)


le = LabelEncoder()
y = le.fit_transform(y)
label_list = list(le.classes_)
print(label_list)

['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']


  y = column_or_1d(y, warn=True)


Split training and testing

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=0)

Warm up: an example for KNeighborsClassifier

In [6]:
from sklearn.neighbors import KNeighborsClassifier


neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train,y_train)

print('accuracy for testing',neigh.score(X_test,y_test))



accuracy for testing 0.96


 take into account its distance from its neighbors

In [7]:
from sklearn.neighbors import KNeighborsClassifier


neigh = KNeighborsClassifier(n_neighbors=5,weights='distance')
neigh.fit(X_train,y_train)

print('accuracy for testing',neigh.score(X_test,y_test))

accuracy for testing 0.96


How to tune k by cross-validation 

In [None]:
from sklearn.model_selection import cross_val_score

neigh = KNeighborsClassifier()
scores = cross_val_score(neigh, X_train, y_train, cv=5)
print(scores)



from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
best_k = 1
best_score = 0
for k in range(2,50):
   neigh = KNeighborsClassifier(n_neighbors=k)
   score = np.mean(cross_val_score(neigh, X_train, y_train, cv=10))
   if score > best_score:
      best_score = score
      best_k = k

print('The best k is', best_k)
print('The best score is', best_score)

[0.9  1.   1.   0.9  0.95]
The best k is 12
The best score is 0.9700000000000001


Robustness under different n_neighbors

In [None]:
name_list = ['small','medium','large']
k_size = [2,6,60]


for ind,name in enumerate(name_list):

 neigh = KNeighborsClassifier(n_neighbors=k_size[ind])
 neigh.fit(X_train,y_train)

 print('accuracy for testing '+'with ' +name +' n_neighbors in normal dataset', 
       neigh.score(X_test,y_test))




noise_exp_nums = 23

noise_x = np.random.uniform(low=0.0, high=1.0, size=[noise_exp_nums,X_train.shape[1]])
noise_x = np.concatenate([X_train,noise_x],axis = 0)
noise_y = np.random.randint(low=0, high=2, size=[noise_exp_nums])
noise_y = np.concatenate([y_train,noise_y],axis = 0)


for ind,name in enumerate(name_list):

 neigh = KNeighborsClassifier(n_neighbors=k_size[ind])
 neigh.fit(noise_x,noise_y)

 print('accuracy for testing '+'with ' +name +' numbers of n_neighbors in noise dataset', 
       neigh.score(X_test,y_test))



accuracy for testing with small n_neighbors in normal dataset 0.94
accuracy for testing with medium n_neighbors in normal dataset 0.96
accuracy for testing with large n_neighbors in normal dataset 0.9
accuracy for testing with small numbers of n_neighbors in noise dataset 0.92
accuracy for testing with medium numbers of n_neighbors in noise dataset 0.96
accuracy for testing with large numbers of n_neighbors in noise dataset 0.9


KNN Implemented by ourselves

In [None]:
from sklearn.metrics import accuracy_score

class knn():
 def __init__(self,k):
     self.k =k
     self.neighbor_distances_and_indices = []

 def distance_f(self, x1,x2):
     return np.sqrt(np.sum((x1-x2)**2,axis=-1))
     

 def knn_build(self, x, y):
    self.x = x
    self.y = y  

 def predict(self, x_test):
    output = []
    for ind,query in enumerate(x_test):
     #x_ = np.delete(x, ind, axis=0)
     #1. calulate the distance
     distance = self.distance_f(self.x, query)
     #2. find k ids that have smallest distances to the query sample
     id = np.argpartition(distance, kth=self.k)[:self.k]
     
     y_chosen = self.y[id]
     #print(distance)
     #3. count the most frequent ids as the predicted label.
     output.append(np.bincount(y_chosen).argmax())
    return output

     

knn_our = knn(7)
knn_our.knn_build(X_train, y_train)
our_predict = knn_our.predict(X_test)
print('Acc for ours in testing dataset:',accuracy_score(y_test,our_predict))





Acc for ours in testing dataset: 0.96
