In [229]:
import pandas as pd
import numpy as np
import math
import operator

## Load the dataset and describe it.

In [230]:
col=['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm','Species']
irisRaw=pd.read_csv("/Users/vhb0007/Downloads/iris.csv",names=col)
print("First five rows")
print(irisRaw.head())
print("*********")
print("columns",irisRaw.columns)
print("*********")
print("shape:",irisRaw.shape)
print("*********")
print("Size:",irisRaw.size)
print("*********")
print("no of samples available for each type")
print(irisRaw['Species'].value_counts())
print("*********")
print(irisRaw.describe())
print("*********")
print("total sample",irisRaw.shape[0])

First five rows
    SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
1             5.1           3.5            1.4           0.2  Iris-setosa
2             4.9           3.0            1.4           0.2  Iris-setosa
3             4.7           3.2            1.3           0.2  Iris-setosa
4             4.6           3.1            1.5           0.2  Iris-setosa
*********
columns Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')
*********
shape: (151, 5)
*********
Size: 755
*********
no of samples available for each type
Iris-virginica     50
Iris-versicolor    50
Iris-setosa        50
Species             1
Name: Species, dtype: int64
*********
       SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm         Species
count            151          151           151          151             151
unique            36           24 

### 3. Normalize the dataset. Visualize the dataset before and after normalization.

In [231]:
#Pre processing the data

irisClone = irisRaw.copy()
from sklearn.preprocessing import OrdinalEncoder
ord_enc = OrdinalEncoder()
irisClone["Species_type"] = ord_enc.fit_transform(irisClone[["Species"]])
irisClone = irisClone.drop('Species', 1)


In [232]:
# Visualizing pre processed data
print("First five rows")
print(irisClone.head())
print("*********")
print("columns",irisClone.columns)
print("*********")
print("shape:",irisClone.shape)
print("*********")
print("Size:",irisClone.size)
print("*********")
print("no of samples available for each type")
print(irisClone['Species_type'].value_counts())
print("*********")
print(irisClone.describe())
print("*********")
print("total sample",irisClone.shape[0])


First five rows
    SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  Species_type
Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm           3.0
1             5.1           3.5            1.4           0.2           0.0
2             4.9           3.0            1.4           0.2           0.0
3             4.7           3.2            1.3           0.2           0.0
4             4.6           3.1            1.5           0.2           0.0
*********
columns Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species_type'],
      dtype='object')
*********
shape: (151, 5)
*********
Size: 755
*********
no of samples available for each type
2.0    50
1.0    50
0.0    50
3.0     1
Name: Species_type, dtype: int64
*********
       Species_type
count    151.000000
mean       1.013245
std        0.832560
min        0.000000
25%        0.000000
50%        1.000000
75%        2.000000
max        3.000000
*********
total sample 151


In [233]:
#Extracting inputs and outputs seprately
inputs = irisClone.iloc[1:irisClone.shape[0]+1,0:4]
inputs= inputs.to_numpy()
inputs= inputs.astype('float64')

outputs = irisClone["Species_type"].values.tolist()
outputs.pop(0)


3.0

## 4. Calculate Similarity based on distance function of your choice

### Distance measure used- Euclidean distance

In [179]:
import math
def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1)):
        distance += math.pow(row1[i] - row2[i],2)
    return math.sqrt(distance)

### Function to get the N nearest neighbors

In [203]:
def get_neighbors(x_train,y_train, test_row, num_neighbors):
    distances = list()
    for index in range(len(x_train)):
        dist = euclidean_distance(test_row, x_train[index])
        distances.append((x_train[index], y_train[index],dist))
    distances.sort(key=lambda tup: tup[2])
    neighbors = list()
    for i in range(num_neighbors):
        neighbors.append(distances[i][1])
    return neighbors

### Prediction using N nearest neighbors

In [218]:
def predict_label(x_train,y_train , test_row, num_neighbors):
    neighbors = get_neighbors(x_train,y_train, test_row, num_neighbors)
    prediction = max(set(neighbors), key=neighbors.count)
    return prediction

### Splitting the data for training and test, We use can use stratified splitting because data is equall distributed

In [234]:
# Data split 80:20
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
             inputs, outputs, test_size = 0.2, stratify=outputs)


### Predicting for test data

In [235]:
predictions=list()
for index in range(len(X_test)):
    predictedLabel = predict_label(X_train, Y_train, X_test[index], 1)
    predictions.append(predictedLabel)
print(predictions)


[2.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0, 0.0, 2.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 2.0, 1.0, 1.0, 0.0, 2.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0, 1.0, 2.0]


### Training and testing using the sklearn libray

In [236]:
from sklearn.neighbors import KNeighborsClassifier
 
neigh = KNeighborsClassifier(n_neighbors=7)
 
neigh.fit(X_train, Y_train)
 
# Predict on dataset which model has not seen before
print(neigh.predict(X_test))

[2. 2. 1. 2. 1. 1. 2. 0. 2. 1. 0. 1. 1. 0. 1. 0. 0. 2. 1. 1. 0. 2. 0. 0.
 1. 2. 0. 0. 1. 2.]


In [222]:
print("starting library")

starting library


## Manoj

In [66]:
#Train the model
from sklearn.neighbors import KNeighborsClassifier

neigh=KNeighborsClassifier(n_neighbors=3)
neigh.fit(x,y)
print(neigh.predict([[5.1,3.8,1.6,0.2],[7.0,3.2,4.7,1.4],[5.3,3.7,1.5,0.2]]))

['Iris-setosa' 'Iris-versicolor' 'Iris-setosa']


  """


In [67]:
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
             x, y, test_size = 0.2, random_state=42)
 
neigh = KNeighborsClassifier(n_neighbors=7)
 
neigh.fit(X_train, Y_train)
 
# Predict on dataset which model has not seen before
print(neigh.predict(X_test))
print(neigh.predict([[5.1,3.8,1.6,0.2],[7.0,3.2,4.7,1.4],[5.3,3.7,1.5,0.2]]))

['Iris-versicolor' 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor'
 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica'
 'Iris-virginica' 'Iris-versicolor' 'Iris-virginica' 'Iris-setosa'
 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'
 'Iris-virginica' 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica'
 'Iris-setosa' 'Iris-virginica' 'Iris-setosa' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-setosa' 'Iris-setosa']
['Iris-setosa' 'Iris-versicolor' 'Iris-setosa']


  # Remove the CWD from sys.path while we load stuff.


In [21]:
# from sklearn.datasets import load_iris
 
# # Loading data
# irisData = load_iris()
# print(irisData.data)
# print(irisData.target)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

In [22]:
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import train_test_split
# X = irisData.data
# Y = irisData.target
# # print(X)
# print("sss")
# # print(Y)
# # Split into training and test set
# X_train, X_test, y_train, y_test = train_test_split(
#              X, Y, test_size = 0.2, random_state=42)
 
# knn = KNeighborsClassifier(n_neighbors=7)
 
# knn.fit(X_train, y_train)
 
# # Predict on dataset which model has not seen before
# print(knn.predict(X_test))
# print(knn.predict([[5.1,3.8,1.6,0.2],[7.0,3.2,4.7,1.4],[5.3,3.7,1.5,0.2]]))

sss
[1 0 2 1 1 0 1 2 2 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
[0 1 0]


In [23]:

testSet = [[5.1,3.8,1.6,0.2]]
test = pd.DataFrame(testSet)
print(test)
print("predicted:",neigh.predict(testSet))
print("neighbors",neigh.kneighbors(test))

# testSet = [[2, 2, 2, 4]]
# # test = dfs["Sheet2"]
# # print(test)
# print("predicted:",neigh.predict(testSet))
# print("neighbors",neigh.kneighbors(testSet))

     0    1    2    3
0  5.1  3.8  1.6  0.2


NotFittedError: This KNeighborsClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.