In [1]:
#import all libraries

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import preprocessing 
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
pwd()

'/Users/aodubela'

In [3]:
cd /Users/aodubela/Desktop/ExerciseFiles/Data

/Users/aodubela/Desktop/ExerciseFiles/Data


In [4]:
# Load data
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00528/dataset.csv", header = 1, sep = ";")
df = df.drop(columns=['Green frogs','Brown frogs', 'Common toad', 'Common newt', 'Great crested newt','Tree frog'])
df = df.drop(columns=['ID', 'TR', 'VR', 'SUR1', 'SUR2', 'SUR3', 'UR', 'FR', 'RR', 'BR','MR', 'CR'])
df['Fire-bellied toad'].value_counts()

0    131
1     58
Name: Fire-bellied toad, dtype: int64

In [5]:
# Prepare Data
X = df.iloc[:, 1:-1].values
y = df.iloc[:, 4].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.40)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
classifier = KNeighborsClassifier(n_neighbors = 5)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [7]:
# Sets to experiment with
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size = 0.40)
scaler = StandardScaler()
scaler.fit(X_train2)
X_train2 = scaler.transform(X_train2)
X_test2 = scaler.transform(X_test2)


In [8]:
# Basic Knn
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train2, y_train2)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [11]:
# Gridsearch
params = {"n_neighbors":[2,3,4,5,6,7,8,9]}
model = GridSearchCV(classifier, params, cv=3)

In [12]:
model.fit(X_train, y_train)
model.best_params_

{'n_neighbors': 3}

In [13]:
model.best_score_

0.6455666192508298

In [14]:
# Knn with 3 neighbors
gridsearch_knn = KNeighborsClassifier(n_neighbors = 3)
gridsearch_knn.fit(X_train2, y_train2)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [15]:
gridsearch_y_pred = gridsearch_knn.predict(X_test2)
print(gridsearch_y_pred)

[0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0]


In [16]:
print(confusion_matrix(y_test2, gridsearch_y_pred))
print(classification_report(y_test2, gridsearch_y_pred))

[[48 10]
 [15  3]]
              precision    recall  f1-score   support

           0       0.76      0.83      0.79        58
           1       0.23      0.17      0.19        18

    accuracy                           0.67        76
   macro avg       0.50      0.50      0.49        76
weighted avg       0.64      0.67      0.65        76



In [17]:
# Random Search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

params = {"n_neighbors": range(1,5), "weights": ["uniform", "distance"]}
rsearch = RandomizedSearchCV(estimator=knn, param_distributions=params,cv=4,n_iter=8, random_state=5)
rsearch.fit(X_train, y_train)
rsearch_y_pred = rsearch.predict(X_test)

print(rsearch.best_score_)

0.6197660098522166


In [18]:
print(confusion_matrix(y_test2, rsearch_y_pred))
print(classification_report(y_test2, rsearch_y_pred))

[[47 11]
 [16  2]]
              precision    recall  f1-score   support

           0       0.75      0.81      0.78        58
           1       0.15      0.11      0.13        18

    accuracy                           0.64        76
   macro avg       0.45      0.46      0.45        76
weighted avg       0.61      0.64      0.62        76

