# Classification using KNN 

## Method 1 
### Using train_test_split method. 

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

In [30]:
# Parse the CSV that sits next to the notebook.
data = pd.read_csv("column_2C_weka.csv")

# `df` is the working frame used by every later cell.
df = pd.DataFrame(data=data)

# Show the first five rows as a quick sanity check of the columns.
df.head(5)

Unnamed: 0,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis,class
0,63.027818,22.552586,39.609117,40.475232,98.672917,-0.2544,Abnormal
1,39.056951,10.060991,25.015378,28.99596,114.405425,4.564259,Abnormal
2,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317,Abnormal
3,69.297008,24.652878,44.311238,44.64413,101.868495,11.211523,Abnormal
4,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501,Abnormal


In [47]:
# Target vector: the 'class' column.
Y = df['class']
# Feature matrix: the first six columns, selected by position.
X = df.iloc[:, :6]

In [48]:
# Hold out 20% of the rows for testing. A fixed seed makes the split -- and
# therefore the selected k and every metric computed from it -- reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=7)

In [49]:
# Candidate neighbourhood sizes, and the test error measured for each of them.
k_values = list(range(1, 200, 1))
test_error = []

# Fit one Euclidean KNN model per candidate k and record its test error.
for k in k_values:
    classifier = KNeighborsClassifier(n_neighbors=k, p=2, metric='euclidean')
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    test_error.append(1 - accuracy_score(y_test, y_pred))  # error = 1 - accuracy

# Determine k* as the candidate with the lowest test error.
# NOTE(review): k* is tuned on the same test split that is later used to report
# the final metrics, so those metrics are optimistic -- consider a validation
# split or cross-validation.
min_test = min(test_error)
print("Minimum Error is:", min_test)
index1 = test_error.index(min_test)  # first position that attains the minimum
# Look k* up in k_values rather than computing `1 + index1`, which is only
# correct while the grid starts at 1 with a step of 1.
k_star = k_values[index1]
print("Optimal K =", k_star)

Minimum Error is: 0.048387096774193505
Optimal K = 39


In [50]:
# Refit KNN with the k* found by the search cell instead of a hand-copied
# constant, so this cell stays in sync whenever the split or the k grid changes.
classifier = KNeighborsClassifier(n_neighbors=k_star, p=2, metric='euclidean')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Confusion matrix table (true labels as rows, predictions as columns, with totals).
print("CONFUSION MATRIX TABLE: ")
print(" ")
print(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))
print(" ")

# True positive / true negative rates, treating 'Abnormal' as the positive class.
# Passing `labels` explicitly pins index 0 to 'Abnormal' and index 1 to 'Normal'
# and keeps the matrix 2x2 even if one class is absent from the test split.
con = confusion_matrix(y_test, y_pred, labels=['Abnormal', 'Normal'])
tpr = float(con[0][0] / (con[0][0] + con[0][1]))  # Abnormal correctly flagged / all Abnormal
print("TRUE POSITIVE RATE:", tpr)
tnr = float(con[1][1] / (con[1][1] + con[1][0]))  # Normal correctly cleared / all Normal
print("TRUE NEGATIVE RATE:", tnr)
print("")

# Classification report: per-class precision, recall, F1 score and support.
print("CLASSIFICATION REPORT: ")
print(classification_report(y_test, y_pred))


CONFUSION MATRIX TABLE: 
 
Predicted  Abnormal  Normal  All
True                            
Abnormal         36       2   38
Normal            1      23   24
All              37      25   62
 
TRUE POSITIVE RATE: 0.9473684210526315
TRUE NEGATIVE RATE: 0.9583333333333334

CLASSIFICATION REPORT: 
              precision    recall  f1-score   support

    Abnormal       0.97      0.95      0.96        38
      Normal       0.92      0.96      0.94        24

   micro avg       0.95      0.95      0.95        62
   macro avg       0.95      0.95      0.95        62
weighted avg       0.95      0.95      0.95        62



## Method 2 
### Splitting by class: the first 70 rows of Class 0 (Normal) and the first 140 rows of Class 1 (Abnormal) form the training set, and the rest of the data forms the test set

In [51]:
# Select rows by their class label instead of hard-coded row offsets, so the
# split no longer depends on the CSV being sorted by class with fixed counts.
# Only the first seven columns (six features + 'class') are kept.
abnormal_rows = df.loc[df['class'] == 'Abnormal'].iloc[:, 0:7]
normal_rows = df.loc[df['class'] == 'Normal'].iloc[:, 0:7]

# Training set: first 140 Abnormal rows and first 70 Normal rows.
train_ab = abnormal_rows.iloc[:140]
train_n = normal_rows.iloc[:70]
train = pd.concat([train_ab, train_n])  # train - DataFrame with training data

# Test set: every remaining row of each class.
test_ab = abnormal_rows.iloc[140:]
test_n = normal_rows.iloc[70:]
test = pd.concat([test_ab, test_n])  # test - DataFrame with test data

In [43]:
# Split each partition into features (x_*: first six columns) and labels (y_*: 'class').
x_train = train.iloc[:, 0:6]
y_train = train.loc[:, 'class']
x_test = test.iloc[:, 0:6]
y_test = test.loc[:, 'class']

# Shuffle features and labels in a single call so the same permutation is
# applied to both and every row keeps its own label by construction (rather
# than relying on four separate calls happening to share a seed).
x_train, y_train = shuffle(x_train, y_train, random_state=7)
x_test, y_test = shuffle(x_test, y_test, random_state=7)

In [44]:
 # Initializing lists
# Candidate neighbourhood sizes, and the test error measured for each of them.
k_values = list(range(1, 200, 1))
test_error = []

# Fit one Euclidean KNN model per candidate k and record its test error.
for k in k_values:
    classifier = KNeighborsClassifier(n_neighbors=k, p=2, metric='euclidean')
    classifier.fit(x_train, y_train)
    y_pred_test = classifier.predict(x_test)
    test_error.append(1 - accuracy_score(y_test, y_pred_test))  # error = 1 - accuracy

# Determine k* as the candidate with the lowest test error.
# NOTE(review): k* is tuned on the same test set that is later used to report
# the final metrics, so those metrics are optimistic -- consider a validation
# split or cross-validation.
min_test = min(test_error)
print("Minimum Error is:", min_test)
index1 = test_error.index(min_test)  # first position that attains the minimum
# Look k* up in k_values rather than computing `1 + index1`, which is only
# correct while the grid starts at 1 with a step of 1.
k_star = k_values[index1]
print("Optimal K =", k_star)

Minimum Error is: 0.07999999999999996
Optimal K = 3


In [46]:
# Refit KNN with the k* found by the search cell instead of a hand-copied
# constant, so this cell stays in sync whenever the split or the k grid changes.
classifier = KNeighborsClassifier(n_neighbors=k_star, p=2, metric='euclidean')
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# Confusion matrix table (true labels as rows, predictions as columns, with totals).
print("CONFUSION MATRIX TABLE: ")
print(" ")
print(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))
print(" ")

# True positive / true negative rates, treating 'Abnormal' as the positive class.
# Passing `labels` explicitly pins index 0 to 'Abnormal' and index 1 to 'Normal'
# and keeps the matrix 2x2 even if one class is absent from the test set.
con = confusion_matrix(y_test, y_pred, labels=['Abnormal', 'Normal'])
tpr = float(con[0][0] / (con[0][0] + con[0][1]))  # Abnormal correctly flagged / all Abnormal
print("TRUE POSITIVE RATE:", tpr)
tnr = float(con[1][1] / (con[1][1] + con[1][0]))  # Normal correctly cleared / all Normal
print("TRUE NEGATIVE RATE:", tnr)
print("")

# Classification report: per-class precision, recall, F1 score and support.
print("CLASSIFICATION REPORT: ")
print(classification_report(y_test, y_pred))


CONFUSION MATRIX TABLE: 
 
Predicted  Abnormal  Normal  All
True                            
Abnormal         69       1   70
Normal            7      23   30
All              76      24  100
 
TRUE POSITIVE RATE: 0.9857142857142858
TRUE NEGATIVE RATE: 0.7666666666666667

CLASSIFICATION REPORT: 
              precision    recall  f1-score   support

    Abnormal       0.91      0.99      0.95        70
      Normal       0.96      0.77      0.85        30

   micro avg       0.92      0.92      0.92       100
   macro avg       0.93      0.88      0.90       100
weighted avg       0.92      0.92      0.92       100

