# Prediction Using KNN

<p style="font-family: Arial; font-size:1.75em;color:purple; font-style:bold"><br>

Importing the Necessary Libraries<br></p>

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

<p style="font-family: Arial; font-size:1.75em;color:purple; font-style:bold"><br>

Creating a Pandas DataFrame from a CSV file<br></p>

In [2]:
def CreateDataFrame(types, level):
    data = pd.read_csv('../Extraction/'+types+"/"+level+'.csv')
    data["Class"] = level
    return data

In [3]:
def LoadData(types):
    frames = [CreateDataFrame(types,"Clean"), 
          CreateDataFrame(types,"Dirty (0-20)"), 
          CreateDataFrame(types,"Dirty (20-40)"), 
          CreateDataFrame(types,"Dirty (40-60)"),
          CreateDataFrame(types,"Dirty (60-80)"),
          CreateDataFrame(types,"Dirty (80-100)")]
    df = pd.concat(frames)
    y=df[['Class']].copy()
    X = df.copy()
    del X['Class']
    return X,y

# Classification using KNN

## Optimize Classfiers

In [4]:
def optimize(X_train, y_train):
    param_grid = [{'clf__weights': ["uniform", "distance"], 'clf__n_neighbors': [5,10,15]}]
    clfpipeline = Pipeline([
        ('scale', StandardScaler()),
        ('clf', KNeighborsClassifier())
    ])
    grid_search = GridSearchCV(clfpipeline, param_grid, cv=5, verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())
    return grid_search

## Cycle Work

In [5]:
def cycleWork(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    gridsearch = optimize(X_train, y_train)
    print(gridsearch.best_params_)
    estimator = gridsearch.best_estimator_
    
    y_train_pred = estimator.predict(X_train)
    trnscore = accuracy_score(y_train, y_train_pred)
    
    y_tst_pred = estimator.predict(X_test)
    tstscore = accuracy_score(y_test, y_tst_pred)
    cm = confusion_matrix(y_test, y_tst_pred)
    true_pos = np.diag(cm) 
    precision = np.sum(true_pos / np.sum(cm, axis=0))
    recall = np.sum(true_pos / np.sum(cm, axis=1))
    f1 = 2 * ((precision* recall)/(precision + recall))
    #f1None = f1_score(y_test, y_tst_pred, average=None)
    #f1micro = f1_score(y_test, y_tst_pred, average='micro')
    return [trnscore, tstscore, f1]

In [6]:
def AllWork(types,cycles = 5):
    data = []
    X,y = LoadData(types)
    for i in range(0,cycles):
        print(i)
        acc = cycleWork(X,y)
        print("Train Accuracy ",acc[0],", Test Accuracy",acc[1], "F1 ",acc[2])
        data.append([acc[0],acc[1],acc[2]])
        numpyarray = np.array(data)
        df = pd.DataFrame(numpyarray, columns = ["Train Acc", "Test Acc", "F1 Score"])
        df.to_csv("KNN-"+types+'.csv',index = False)

In [7]:
types = "Solar Data"
AllWork(types,5)
types = "NoBG"
AllWork(types,5)

0
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    8.4s finished


{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.980251346499 F1  5.90289132136
1
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.5s finished


{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.97486535009 F1  5.85556544839
2
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   11.1s finished


{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.980251346499 F1  5.89648721774
3
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   13.2s finished


{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.983842010772 F1  5.91036426448
4
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   11.2s finished


{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.983842010772 F1  5.92037710077
0
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    9.5s finished


{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.960502692998 F1  5.75696739019
1
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   12.0s finished


{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.965888689408 F1  5.80654779334
2
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   10.8s finished


{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.956912028725 F1  5.75996390835
3
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   10.1s finished


{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.94973070018 F1  5.69168012157
4
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   14.9s finished


{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.962298025135 F1  5.7888455353
