# Prediction Using KNN

<p style="font-family: Arial; font-size:1.75em;color:purple; font-style:bold"><br>

Importing the Necessary Libraries<br></p>

In [150]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import time

<p style="font-family: Arial; font-size:1.75em;color:purple; font-style:bold"><br>

Creating a Pandas DataFrame from a CSV file<br></p>

In [2]:
def CreateDataFrame(types, level):
    data = pd.read_csv('../Extraction/'+types+"/"+level+'.csv')
    data["Class"] = level
    return data

In [3]:
def LoadData(types):
    frames = [CreateDataFrame(types,"Clean"), 
          CreateDataFrame(types,"Dirty (0-20)"), 
          CreateDataFrame(types,"Dirty (20-40)"), 
          CreateDataFrame(types,"Dirty (40-60)"),
          CreateDataFrame(types,"Dirty (60-80)"),
          CreateDataFrame(types,"Dirty (80-100)")]
    df = pd.concat(frames)
    y=df[['Class']].copy()
    X = df.copy()
    del X['Class']
    return X,y

In [4]:
def LoadMinimizedData(types):
    df = pd.read_csv('../Feature Selection/FeatureSelection-'+types+".csv")
    y=df[['Class']].copy()
    X = df.copy()
    del X['Class']
    return X,y

# Classification using KNN

## Optimize Classfiers

In [174]:
def optimize(X_train, y_train):
    param_grid = [{'clf__weights': ["uniform", "distance"], 'clf__n_neighbors': [5,10]}]
    clfpipeline = Pipeline([
        ('scale', StandardScaler()),
        ('clf', KNeighborsClassifier())
    ])
    grid_search = GridSearchCV(clfpipeline, param_grid, cv=5, verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())
    end = time.time()
    return grid_search

## Cycle Work

In [206]:
def cycleWork(X_train, X_test, y_train, y_test):
    gridsearch = optimize(X_train, y_train)
    traintime = gridsearch.cv_results_['mean_fit_time'].mean()*1000
    print("Mean Fit Time", traintime)
    print("Mean Score Time", gridsearch.cv_results_['mean_score_time'].mean()*1000)

    print(gridsearch.best_params_)
    estimator = gridsearch.best_estimator_
    
    y_train_pred = estimator.predict(X_train)
    trnscore = accuracy_score(y_train, y_train_pred)
    start = time.time()
    y_tst_pred = estimator.predict(X_test)
    end = time.time()
    diff = end-start
    clfperms = X_train.shape[0]/(diff*1000)
    tstscore = accuracy_score(y_test, y_tst_pred)
    cm = confusion_matrix(y_test, y_tst_pred)
    true_pos = np.diag(cm) 
    precision = np.sum(true_pos / np.sum(cm, axis=0))
    recall = np.sum(true_pos / np.sum(cm, axis=1))
    f1 = 2 * ((precision* recall)/(precision + recall))
    #f1None = f1_score(y_test, y_tst_pred, average=None)
    #f1micro = f1_score(y_test, y_tst_pred, average='micro')
    return [trnscore, tstscore, f1,traintime,clfperms]

In [207]:
def AllWork(types,cycles = 5):
    print(types)
    data = []
    for i in range(0,cycles):
        X,y = LoadData(types)
        #X,y = LoadMinimizedData(types)
        skf = StratifiedKFold(n_splits=5,shuffle = True)
        j = 0
        for train_index, test_index in skf.split(X, y.values.ravel()):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            acc = cycleWork(X_train, X_test, y_train, y_test)
            print("Train Accuracy ",acc[0],", Test Accuracy",acc[1], "F1 ",acc[2])
            data.append([i,j,acc[0],acc[1],acc[2],acc[3],acc[4]])
            j = j+1
    numpyarray = np.array(data)
    df = pd.DataFrame(numpyarray, columns = ["Cycle","Split","Train Acc", "Test Acc", "F1 Score","Train Time (ms)","clfs per MS"])
    df.to_csv("KNN-"+types+'.csv',index = False)
    df.head(10)

In [209]:
types = "NoBG"
AllWork(types,5)
#types = "Solar Data"
#AllWork(types,5)

NoBG
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.3s finished


Mean Fit Time 23.8610744476
Mean Score Time 87.1016979218
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.967857142857 F1  5.80248187556
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.7s finished


Mean Fit Time 17.1406388283
Mean Score Time 68.4598445892
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.960573476703 F1  5.7819457658
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    7.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    7.2s finished


Mean Fit Time 16.3424134254
Mean Score Time 61.4126205444
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.951612903226 F1  5.71799958062
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    7.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    7.6s finished


Mean Fit Time 18.7391638756
Mean Score Time 74.2572188377
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.962093862816 F1  5.77634825938
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    6.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    6.8s finished


Mean Fit Time 16.5904283524
Mean Score Time 59.3662142754
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.956678700361 F1  5.76476178268
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    7.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    7.0s finished


Mean Fit Time 16.0403132439
Mean Score Time 65.1632547379
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.9625 F1  5.78769184544
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.0s finished


Mean Fit Time 22.1869945526
Mean Score Time 83.9523553848
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.953405017921 F1  5.7158943566
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    9.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    9.5s finished


Mean Fit Time 16.9398546219
Mean Score Time 62.264084816
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.964157706093 F1  5.78298003397
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    7.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    7.9s finished


Mean Fit Time 19.4885969162
Mean Score Time 77.8563380241
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.965703971119 F1  5.79827672448
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.7s finished


Mean Fit Time 16.0913467407
Mean Score Time 62.5131368637
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.953068592058 F1  5.73715838296
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.8s finished


Mean Fit Time 22.1871376038
Mean Score Time 103.040850163
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.966071428571 F1  5.81340228561
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   13.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   13.3s finished


Mean Fit Time 21.9874382019
Mean Score Time 89.4509673119
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.971326164875 F1  5.83166530602
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    9.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    9.5s finished


Mean Fit Time 17.6905989647
Mean Score Time 71.5086340904
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.94982078853 F1  5.71238251762
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   10.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   10.5s finished


Mean Fit Time 18.3391332626
Mean Score Time 65.6622767448
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.958483754513 F1  5.76470625431
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.6s finished


Mean Fit Time 26.8337368965
Mean Score Time 93.8462376595
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.976534296029 F1  5.85763554706
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    9.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    9.9s finished


Mean Fit Time 19.8382616043
Mean Score Time 80.9535861015
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.971428571429 F1  5.82214994099
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   11.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   11.5s finished


Mean Fit Time 25.4852175713
Mean Score Time 100.692689419
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.967741935484 F1  5.81980948541
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.9s finished


Mean Fit Time 15.8903002739
Mean Score Time 62.9140257835
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.951612903226 F1  5.7493695268
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    7.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    7.6s finished


Mean Fit Time 20.1878666878
Mean Score Time 71.1596488953
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.956678700361 F1  5.75576328797
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   11.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   11.2s finished


Mean Fit Time 21.837246418
Mean Score Time 77.2556424141
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.960288808664 F1  5.78831168907
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    9.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    9.7s finished


Mean Fit Time 23.7869381905
Mean Score Time 95.6942081451
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.951785714286 F1  5.69409886123
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.4s finished


Mean Fit Time 20.1382875443
Mean Score Time 77.1558046341
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.967741935484 F1  5.81540551397
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   15.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   15.1s finished


Mean Fit Time 27.3337483406
Mean Score Time 108.838069439
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.974910394265 F1  5.87809390706
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.7s finished


Mean Fit Time 19.7884559631
Mean Score Time 72.1086859703
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.942238267148 F1  5.65947928126
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   10.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   10.5s finished


Mean Fit Time 19.1389203072
Mean Score Time 73.5089182854
{'clf__n_neighbors': 5, 'clf__weights': 'distance'}
Train Accuracy  1.0 , Test Accuracy 0.963898916968 F1  5.79210062997
