# KNN implementation with all possible data 

In [1]:
# imports
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import importlib.util
spec = importlib.util.spec_from_file_location("module.name", "../functions.py")
functions = importlib.util.module_from_spec(spec)
spec.loader.exec_module(functions)
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,accuracy_score
import pandas as pd

# param values
neighbors = range(1, 32, 2)
weight = ["uniform", "distance"]
pp = [1, 2]
algo = ["auto"]
metric = ["minkowski", "manhattan"]
# creating  grid instance
grid_params_nn = dict(n_neighbors=neighbors, weights=weight, p=pp, algorithm=algo, metric=metric)

## KNN with just using default values

In [2]:
# get data
X, y = functions.get_data(type="_new")
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10, stratify=y)

In [3]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train.values.ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [4]:
# predict test values
y_pred_basic = knn.predict(X_test)

In [5]:
# calculate F1 score
f1_basic = f1_score(y_test, y_pred_basic)
"F1 score is = {0}".format(f1_basic)

'F1 score is = 0.08846584546472565'

In [6]:
# creating KNN instance
knn = KNeighborsClassifier()
knnCV = GridSearchCV(knn, grid_params_nn, cv=10, scoring='accuracy', verbose=10, n_jobs=-1)

knnCV.fit(X_train, y_train.values.ravel())

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   48.4s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'algorithm': ['auto'],
                         'metric': ['minkowski', 'manhattan'],
                         'n_neighbors': range(1, 32, 2), 'p': [1, 2],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=10)

In [7]:
knn = knnCV.best_estimator_
knn.fit(X_train, y_train.values.ravel())
# predict test values
y_pred_basic = knn.predict(X_test)

In [8]:
# calculate F1 score
f1_basic = f1_score(y_test, y_pred_basic)
print("F1 score is = {0}".format(f1_basic))
print("Accuracy is = {0}".format(accuracy_score(y_test, y_pred_basic)))

F1 score is = 0.0
Accuracy is = 0.8400835557618661


## KNN with removed outliers

In [9]:
# get data
X, y = functions.get_data(type="__removeoutliers_new")
# split dataset into train and test data
X_train, X_test, y_train, y_test_outliers = train_test_split(X, y, test_size=0.25, random_state=10, stratify=y)

In [10]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train.values.ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [11]:
# predict test values
y_pred_outliers = knn.predict(X_test)

In [12]:
# calculate F1 score
# calculate F1 score
f1_outliers = f1_score(y_test_outliers, y_pred_outliers)
"F1 score is = {0}".format(f1_outliers)

'F1 score is = 0.09744779582366589'

In [13]:
# creating KNN instance
knn = KNeighborsClassifier()
knnCV = GridSearchCV(knn, grid_params_nn, cv=10, scoring='accuracy', verbose=10, n_jobs=-1)

knnCV.fit(X_train, y_train.values.ravel())

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   31.2s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   40.0s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   48.6s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   57.5s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'algorithm': ['auto'],
                         'metric': ['minkowski', 'manhattan'],
                         'n_neighbors': range(1, 32, 2), 'p': [1, 2],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=10)

In [14]:
knn = knnCV.best_estimator_
knn.fit(X_train, y_train.values.ravel())
# predict test values
y_pred_outliers = knn.predict(X_test)

# calculate F1 score
f1_outliers = f1_score(y_test_outliers, y_pred_outliers)
print("F1 score is = {0}".format(f1_outliers))
print("Accuracy is = {0}".format(accuracy_score(y_test_outliers, y_pred_outliers)))

F1 score is = 0.00574712643678161
Accuracy is = 0.8377871542428504


## KNN with PCA features

In [15]:
# get data
X, y = functions.get_data(type="__PCA_new")
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10, stratify=y)

In [16]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train.values.ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [17]:
# predict test values
y_pred_PCA = knn.predict(X_test)

In [18]:
# calculate F1 score
f1_PCA = f1_score(y_test, y_pred_PCA)
"F1 score is = {0}".format(f1_PCA)

'F1 score is = 0.9859882005899705'

In [19]:
# creating KNN instance
knn = KNeighborsClassifier()
knnCV = GridSearchCV(knn, grid_params_nn, cv=10, scoring='accuracy', verbose=10, n_jobs=-1)

knnCV.fit(X_train, y_train.values.ravel())

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   34.4s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   43.5s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   53.4s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'algorithm': ['auto'],
                         'metric': ['minkowski', 'manhattan'],
                         'n_neighbors': range(1, 32, 2), 'p': [1, 2],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=10)

In [20]:
knn = knnCV.best_estimator_
knn.fit(X_train, y_train.values.ravel())
# predict test values
y_pred_PCA = knn.predict(X_test)

# calculate F1 score
f1_PCA = f1_score(y_test, y_pred_PCA)
print("F1 score is = {0}".format(f1_PCA))
print("Accuracy is = {0}".format(accuracy_score(y_test, y_pred_PCA)))

F1 score is = 0.9882783882783883
Accuracy is = 0.9962864105837298


## KNN with feature tools features

In [21]:
# get data
X, y = functions.get_data(type="__FTs_new")
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10, stratify=y)

In [22]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train.values.ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [23]:
# predict test values
y_pred_FT = knn.predict(X_test)

In [24]:
# calculate F1 score
f1_FT = f1_score(y_test, y_pred_FT)
"F1 score is = {0}".format(f1_FT)

'F1 score is = 0.13544018058690743'

In [25]:
# creating KNN instance
knn = KNeighborsClassifier()
knnCV = GridSearchCV(knn, grid_params_nn, cv=10, scoring='accuracy', verbose=10, n_jobs=-1)

knnCV.fit(X_train, y_train.values.ravel())

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1899s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 122 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 148 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 178 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 242 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 314 tas

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'algorithm': ['auto'],
                         'metric': ['minkowski', 'manhattan'],
                         'n_neighbors': range(1, 32, 2), 'p': [1, 2],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=10)

In [26]:
knn = knnCV.best_estimator_
knn.fit(X_train, y_train.values.ravel())
# predict test values
y_pred_FT = knn.predict(X_test)

# calculate F1 score
f1_FT = f1_score(y_test, y_pred_FT)
print("F1 score is = {0}".format(f1_FT))
print("Accuracy is = {0}".format(accuracy_score(y_test, y_pred_FT)))

F1 score is = 0.010057471264367816
Accuracy is = 0.8400835557618661


## KNN with polynominal features

In [27]:
# get data
X, y = functions.get_data(type="__polynomial_new")
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10, stratify=y)

In [28]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train.values.ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [29]:
# predict test values
y_pred_poly = knn.predict(X_test)

In [30]:
# calculate F1 score
f1_poly = f1_score(y_test, y_pred_poly)
"F1 score is = {0}".format(f1_poly)

'F1 score is = 0.14102564102564102'

In [31]:
# creating KNN instance
knn = KNeighborsClassifier()
knnCV = GridSearchCV(knn, grid_params_nn, cv=10, scoring='accuracy', verbose=10, n_jobs=-1)

knnCV.fit(X_train, y_train.values.ravel())

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'algorithm': ['auto'],
                         'metric': ['minkowski', 'manhattan'],
                         'n_neighbors': range(1, 32, 2), 'p': [1, 2],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=10)

In [32]:
knn = knnCV.best_estimator_
knn.fit(X_train, y_train.values.ravel())
# predict test values
y_pred_poly = knn.predict(X_test)

# calculate F1 score
f1_poly = f1_score(y_test, y_pred_poly)
print("F1 score is = {0}".format(f1_poly))
print("Accuracy is = {0}".format(accuracy_score(y_test, y_pred_poly)))

F1 score is = 0.030790762771168653
Accuracy is = 0.839271208077057
