# KNN Analysis Notebook

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

# Added path to access preprocessing functions
sys.path.append(os.path.abspath("../src"))


## 1. Data Preparation

### 1. Traffic Dataset

In [8]:
from data_processing.preprocess import load_traffic_data

(
    x_train_traffic,
    x_test_traffic,
    y_train_traffic,
    y_test_traffic,
    label_encoder_traffic,
    scaler_traffic
) = load_traffic_data()

display(x_train_traffic)

Unnamed: 0,Time,CarCount,BikeCount,BusCount,TruckCount,Total,Day,Day of the week_Monday,Day of the week_Saturday,Day of the week_Sunday,Day of the week_Thursday,Day of the week_Tuesday,Day of the week_Wednesday
232,-0.217344,-0.921752,-0.987072,1.685316,0.963500,-0.345112,-0.496058,False,False,False,True,False,False
527,-0.072904,-0.137105,-0.357215,-0.638451,0.587406,-0.228107,-0.151766,False,False,True,False,False,False
2037,-0.939544,0.909091,0.272642,-0.356782,-0.446853,0.590930,1.684459,False,False,False,False,True,False
473,1.515937,-1.183302,-1.065805,-0.990537,2.279829,-0.963568,-0.266530,False,True,False,False,False,False
909,-0.072904,-0.420450,-0.750876,-0.356782,0.493382,-0.478832,0.307290,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,-0.795104,1.236028,0.351374,0.699475,-1.105018,0.992091,-0.610822,False,False,False,False,False,True
1602,0.649296,1.737330,-0.042287,1.544481,-0.634900,1.577117,1.110639,False,False,False,True,False,False
531,0.071536,-1.052527,-0.357215,-0.215948,0.775453,-0.796418,-0.151766,False,False,True,False,False,False
2513,-1.083984,0.538563,-0.829608,-0.990537,0.023265,0.005904,-1.299406,False,False,True,False,False,False


### 2. Congressional Voting 

In [9]:
from data_processing.preprocess import load_congressional_voting_data

(
    x_train_voting,
    x_test_voting,
    y_train_voting,
    y_test_voting,
    label_encoder_voting,
    scaler_voting
) = load_congressional_voting_data()

display(x_train_voting)

Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-crporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
202,-0.911770,0.818737,-1.204829,1.156919,1.0,0.72111,-1.188570,-1.273063,-0.948683,1.082326,1.366720,1.221394,0.887826,-1.204829,-0.864365,0.422249
73,-0.911770,-1.221394,-1.204829,1.156919,1.0,-1.38675,-1.188570,-1.273063,-0.948683,-0.923936,-0.731679,1.221394,-1.126347,0.829993,-0.864365,-2.368268
22,-0.911770,-1.221394,-1.204829,-0.864365,-1.0,-1.38675,0.841347,0.785507,1.054093,1.082326,-0.731679,1.221394,0.887826,0.829993,1.156919,0.422249
31,1.096767,0.818737,0.829993,-0.864365,-1.0,-1.38675,0.841347,0.785507,-0.948683,1.082326,-0.731679,-0.818737,-1.126347,-1.204829,-0.864365,0.422249
41,-0.911770,-1.221394,0.829993,-0.864365,-1.0,0.72111,0.841347,0.785507,1.054093,-0.923936,-0.731679,-0.818737,-1.126347,-1.204829,1.156919,0.422249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,1.096767,0.818737,0.829993,-0.864365,-1.0,-1.38675,0.841347,0.785507,1.054093,-0.923936,1.366720,-0.818737,-1.126347,-1.204829,1.156919,0.422249
212,1.096767,0.818737,0.829993,-0.864365,-1.0,-1.38675,0.841347,0.785507,1.054093,-0.923936,1.366720,-0.818737,-1.126347,-1.204829,1.156919,0.422249
197,-0.911770,-1.221394,-1.204829,1.156919,1.0,0.72111,-1.188570,-1.273063,-0.948683,-0.923936,-0.731679,1.221394,0.887826,0.829993,-0.864365,-2.368268
122,-0.911770,-1.221394,-1.204829,1.156919,1.0,0.72111,0.841347,-1.273063,-0.948683,1.082326,-0.731679,1.221394,-1.126347,0.829993,1.156919,0.422249


### 3. Amazon Reviews

In [10]:
from data_processing.preprocess import load_amazon_review_data

(
    x_train_amazon,
    x_test_amazon,
    y_train_amazon,
    y_test_amazon,
    label_encoder_amazon,
    scaler_amazon
) = load_amazon_review_data()

display(x_train_amazon)


array([[ 1.26347948e+01,  6.93827250e+00, -2.40156227e+00, ...,
        -4.31372439e+00, -1.04021262e+00, -4.49659272e+00],
       [ 1.86835778e+01,  1.24820789e+00, -3.24821678e+00, ...,
         1.37686828e-03,  3.32584535e+00, -2.57219783e+00],
       [ 1.51260608e+01, -3.77435393e+00,  7.16772543e+00, ...,
         7.14202867e-01,  6.04863923e+00, -2.53138196e-02],
       ...,
       [ 1.80839517e+01,  7.58503259e+00,  1.69092047e+00, ...,
        -1.92480392e+00, -1.30676196e+00,  1.85631845e+00],
       [-4.60756206e+00, -1.17614456e+01, -5.27940835e+00, ...,
        -5.26509143e+00, -1.07395139e+00, -1.90314147e+00],
       [-9.61616174e+00, -8.22084311e+00,  7.88243754e+00, ...,
        -7.93074487e+00,  3.35659361e+00, -3.63000869e+00]])

### 4. Wine Reviews 

In [3]:
from data_processing.preprocess import load_wine_review_data

(
    x_train_wine,
    x_test_wine,
    y_train_wine,
    y_test_wine,
    label_encoder_wine,
    scaler_wine
) = load_wine_review_data(data_size=1000)

display(x_train_wine)

Loading processed data...
Data shape after selecting top 10 countries: (143344, 10)
Splitting data into training and validation sets...


<800x3717 sparse matrix of type '<class 'numpy.float64'>'
	with 21910 stored elements in Compressed Sparse Row format>

## Grid Search 

#### KNN parameter setting

In [13]:
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    average_precision_score,
    f1_score,
    precision_score,
    recall_score,
)
import time

# Define test parameters
neighbor_range = range(1, 21)
weight_functions = ["uniform", "distance"]
computation_algorithms = ["ball_tree", "kd_tree", "brute"]

# compute valid metrics for our computation algorithms
valid_metrics_ball_tree = sorted(sklearn.neighbors.VALID_METRICS["ball_tree"])
valid_metrics_kd_tree = sorted(sklearn.neighbors.VALID_METRICS["kd_tree"])
valid_metrics_brute_force = sorted(sklearn.neighbors.VALID_METRICS["brute"])
print(
    np.intersect1d(
        valid_metrics_ball_tree, valid_metrics_kd_tree, valid_metrics_brute_force
    )
)

metrics = ["euclidean", "cityblock", "manhattan", "l1", "l2"]

['chebyshev' 'cityblock' 'euclidean' 'infinity' 'l1' 'l2' 'manhattan'
 'minkowski' 'p']


### 1. Traffic Dataset 

#### Base Model

In [None]:
# computation of base model
base_model = KNeighborsClassifier(
    n_neighbors=10, algorithm="ball_tree", metric="euclidean"
)
base_model.fit(x_train_traffic, y_train_traffic)

# computation of base accuracy and probabilites
base_model_prediction = base_model.predict(x_test_traffic)
base_model_probabilites = base_model.predict_proba(x_test_traffic)

# Computation of base model evaluation metrics
base_model_accuracy = average_precision_score(
    y_test_traffic, base_model_probabilites, average="weighted"
)
base_model_f1_score = f1_score(
    y_test_traffic, base_model_prediction, average="weighted"
)
base_model_precision = precision_score(
    y_test_traffic, base_model_prediction, average="weighted"
)
base_model_recall = recall_score(
    y_test_traffic, base_model_prediction, average="weighted"
)

#### Grid Search

In [16]:
# dataframe initalisiation for storing results
test_results_traffic = pd.DataFrame(
    columns=[
        "Number Of Neighbors",
        "Metric",
        "Computation Algorithm",
        "Weight Function",
        "Accuracy",
        "Precision",
        "Recall",
        "F1 Score",
    ]
)

base_row = pd.DataFrame(
    {
        "Number Of Neighbors": [10],
        "Metric": "euclidean",
        "Computation Algorithm": "ball_tree",
        "Weight Function": "uniform",
        "Accuracy": [base_model_accuracy],
        "Precision": [base_model_precision],
        "Recall": [base_model_recall],
        "F1 Score": [base_model_f1_score],
    }
)

test_results_traffic = pd.concat([test_results_traffic, base_row], ignore_index=True)

In [11]:
for weight_function in weight_functions:
    for computation_algorithm in computation_algorithms:
        for metric in metrics:
            for number_of_neigbors in neighbor_range:
                current_model = KNeighborsClassifier(
                    n_neighbors=number_of_neigbors,
                    algorithm=computation_algorithm,
                    metric=metric,
                    weights=weight_function,
                )
                current_model.fit(x_train_traffic, y_train_traffic)

                # compute predicitons and probabilites for current model
                current_model_prediction = current_model.predict(x_test_traffic)
                current_model_probabilites = current_model.predict_proba(x_test_traffic)

                # compute evaluation metrics for current model
                current_model_accuracy = average_precision_score(
                    y_test_traffic, current_model_probabilites, average="weighted"
                )
                current_model_f1_score = f1_score(
                    y_test_traffic, current_model_prediction, average="weighted"
                )
                current_model_precision = precision_score(
                    y_test_traffic, current_model_prediction, average="weighted"
                )
                current_model_recall = recall_score(
                    y_test_traffic, current_model_prediction, average="weighted"
                )

                # create row and append to results dataframe
                current_row = pd.DataFrame(
                    {
                        "Number Of Neighbors": [number_of_neigbors],
                        "Metric": metric,
                        "Computation Algorithm": computation_algorithm,
                        "Weight Function": weight_function,
                        "Accuracy": [current_model_accuracy],
                        "Precision": [current_model_precision],
                        "Recall": [current_model_recall],
                        "F1 Score": [current_model_f1_score],
                    }
                )

                test_results_traffic = pd.concat(
                    [test_results_traffic, current_row], ignore_index=True
                )

In [12]:
display(test_results_traffic)
test_results_traffic.info()

Unnamed: 0,Number Of Neighbors,Metric,Computation Algorithm,Weight Function,Accuracy,Precision,Recall,F1 Score
0,10,euclidean,ball_tree,uniform,0.903359,0.841229,0.845465,0.842211
1,1,euclidean,ball_tree,uniform,0.782881,0.852304,0.854423,0.853100
2,2,euclidean,ball_tree,uniform,0.842477,0.851341,0.838746,0.841306
3,3,euclidean,ball_tree,uniform,0.868601,0.841833,0.845838,0.843153
4,4,euclidean,ball_tree,uniform,0.881112,0.847577,0.843598,0.844415
...,...,...,...,...,...,...,...,...
596,16,l2,brute,distance,0.858737,0.865735,0.869728,0.866496
597,17,l2,brute,distance,0.859163,0.865935,0.870101,0.866591
598,18,l2,brute,distance,0.859015,0.867739,0.871967,0.868135
599,19,l2,brute,distance,0.858952,0.865418,0.869728,0.865873


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601 entries, 0 to 600
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Number Of Neighbors    601 non-null    object 
 1   Metric                 601 non-null    object 
 2   Computation Algorithm  601 non-null    object 
 3   Weight Function        601 non-null    object 
 4   Accuracy               601 non-null    float64
 5   Precision              601 non-null    float64
 6   Recall                 601 non-null    float64
 7   F1 Score               601 non-null    float64
dtypes: float64(4), object(4)
memory usage: 37.7+ KB


#### Model selection

In [15]:
top_10_traffic_models = test_results_traffic.sort_values(by="F1 Score", ascending=False).head(10)

display(top_10_traffic_models)

Unnamed: 0,Number Of Neighbors,Metric,Computation Algorithm,Weight Function,Accuracy,Precision,Recall,F1 Score
468,8,l1,kd_tree,distance,0.860481,0.871683,0.874207,0.872456
528,8,cityblock,brute,distance,0.86061,0.871683,0.874207,0.872456
348,8,manhattan,ball_tree,distance,0.860528,0.871683,0.874207,0.872456
568,8,l1,brute,distance,0.86061,0.871683,0.874207,0.872456
328,8,cityblock,ball_tree,distance,0.860528,0.871683,0.874207,0.872456
448,8,manhattan,kd_tree,distance,0.860481,0.871683,0.874207,0.872456
548,8,manhattan,brute,distance,0.86061,0.871683,0.874207,0.872456
428,8,cityblock,kd_tree,distance,0.860481,0.871683,0.874207,0.872456
368,8,l1,ball_tree,distance,0.860528,0.871683,0.874207,0.872456
557,17,manhattan,brute,distance,0.858919,0.870951,0.874953,0.871529


### 2. Congressional Voting Dataset

#### Base Model

In [22]:
# computation of base model
base_model = KNeighborsClassifier(
    n_neighbors=10, algorithm="ball_tree", metric="euclidean"
)
base_model.fit(x_train_voting, y_train_voting)

# computation of base accuracy and probabilites
base_model_prediction = base_model.predict(x_test_voting)
base_model_probabilites = base_model.predict_proba(x_test_voting)

# Computation of base model evaluation metrics
base_model_accuracy = average_precision_score(
    y_test_voting, base_model_prediction, average="weighted"
)
base_model_f1_score = f1_score(
    y_test_voting, base_model_prediction, average="weighted"
)
base_model_precision = precision_score(
    y_test_voting, base_model_prediction, average="weighted"
)
base_model_recall = recall_score(
    y_test_voting, base_model_prediction, average="weighted"
)

#### Grid search

In [24]:
# dataframe initalisiation for storing results
test_results_voting = pd.DataFrame(
    columns=[
        "Number Of Neighbors",
        "Metric",
        "Computation Algorithm",
        "Weight Function",
        "Accuracy",
        "Precision",
        "Recall",
        "F1 Score",
    ]
)

base_row = pd.DataFrame(
    {
        "Number Of Neighbors": [10],
        "Metric": "euclidean",
        "Computation Algorithm": "ball_tree",
        "Weight Function": "uniform",
        "Accuracy": [base_model_accuracy],
        "Precision": [base_model_precision],
        "Recall": [base_model_recall],
        "F1 Score": [base_model_f1_score],
    }
)

test_results_voting = pd.concat([test_results_voting, base_row], ignore_index=True)
display(test_results_voting)

Unnamed: 0,Number Of Neighbors,Metric,Computation Algorithm,Weight Function,Accuracy,Precision,Recall,F1 Score
0,10,euclidean,ball_tree,uniform,0.789502,0.881313,0.878788,0.87924


In [28]:
for weight_function in weight_functions:
    for computation_algorithm in computation_algorithms:
        for metric in metrics:
            for number_of_neigbors in neighbor_range:
                current_model = KNeighborsClassifier(
                    n_neighbors=number_of_neigbors,
                    algorithm=computation_algorithm,
                    metric=metric,
                    weights=weight_function,
                )
                current_model.fit(x_train_voting, y_train_voting)

                # compute predicitons and probabilites for current model
                current_model_prediction = current_model.predict(x_test_voting)
                current_model_probabilites = current_model.predict_proba(x_test_voting)

                # compute evaluation metrics for current model
                current_model_accuracy = average_precision_score(
                    y_test_voting, current_model_prediction, average="weighted"
                )
                current_model_f1_score = f1_score(
                    y_test_voting, current_model_prediction, average="weighted"
                )
                current_model_precision = precision_score(
                    y_test_voting, current_model_prediction, average="weighted"
                )
                current_model_recall = recall_score(
                    y_test_voting, current_model_prediction, average="weighted"
                )

                # create row and append to results dataframe
                current_row = pd.DataFrame(
                    {
                        "Number Of Neighbors": [number_of_neigbors],
                        "Metric": metric,
                        "Computation Algorithm": computation_algorithm,
                        "Weight Function": weight_function,
                        "Accuracy": [current_model_accuracy],
                        "Precision": [current_model_precision],
                        "Recall": [current_model_recall],
                        "F1 Score": [current_model_f1_score],
                    }
                )

                test_results_voting = pd.concat(
                    [test_results_voting, current_row], ignore_index=True
                )

In [None]:
display(test_results_voting)
test_results_voting.info()

Unnamed: 0,Number Of Neighbors,Metric,Computation Algorithm,Weight Function,Accuracy,Precision,Recall,F1 Score
0,10,euclidean,ball_tree,uniform,0.903359,0.841229,0.845465,0.842211
1,2,l2,ball_tree,distance,0.789502,0.881313,0.878788,0.879240
2,1,euclidean,ball_tree,uniform,0.789502,0.881313,0.878788,0.879240
3,2,euclidean,ball_tree,uniform,0.842462,0.901216,0.893939,0.892014
4,3,euclidean,ball_tree,uniform,0.815159,0.894801,0.893939,0.894163
...,...,...,...,...,...,...,...,...
597,16,l2,brute,distance,0.784767,0.886586,0.878788,0.879461
598,17,l2,brute,distance,0.809105,0.898673,0.893939,0.894457
599,18,l2,brute,distance,0.809105,0.898673,0.893939,0.894457
600,19,l2,brute,distance,0.809105,0.898673,0.893939,0.894457


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602 entries, 0 to 601
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Number Of Neighbors    602 non-null    object 
 1   Metric                 602 non-null    object 
 2   Computation Algorithm  602 non-null    object 
 3   Weight Function        602 non-null    object 
 4   Accuracy               602 non-null    float64
 5   Precision              602 non-null    float64
 6   Recall                 602 non-null    float64
 7   F1 Score               602 non-null    float64
dtypes: float64(4), object(4)
memory usage: 37.8+ KB


#### Model selection

In [30]:
top_10_voting_models = test_results_voting.sort_values(by="F1 Score", ascending=False).head(10)

display(top_10_voting_models)

Unnamed: 0,Number Of Neighbors,Metric,Computation Algorithm,Weight Function,Accuracy,Precision,Recall,F1 Score
514,13,euclidean,brute,distance,0.835065,0.911448,0.909091,0.90943
394,13,l2,ball_tree,distance,0.835065,0.911448,0.909091,0.90943
494,13,l2,kd_tree,distance,0.835065,0.911448,0.909091,0.90943
594,13,l2,brute,distance,0.835065,0.911448,0.909091,0.90943
414,13,euclidean,kd_tree,distance,0.835065,0.911448,0.909091,0.90943
314,13,euclidean,ball_tree,distance,0.835065,0.911448,0.909091,0.90943
316,15,euclidean,ball_tree,distance,0.809105,0.898673,0.893939,0.894457
338,17,cityblock,ball_tree,distance,0.809105,0.898673,0.893939,0.894457
337,16,cityblock,ball_tree,distance,0.809105,0.898673,0.893939,0.894457
327,6,cityblock,ball_tree,distance,0.809105,0.898673,0.893939,0.894457


### 3. Amazon Reviews Dataset 

#### Base Model

In [33]:
# computation of base model
base_model = KNeighborsClassifier(
    n_neighbors=10, algorithm="ball_tree", metric="euclidean"
)
base_model.fit(x_train_amazon, y_train_amazon)

# computation of base accuracy and probabilites
base_model_prediction = base_model.predict(x_test_amazon)
base_model_probabilites = base_model.predict_proba(x_test_amazon)

# Computation of base model evaluation metrics
base_model_accuracy = average_precision_score(
    y_test_amazon, base_model_probabilites, average="weighted"
)
base_model_f1_score = f1_score(y_test_amazon, base_model_prediction, average="weighted")
base_model_precision = precision_score(
    y_test_amazon, base_model_prediction, average="weighted"
)
base_model_recall = recall_score(
    y_test_amazon, base_model_prediction, average="weighted"
)

  _warn_prf(average, modifier, msg_start, len(result))


#### Grid Search

In [34]:
test_results_amazon = pd.DataFrame(
    columns=[
        "Number Of Neighbors",
        "Metric",
        "Computation Algorithm",
        "Weight Function",
        "Accuracy",
        "Precision",
        "Recall",
        "F1 Score",
    ]
)

base_row = pd.DataFrame(
    {
        "Number Of Neighbors": [10],
        "Metric": "euclidean",
        "Computation Algorithm": "ball_tree",
        "Weight Function": "uniform",
        "Accuracy": [base_model_accuracy],
        "Precision": [base_model_precision],
        "Recall": [base_model_recall],
        "F1 Score": [base_model_f1_score],
    }
)

test_results_amazon = pd.concat([test_results_amazon, base_row], ignore_index=True)
display(test_results_amazon)

Unnamed: 0,Number Of Neighbors,Metric,Computation Algorithm,Weight Function,Accuracy,Precision,Recall,F1 Score
0,10,euclidean,ball_tree,uniform,0.233246,0.203092,0.173333,0.140627


In [36]:
for weight_function in weight_functions:
    for computation_algorithm in computation_algorithms:
        for metric in metrics:
            for number_of_neigbors in neighbor_range:
                current_model = KNeighborsClassifier(
                    n_neighbors=number_of_neigbors,
                    algorithm=computation_algorithm,
                    metric=metric,
                    weights=weight_function,
                )
                current_model.fit(x_train_amazon, y_train_amazon)

                # compute predicitons and probabilites for current model
                current_model_prediction = current_model.predict(x_test_amazon)
                current_model_probabilites = current_model.predict_proba(x_test_amazon)

                # compute evaluation metrics for current model
                current_model_accuracy = average_precision_score(
                    y_test_amazon, current_model_probabilites, average="weighted"
                )
                current_model_f1_score = f1_score(
                    y_test_amazon, current_model_prediction, average="weighted"
                )
                current_model_precision = precision_score(
                    y_test_amazon, current_model_prediction, average="weighted"
                )
                current_model_recall = recall_score(
                    y_test_amazon, current_model_prediction, average="weighted"
                )

                # create row and append to results dataframe
                current_row = pd.DataFrame(
                    {
                        "Number Of Neighbors": [number_of_neigbors],
                        "Metric": metric,
                        "Computation Algorithm": computation_algorithm,
                        "Weight Function": weight_function,
                        "Accuracy": [current_model_accuracy],
                        "Precision": [current_model_precision],
                        "Recall": [current_model_recall],
                        "F1 Score": [current_model_f1_score],
                    }
                )

                test_results_amazon = pd.concat(
                    [test_results_amazon, current_row], ignore_index=True
                )

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [37]:
display(test_results_amazon)
test_results_amazon.info()

Unnamed: 0,Number Of Neighbors,Metric,Computation Algorithm,Weight Function,Accuracy,Precision,Recall,F1 Score
0,10,euclidean,ball_tree,uniform,0.233246,0.203092,0.173333,0.140627
1,1,euclidean,ball_tree,uniform,0.141168,0.294118,0.200000,0.198496
2,2,euclidean,ball_tree,uniform,0.141290,0.154499,0.133333,0.112142
3,3,euclidean,ball_tree,uniform,0.161518,0.159387,0.160000,0.127015
4,4,euclidean,ball_tree,uniform,0.190815,0.223355,0.177778,0.152559
...,...,...,...,...,...,...,...,...
596,16,l2,brute,distance,0.316493,0.228552,0.173333,0.146032
597,17,l2,brute,distance,0.318279,0.218634,0.164444,0.136228
598,18,l2,brute,distance,0.326468,0.225936,0.168889,0.141410
599,19,l2,brute,distance,0.329569,0.221107,0.160000,0.132554


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601 entries, 0 to 600
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Number Of Neighbors    601 non-null    object 
 1   Metric                 601 non-null    object 
 2   Computation Algorithm  601 non-null    object 
 3   Weight Function        601 non-null    object 
 4   Accuracy               601 non-null    float64
 5   Precision              601 non-null    float64
 6   Recall                 601 non-null    float64
 7   F1 Score               601 non-null    float64
dtypes: float64(4), object(4)
memory usage: 37.7+ KB


#### Model selection

In [38]:
top_10_amazon_models = test_results_amazon.sort_values(by="F1 Score", ascending=False).head(10)

display(top_10_amazon_models)

Unnamed: 0,Number Of Neighbors,Metric,Computation Algorithm,Weight Function,Accuracy,Precision,Recall,F1 Score
583,3,l2,brute,distance,0.225814,0.301045,0.208889,0.201083
383,3,l2,ball_tree,distance,0.225814,0.301045,0.208889,0.201083
303,3,euclidean,ball_tree,distance,0.225814,0.301045,0.208889,0.201083
483,3,l2,kd_tree,distance,0.225814,0.301045,0.208889,0.201083
403,3,euclidean,kd_tree,distance,0.225814,0.301045,0.208889,0.201083
503,3,euclidean,brute,distance,0.225814,0.301045,0.208889,0.201083
502,2,euclidean,brute,distance,0.187754,0.294118,0.2,0.198496
582,2,l2,brute,distance,0.187754,0.294118,0.2,0.198496
581,1,l2,brute,distance,0.141168,0.294118,0.2,0.198496
81,1,l2,ball_tree,uniform,0.141168,0.294118,0.2,0.198496


### 4. Wine Reviews Dataset 


#### Base Model 

In [12]:
# computation of base model
base_model = KNeighborsClassifier(
    n_neighbors=10, algorithm="ball_tree", metric="euclidean"
)
base_model.fit(x_train_wine, y_train_wine)

# computation of base accuracy and probabilites
base_model_prediction = base_model.predict(x_test_wine)
base_model_probabilites = base_model.predict_proba(x_test_wine)

# Computation of base model evaluation metrics
base_model_accuracy = average_precision_score(
    y_test_wine, base_model_probabilites, average="weighted"
)
base_model_f1_score = f1_score(y_test_wine, base_model_prediction, average="weighted")
base_model_precision = precision_score(
    y_test_wine, base_model_prediction, average="weighted"
)
base_model_recall = recall_score(
    y_test_wine, base_model_prediction, average="weighted"
)



#### Grid Search

In [14]:
test_results_wine = pd.DataFrame(
    columns=[
        "Number Of Neighbors",
        "Metric",
        "Computation Algorithm",
        "Weight Function",
        "Accuracy",
        "Precision",
        "Recall",
        "F1 Score",
    ]
)

base_row = pd.DataFrame(
    {
        "Number Of Neighbors": [10],
        "Metric": "euclidean",
        "Computation Algorithm": "ball_tree",
        "Weight Function": "uniform",
        "Accuracy": [base_model_accuracy],
        "Precision": [base_model_precision],
        "Recall": [base_model_recall],
        "F1 Score": [base_model_f1_score],
    }
)

test_results_wine = pd.concat([test_results_wine, base_row], ignore_index=True)
display(test_results_wine)

Unnamed: 0,Number Of Neighbors,Metric,Computation Algorithm,Weight Function,Accuracy,Precision,Recall,F1 Score
0,10,euclidean,ball_tree,uniform,0.473328,0.468193,0.48,0.458632


In [15]:
for weight_function in weight_functions:
    for computation_algorithm in computation_algorithms:
        for metric in metrics:
            for number_of_neigbors in neighbor_range:
                current_model = KNeighborsClassifier(
                    n_neighbors=number_of_neigbors,
                    algorithm=computation_algorithm,
                    metric=metric,
                    weights=weight_function,
                )
                current_model.fit(x_train_wine, y_train_wine)

                # compute predicitons and probabilites for current model
                current_model_prediction = current_model.predict(x_test_wine)
                current_model_probabilites = current_model.predict_proba(x_test_wine)

                # compute evaluation metrics for current model
                current_model_accuracy = average_precision_score(
                    y_test_wine, current_model_probabilites, average="weighted"
                )
                current_model_f1_score = f1_score(
                    y_test_wine, current_model_prediction, average="weighted"
                )
                current_model_precision = precision_score(
                    y_test_wine, current_model_prediction, average="weighted"
                )
                current_model_recall = recall_score(
                    y_test_wine, current_model_prediction, average="weighted"
                )

                # create row and append to results dataframe
                current_row = pd.DataFrame(
                    {
                        "Number Of Neighbors": [number_of_neigbors],
                        "Metric": metric,
                        "Computation Algorithm": computation_algorithm,
                        "Weight Function": weight_function,
                        "Accuracy": [current_model_accuracy],
                        "Precision": [current_model_precision],
                        "Recall": [current_model_recall],
                        "F1 Score": [current_model_f1_score],
                    }
                )

                test_results_wine = pd.concat(
                    [test_results_wine, current_row], ignore_index=True
                )

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [16]:
display(test_results_wine)
test_results_wine.info()

Unnamed: 0,Number Of Neighbors,Metric,Computation Algorithm,Weight Function,Accuracy,Precision,Recall,F1 Score
0,10,euclidean,ball_tree,uniform,0.473328,0.468193,0.480,0.458632
1,1,euclidean,ball_tree,uniform,0.248307,0.421366,0.425,0.417762
2,2,euclidean,ball_tree,uniform,0.345547,0.447453,0.465,0.438024
3,3,euclidean,ball_tree,uniform,0.413148,0.474103,0.455,0.444823
4,4,euclidean,ball_tree,uniform,0.440981,0.458223,0.465,0.454883
...,...,...,...,...,...,...,...,...
596,16,l2,brute,distance,0.431862,0.401877,0.405,0.388744
597,17,l2,brute,distance,0.440670,0.409891,0.395,0.381757
598,18,l2,brute,distance,0.440339,0.424355,0.400,0.390379
599,19,l2,brute,distance,0.445430,0.413535,0.390,0.375670


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601 entries, 0 to 600
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Number Of Neighbors    601 non-null    object 
 1   Metric                 601 non-null    object 
 2   Computation Algorithm  601 non-null    object 
 3   Weight Function        601 non-null    object 
 4   Accuracy               601 non-null    float64
 5   Precision              601 non-null    float64
 6   Recall                 601 non-null    float64
 7   F1 Score               601 non-null    float64
dtypes: float64(4), object(4)
memory usage: 37.7+ KB


#### Model selection

In [17]:
top_10_wine_models = test_results_wine.sort_values(by="F1 Score", ascending=False).head(10)

display(top_10_wine_models)

Unnamed: 0,Number Of Neighbors,Metric,Computation Algorithm,Weight Function,Accuracy,Precision,Recall,F1 Score
415,15,euclidean,kd_tree,distance,0.521879,0.521072,0.505,0.493325
515,15,euclidean,brute,distance,0.521879,0.521072,0.505,0.493325
315,15,euclidean,ball_tree,distance,0.521879,0.521072,0.505,0.493325
410,10,euclidean,kd_tree,distance,0.540151,0.505659,0.505,0.492766
510,10,euclidean,brute,distance,0.540151,0.505659,0.505,0.492766
310,10,euclidean,ball_tree,distance,0.540151,0.505659,0.505,0.492766
308,8,euclidean,ball_tree,distance,0.548363,0.496814,0.5,0.49209
408,8,euclidean,kd_tree,distance,0.548363,0.496814,0.5,0.49209
508,8,euclidean,brute,distance,0.548363,0.496814,0.5,0.49209
412,12,euclidean,kd_tree,distance,0.52886,0.502864,0.505,0.487545
