In [54]:
# This code conducts a grid search over a predefined set of hyperparameters for the KNN model, 
#evaluates the performance using cross-validation, and identifies the best combination of hyperparameters. 
#The model is then retrained with these optimal parameters and evaluated on the test set.

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from src.utils import cross_validate_model, initialize_model_pipeline, load_data
import pandas as pd
import numpy as np

data = pd.read_csv('../data/thea.csv')

# Preparing the data
X = data.drop('increase_stock', axis=1) 
y = data['increase_stock']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# K-Nearest Neighbors classifier
knn = KNeighborsClassifier()

# Parameters for grid search
param_grid = {
    'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Grid search for hyperparameter tuning
grid_search = GridSearchCV(knn, param_grid, cv=160, scoring='accuracy')

grid_search.fit(X_train, y_train)

# Best parameters
best_params = grid_search.best_params_

# Training the model with the best parameters
knn_best = KNeighborsClassifier(**best_params)
knn_best.fit(X_train, y_train)

# Predictions and evaluation
y_pred = knn_best.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Best Parameters: {'metric': 'manhattan', 'n_neighbors': 2, 'weights': 'uniform'}
Accuracy: 0.84375
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.99      0.91       270
           1       0.50      0.08      0.14        50

    accuracy                           0.84       320
   macro avg       0.68      0.53      0.53       320
weighted avg       0.80      0.84      0.79       320


In [53]:
cross_validate_model(knn_best, data, data['increase_stock'], n_splits=160)

array([0.9, 0.9, 0.8, 0.9, 0.8, 1. , 0.8, 1. , 0.9, 0.8, 0.9, 0.8, 0.9,
       0.9, 0.8, 0.8, 1. , 0.8, 1. , 0.8, 0.8, 0.7, 1. , 0.8, 0.9, 0.8,
       0.9, 0.9, 0.9, 1. , 0.8, 0.9, 0.8, 1. , 0.6, 0.8, 1. , 0.7, 0.9,
       0.9, 0.8, 1. , 0.8, 0.8, 1. , 0.7, 0.8, 0.9, 0.9, 0.8, 0.8, 1. ,
       0.9, 0.7, 0.7, 1. , 0.9, 0.8, 1. , 0.8, 1. , 1. , 0.8, 1. , 0.9,
       0.8, 0.7, 1. , 0.9, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1. , 1. , 0.9,
       0.8, 0.8, 0.8, 0.8, 0.8, 0.7, 1. , 0.8, 0.5, 0.8, 0.8, 0.6, 0.8,
       0.9, 0.9, 0.8, 0.8, 0.8, 0.9, 0.9, 0.7, 0.6, 1. , 0.7, 0.7, 0.9,
       0.8, 1. , 0.9, 0.9, 0.9, 1. , 0.8, 0.8, 1. , 0.7, 0.9, 0.7, 0.9,
       0.9, 0.9, 0.8, 1. , 0.8, 0.8, 1. , 0.8, 0.6, 0.7, 0.8, 0.9, 0.8,
       0.8, 0.9, 1. , 0.9, 0.8, 0.9, 1. , 0.7, 0.8, 1. , 0.9, 0.8, 0.8,
       0.9, 0.6, 0.8, 0.9, 0.9, 1. , 0.7, 0.9, 0.9, 0.9, 0.7, 0.8, 0.9,
       0.9, 0.9, 0.9, 1. ])

In [27]:
leave_out_one = cross_validate_model(knn_best, data, data['increase_stock'],n_splits=len(data))