In [40]:
# This cell runs a grid search over a predefined set of hyperparameters for the KNN model,
# scores every combination with cross-validation on the training split, and picks the best one.
# The model fitted with these optimal parameters is then evaluated on the held-out test set.

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from src.utils import load_data, cross_validate_model, initialize_model_pipeline
import pandas as pd
import numpy as np

# Load the dataset
data = load_data()

# Separate the feature columns from the target column
X = data.drop('increase_stock', axis=1)
y = data['increase_stock']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified -- if the target classes are imbalanced,
# consider passing stratify=y so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# K-Nearest Neighbors classifier
# NOTE(review): the features are fed to KNN as returned by load_data(); KNN is
# distance-based, so confirm that no additional scaling is needed here.
knn = KNeighborsClassifier()

# Parameters for grid search: k = 1..15, both weighting schemes, two distance metrics
param_grid = {
    'n_neighbors': list(range(1, 16)),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}

# Grid search for hyperparameter tuning: 5-fold cross-validation, scored by accuracy
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters
best_params = grid_search.best_params_

# GridSearchCV refits the winning configuration on the whole training split by
# default (refit=True), so reuse that fitted estimator instead of training an
# identical model a second time.
knn_best = grid_search.best_estimator_

# Predictions and evaluation on the held-out test set
y_pred = knn_best.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Best Parameters: {'metric': 'manhattan', 'n_neighbors': 12, 'weights': 'distance'}
Accuracy: 0.840625
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.93      0.91       270
           1       0.49      0.34      0.40        50

    accuracy                           0.84       320
   macro avg       0.68      0.64      0.65       320
weighted avg       0.82      0.84      0.83       320


In [42]:
# Cross-validate the tuned KNN with 10 splits; the value returned by
# cross_validate_model is shown as the cell output.
# NOTE(review): `data` is passed whole, so it still contains the
# 'increase_stock' column -- confirm cross_validate_model excludes it from the features.
cross_validate_model(
    knn_best,
    data,
    data['increase_stock'],
    n_splits=10,
)

array([0.85   , 0.8625 , 0.86875, 0.93125, 0.85625, 0.825  , 0.875  ,
       0.85   , 0.85625, 0.8875 ])

In [27]:
# Use as many splits as there are rows in `data` (presumably leave-one-out,
# if cross_validate_model splits K-fold style -- verify in src.utils).
leave_out_one = cross_validate_model(
    knn_best,
    data,
    data['increase_stock'],
    n_splits=data.shape[0],
)

In [4]:
# Average of the scores collected in `leave_out_one`
leave_out_one.mean()

0.86875

In [49]:
# Export the pipeline-transformed features together with the target column as a CSV.
test = initialize_model_pipeline(KNeighborsClassifier())

# test[0] is the first pipeline step; .toarray() is called on its output, so that
# output is expected to be a sparse matrix.
thea_matrix = test[0].fit_transform(data).toarray()

# test[:-1] is the pipeline without its final step; it provides the column names.
thea_columns = test[:-1].get_feature_names_out()

# Build the frame on data's own index: the label assignment below aligns on the
# index, so a fresh 0..n-1 index would silently produce NaN / misplaced labels
# whenever `data` does not carry a default RangeIndex.
thea_data = pd.DataFrame(thea_matrix, columns=thea_columns, index=data.index)
thea_data['increase_stock'] = data['increase_stock']

# NOTE(review): the index is written as the first, unnamed CSV column -- pass
# index=False if consumers of this file do not need it.
thea_data.to_csv('../data/thea.csv')