In [7]:
# Import libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Load data
X_train = pd.read_excel("C:/Users/Lakmini/Desktop/ML product/booking cancellation/X_train.xlsx")
X_test = pd.read_excel("C:/Users/Lakmini/Desktop/ML product/booking cancellation/X_test.xlsx")
y_train = pd.read_excel("C:/Users/Lakmini/Desktop/ML product/booking cancellation/y_train.xlsx").squeeze()
y_test = pd.read_excel("C:/Users/Lakmini/Desktop/ML product/booking cancellation/y_test.xlsx").squeeze()

# Identify numeric and categorical columns
numeric_cols = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_cols)
    ])

# Combine preprocessor and KNN model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('knn', KNeighborsClassifier())
])

# Training and test accuracy with default k=5
pipeline.set_params(knn__n_neighbors=5)
pipeline.fit(X_train, y_train)

train_acc = accuracy_score(y_train, pipeline.predict(X_train))
test_acc = accuracy_score(y_test, pipeline.predict(X_test))

print(f"Training Accuracy (k=5): {train_acc:.4f}")
print(f"Testing Accuracy  (k=5): {test_acc:.4f}")

# Hyperparameter tuning: find best k from 1 to 20
param_grid = {'knn__n_neighbors': list(range(1, 21))}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model results
best_k = grid_search.best_params_['knn__n_neighbors']
print(f"\nBest k: {best_k}")
print(f"Best CV Accuracy: {grid_search.best_score_:.4f}")

# Get the best model
best_model = grid_search.best_estimator_

# --- Training Evaluation ---
y_train_pred = best_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"\nTraining Accuracy with best k={best_k}: {train_accuracy:.4f}")
print("\nTraining Classification Report:\n", classification_report(y_train, y_train_pred))

# --- Testing Evaluation ---
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"\nTest Accuracy with best k={best_k}: {test_accuracy:.4f}")
print("\nTest Classification Report:\n", classification_report(y_test, y_test_pred))

Training Accuracy (k=5): 0.8764
Testing Accuracy  (k=5): 0.8246

Best k: 6
Best CV Accuracy: 0.8221

Training Accuracy with best k=6: 0.8661

Training Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.95      0.90     59921
           1       0.90      0.72      0.80     35268

    accuracy                           0.87     95189
   macro avg       0.88      0.84      0.85     95189
weighted avg       0.87      0.87      0.86     95189


Test Accuracy with best k=6: 0.8246

Test Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.92      0.87     14951
           1       0.84      0.66      0.74      8847

    accuracy                           0.82     23798
   macro avg       0.83      0.79      0.80     23798
weighted avg       0.83      0.82      0.82     23798

