In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from collections import Counter

In [None]:
class KNN:
    """Brute-force k-nearest-neighbours model that scores by neighbour average.

    For every query row, ``predict`` finds the ``k`` training rows with the
    smallest distance and returns the mean of their labels. With 0/1 labels
    that mean is the fraction of positive neighbours.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        """Store the hyper-parameters.

        Args:
            k: Number of nearest training rows averaged per query row.
            distance_metric: ``'euclidean'`` or ``'manhattan'``.
        """
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Memorise the training set.

        Inputs are converted to NumPy arrays so that neighbour lookups in
        ``predict`` are always positional. Without the conversion a pandas
        Series with a non-default index would be indexed by label, and plain
        lists could not be fancy-indexed at all.

        Args:
            X: 2-D array-like of numeric features, one row per sample.
            y: 1-D array-like of labels/targets aligned with the rows of ``X``.
        """
        self.X_train = np.asarray(X, dtype=float)
        self.y_train = np.asarray(y)
        self.classes = np.unique(self.y_train)

    def predict(self, X):
        """Return the mean label of the ``k`` nearest training rows per query.

        Args:
            X: 2-D array-like of query rows with the same feature layout as
                the data passed to ``fit``.

        Returns:
            1-D ``numpy.ndarray`` with one averaged label per query row.

        Raises:
            ValueError: If ``k`` is not a positive integer, or if the
                configured distance metric is unsupported.
        """
        # A non-integer or non-positive k would otherwise slice silently
        # (k=None -> every training row, k=0 -> mean of nothing -> NaN).
        if not isinstance(self.k, (int, np.integer)) or self.k < 1:
            raise ValueError("k must be a positive integer")

        # Row-wise iteration below relies on an ndarray; iterating a
        # DataFrame would yield column names instead of rows.
        queries = np.asarray(X, dtype=float)

        predictions = []
        for x_test in queries:
            distances = self.compute_distance(self.X_train, x_test)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_values = self.y_train[k_indices]
            predictions.append(np.mean(k_nearest_values))
        return np.array(predictions)

    def compute_distance(self, X1, X2):
        """Distance from every row of ``X1`` to the single point ``X2``.

        Args:
            X1: 2-D array-like of points, one per row.
            X2: 1-D array-like point, broadcast against the rows of ``X1``.

        Returns:
            1-D ``numpy.ndarray`` of distances, one per row of ``X1``.

        Raises:
            ValueError: If ``distance_metric`` is neither ``'euclidean'`` nor
                ``'manhattan'``.
        """
        if self.distance_metric == 'euclidean':
            diff = np.asarray(X1, dtype=float) - np.asarray(X2, dtype=float)
            return np.sqrt(np.sum(diff ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            diff = np.asarray(X1, dtype=float) - np.asarray(X2, dtype=float)
            return np.sum(np.abs(diff), axis=1)
        else:
            raise ValueError("distance_metric could only be euclidean or manhattan")

In [None]:
def preprocess_data(train_path, test_path):
    """Load the train/test CSVs and return min-max scaled feature matrices.

    Steps:
      1. Encode ``Gender`` as two float indicator columns (``isMale``,
         ``isFemale``); values other than ``"Male"``/``"Female"`` become NaN.
      2. Drop the identifier/text columns ``id``, ``CustomerId``, ``Surname``,
         ``Geography`` and the original ``Gender``.
      3. Fit a ``MinMaxScaler`` on the training features only and apply that
         same fitted scaler to the test features.

    The scaler must not be refit on the test set: doing so maps the same raw
    value to different scaled values in train and test, which distorts the
    distances a KNN model computes between them. As a consequence, scaled
    test values may fall slightly outside ``[0, 1]``.

    Args:
        train_path: Path to the training CSV; must contain an ``Exited``
            label column in addition to the feature columns.
        test_path: Path to the test CSV with the same feature columns.

    Returns:
        Tuple ``(X_train, y_train, X_test)`` of NumPy arrays. ``y_train`` holds
        the raw ``Exited`` values as floats (the label is not scaled).

    Raises:
        KeyError: If an expected column is missing from either file.
    """
    dropped_columns = ["id", "CustomerId", "Surname", "Geography", "Gender"]

    def _encode_and_trim(frame):
        # One indicator per gender; unmapped values stay NaN in both columns.
        # Geography is dropped rather than one-hot encoded.
        return frame.assign(
            isMale=frame["Gender"].map({"Male": 1.0, "Female": 0.0}),
            isFemale=frame["Gender"].map({"Female": 1.0, "Male": 0.0}),
        ).drop(columns=dropped_columns)

    train_data = _encode_and_trim(pd.read_csv(train_path))
    test_data = _encode_and_trim(pd.read_csv(test_path))

    # Separate the label before scaling so the scaler only ever sees features
    # (and therefore can be reused on the label-free test frame).
    y_train = train_data["Exited"].to_numpy(dtype=float)
    train_features = train_data.drop(columns=["Exited"])

    # Align test columns to the training order; a missing column fails loudly.
    test_features = test_data[train_features.columns]

    scaler = MinMaxScaler()
    X_train = np.asarray(scaler.fit_transform(train_features))
    # transform (not fit_transform): reuse the ranges learned from training.
    X_test = np.asarray(scaler.transform(test_features))

    return X_train, y_train, X_test

In [None]:
def cross_validate(X, y, knn, n_splits=5):
    """Score ``knn`` with stratified, shuffled k-fold cross-validation.

    The folds come from ``StratifiedKFold`` with a fixed ``random_state`` of
    42, so repeated calls split the data identically. For each fold the
    supplied model is refit on the training part (mutating ``knn`` in place)
    and its predictions on the held-out part are scored with ROC AUC.

    Args:
        X: Feature matrix; indexed with integer index arrays, so a NumPy
            array is expected.
        y: Label vector aligned with ``X``; indexed the same way.
        knn: Model exposing ``fit(X, y)`` and ``predict(X)``.
        n_splits: Number of folds.

    Returns:
        ``numpy.ndarray`` holding one AUC value per fold.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    def _fold_auc(fit_idx, holdout_idx):
        # Refit on this fold's training rows, then score the held-out rows.
        knn.fit(X[fit_idx], y[fit_idx])
        return roc_auc_score(y[holdout_idx], knn.predict(X[holdout_idx]))

    return np.array(
        [_fold_auc(fit_idx, holdout_idx) for fit_idx, holdout_idx in splitter.split(X, y)]
    )

In [None]:
# Load the competition CSVs and build the scaled train/test matrices.
X, y, X_test = preprocess_data(
    '/kaggle/input/cs-506-predicting-customer-churn-using-knn/train.csv',
    '/kaggle/input/cs-506-predicting-customer-churn-using-knn/test.csv',
)

In [None]:
# Exhaustive search over k in 1..49 and both distance metrics, keeping the
# configuration with the highest mean cross-validated AUC.
best_k = None
best_distance_metric = None
mcvs = 0.0  # best mean cross-validation score seen so far
for k in range(1, 50):
    for distance_metric in ("euclidean", "manhattan"):
        knn = KNN(k=k, distance_metric=distance_metric)
        score = cross_validate(X, y, knn).mean()
        # Only a strict improvement replaces the current best.
        if not score > mcvs:
            continue
        best_k, best_distance_metric, mcvs = k, distance_metric, score
        print("NB:", score, k, distance_metric)

# Refit the winning configuration on the full training set and score the test set.
knn = KNN(k=best_k, distance_metric=best_distance_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Write the submission: ids are re-read from the test CSV and paired with the
# predictions by row position.
pd.DataFrame(
    {
        'id': pd.read_csv('/kaggle/input/cs-506-predicting-customer-churn-using-knn/test.csv')['id'],
        'Exited': test_predictions,
    }
).to_csv('submissions.csv', index=False)