In [74]:
import numpy as np
import pandas as pd

In [75]:
class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Fitting only memorises the training data. Prediction measures the distance
    from each query row to every stored training row and returns the most
    common label among the ``k`` closest ones.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that vote on each prediction. If fewer than ``k``
        training samples are stored, all of them vote.
    distance_metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Distance used to rank neighbours. The name is only validated when a
        distance is actually computed.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Store the training samples ``X`` and their labels ``y``.

        Both inputs go through ``np.asarray`` so that lists, DataFrames and
        Series behave like arrays: neighbours are then always looked up by
        position, never by a pandas index label or column name.
        """
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like
            Query samples, one per entry along the first axis.

        Returns
        -------
        numpy.ndarray
            One predicted label per query sample.

        Raises
        ------
        ValueError
            If ``distance_metric`` is not recognised or no training samples
            are stored.
        """
        X = np.asarray(X)
        predictions = []
        for i in range(X.shape[0]):
            distances = self._distances_to_train(X[i])
            # A stable sort makes tie-breaking deterministic: equidistant
            # neighbours are taken in training order.
            k_indices = np.argsort(distances, kind='stable')[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            # np.unique returns the labels sorted, so a tied vote goes to the
            # smallest label.
            unique, counts = np.unique(k_nearest_labels, return_counts=True)
            predictions.append(unique[counts.argmax()])
        return np.array(predictions)

    def compute_distance(self, X1, X2):
        """Return the distance between two samples under ``distance_metric``.

        Raises
        ------
        ValueError
            If ``distance_metric`` is neither 'euclidean' nor 'manhattan'.
        """
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2))
        else:
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")

    def _distances_to_train(self, x):
        """Return the distance from one query sample to every training sample.

        Vectorised equivalent of calling ``compute_distance`` once per
        training row.
        """
        if self.distance_metric not in ('euclidean', 'manhattan'):
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")
        n_train = len(self.X_train)
        if n_train == 0:
            raise ValueError("KNN needs at least one training sample to predict")
        # One row per training sample, whatever the per-sample shape is, so the
        # reduction below always runs over all features of a sample.
        diffs = (self.X_train - x).reshape(n_train, -1)
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(diffs ** 2, axis=1))
        return np.sum(np.abs(diffs), axis=1)

In [76]:
def preprocess_data(train_path, test_path):
    """Load the train/test CSVs and turn them into standardised feature arrays.

    Steps: impute missing values, integer-encode ``Gender`` and ``Geography``,
    drop the identifier columns, and standardise every remaining feature.
    All statistics (imputation values, mean, standard deviation) are learned
    from the training set and then applied unchanged to the test set, so both
    sets live in the same feature space.

    Parameters
    ----------
    train_path : str or path-like
        CSV with the features plus ``Exited``, ``CustomerId`` and ``Surname``.
    test_path : str or path-like
        CSV with the same feature columns plus ``CustomerId`` and ``Surname``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, y_train, X_test)``; the two feature arrays share column
        order.

    Raises
    ------
    KeyError
        If an expected column is missing, including a training feature that
        is absent from the test file.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Handle missing values (mean for numerical, mode for categorical).
    # Training statistics take precedence; the test set's own statistics are
    # only a fallback for columns the training set cannot supply a value for.
    train_fill = _imputation_values(train_data)
    test_fill = {**_imputation_values(test_data), **train_fill}
    # fillna returns new frames: no chained in-place writes, which pandas
    # deprecates and Copy-on-Write turns into silent no-ops.
    train_data = train_data.fillna(train_fill)
    test_data = test_data.fillna(test_fill)

    # Convert categorical variables into numerical form (manual encoding).
    # NOTE: Series.map yields NaN for any category missing from these dicts.
    gender_mapping = {'Male': 0, 'Female': 1}
    train_data['Gender'] = train_data['Gender'].map(gender_mapping)
    test_data['Gender'] = test_data['Gender'].map(gender_mapping)

    geography_mapping = {'France': 0, 'Spain': 1, 'Germany': 2}
    train_data['Geography'] = train_data['Geography'].map(geography_mapping)
    test_data['Geography'] = test_data['Geography'].map(geography_mapping)

    # Select relevant columns
    X_train = train_data.drop(['Exited', 'CustomerId', 'Surname'], axis=1)
    y_train = train_data['Exited']
    X_test = test_data.drop(['CustomerId', 'Surname'], axis=1)
    # Same features in the same order as training; fails loudly if one is missing.
    X_test = X_test[X_train.columns]

    # Manually scale features (standardize them) with training statistics only.
    mean = X_train.mean()
    std = X_train.std()
    # A constant (or single-row) column has zero/NaN spread; dividing by 1
    # keeps it at 0 instead of producing NaN or inf.
    std = std.where(std > 0, 1.0)
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std

    return X_train.values, y_train.values, X_test.values


def _imputation_values(frame):
    """Map each column of ``frame`` to its fill value: mean if numeric, else mode.

    Columns for which no value can be computed (e.g. entirely missing) are
    left out of the result.
    """
    values = {}
    for col in frame.columns:
        if pd.api.types.is_numeric_dtype(frame[col]):
            value = frame[col].mean()
        else:
            modes = frame[col].mode()
            value = modes.iloc[0] if not modes.empty else None
        if pd.notna(value):
            values[col] = value
    return values

In [77]:
def cross_validate(X, y, knn, n_splits=5):
    """Estimate a model's accuracy with contiguous k-fold cross-validation.

    The data is cut, in its existing order, into ``n_splits`` consecutive
    folds. Each fold serves once as the validation set while the model is
    refit on the remaining samples. When ``len(X)`` is not a multiple of
    ``n_splits`` the first ``len(X) % n_splits`` folds get one extra sample,
    so every sample is validated exactly once.

    Parameters
    ----------
    X, y : array-like supporting slicing
        Features and labels, aligned along the first axis.
    knn : object
        Model exposing ``fit(X, y)`` and ``predict(X)``. It is refit on every
        fold and left fitted on the last training split.
    n_splits : int, default 5
        Number of folds; must satisfy ``2 <= n_splits <= len(X)``.

    Returns
    -------
    float
        Mean of the per-fold accuracies (fraction of matching predictions).

    Raises
    ------
    ValueError
        If ``n_splits`` would produce an empty training or validation fold.
    """
    n_samples = len(X)
    if not 2 <= n_splits <= n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of samples "
            f"({n_samples}), got {n_splits}"
        )

    base_size, remainder = divmod(n_samples, n_splits)
    fold_accuracies = []
    start = 0

    for fold in range(n_splits):
        # The first `remainder` folds absorb the samples left over by the
        # integer division.
        stop = start + base_size + (1 if fold < remainder else 0)

        X_val = X[start:stop]
        y_val = y[start:stop]

        X_train = np.concatenate([X[:start], X[stop:]], axis=0)
        y_train = np.concatenate([y[:start], y[stop:]], axis=0)

        # Fit on everything outside the fold, score on the fold itself
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)

        fold_accuracies.append(np.mean(y_pred == y_val))
        start = stop

    return np.mean(fold_accuracies)

In [None]:
def hyperparameter_tuning(X, y, k_values=None, distance_metrics=None):
    """Grid-search ``k`` and the distance metric for ``KNN`` via cross-validation.

    Every (k, metric) combination is scored with ``cross_validate``; the score
    of each combination and the final winner are printed.

    Parameters
    ----------
    X, y : array-like
        Training features and labels, passed through to ``cross_validate``.
    k_values : iterable of int, optional
        Neighbour counts to try. Defaults to ``range(3, 11)``.
    distance_metrics : iterable of str, optional
        Metric names to try. Defaults to ``['euclidean', 'manhattan']``.

    Returns
    -------
    tuple
        ``(best_k, best_distance_metric)``. On equal scores the combination
        evaluated first wins. Both are ``None`` only if the grid is empty.
    """
    if k_values is None:
        k_values = range(3, 11)
    if distance_metrics is None:
        distance_metrics = ['euclidean', 'manhattan']
    # Materialise so a one-shot iterator is not exhausted after the first k.
    distance_metrics = list(distance_metrics)

    best_k = None
    best_distance_metric = None
    # Start below any attainable score so that a grid whose scores are all 0
    # still selects a configuration instead of returning (None, None).
    best_score = float('-inf')

    for k in k_values:
        for distance_metric in distance_metrics:
            knn = KNN(k=k, distance_metric=distance_metric)
            score = cross_validate(X, y, knn)

            print(f'k = {k}, distance_metric = {distance_metric}, score = {score}')

            if score > best_score:
                best_score = score
                best_k = k
                best_distance_metric = distance_metric

    print(f'Best k: {best_k}, Best distance_metric: {best_distance_metric}, Best score: {best_score}')
    return best_k, best_distance_metric

# --- Data ---------------------------------------------------------------
# Training features, training labels and test features as returned by
# preprocess_data.
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# --- Model selection ----------------------------------------------------
# Pick k and the distance metric with hyperparameter_tuning.
best_k, best_distance_metric = hyperparameter_tuning(X, y)

# --- Final model --------------------------------------------------------
# Build the classifier with the selected configuration, fit it on the whole
# training set, then label the test set.
knn = KNN(k=best_k, distance_metric=best_distance_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# --- Submission ---------------------------------------------------------
# X_test no longer carries CustomerId, so it is re-read from the raw test file.
submission_ids = pd.read_csv('test.csv')['CustomerId']
submission = pd.DataFrame({'id': submission_ids, 'Exited': test_predictions})
submission.to_csv('submissions.csv', index=False)

k = 3, distance_metric = euclidean, score = 0.8645333333333334
k = 3, distance_metric = manhattan, score = 0.8652000000000001
k = 4, distance_metric = euclidean, score = 0.8664
k = 4, distance_metric = manhattan, score = 0.8664
k = 5, distance_metric = euclidean, score = 0.8721333333333332
k = 5, distance_metric = manhattan, score = 0.8718666666666668
k = 6, distance_metric = euclidean, score = 0.8694666666666666
k = 6, distance_metric = manhattan, score = 0.8716000000000002
k = 7, distance_metric = euclidean, score = 0.8728666666666666
k = 7, distance_metric = manhattan, score = 0.8771333333333334
k = 8, distance_metric = euclidean, score = 0.8721333333333334
k = 8, distance_metric = manhattan, score = 0.8750666666666665
k = 9, distance_metric = euclidean, score = 0.8748666666666667
k = 9, distance_metric = manhattan, score = 0.8788
k = 10, distance_metric = euclidean, score = 0.8733333333333333
k = 10, distance_metric = manhattan, score = 0.8755333333333333
Best k: 9, Best distance_m