In [62]:
import numpy as np
import pandas as pd

In [63]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        # Store the training data
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        # TODO: Implement the predict method
        return np.array([self._predict(x) for x in X])

    # helper 
    def _predict(self, dist):
        distances = self.compute_distances(self.X_train, dist)
        k_indices = np.argsort(distances)[:self.k]
        k_labels = self.y_train[k_indices]
        most_common = np.mean(k_labels == 1)
        return most_common

    def compute_distances(self, X1, X2):
        if self.distance_metric == 'euclidean':
            distances = np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")
        return distances


In [64]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.
    # pass

    # Drop the unwanted columns
    test_data = test_data.drop(columns=['CustomerId', 'Surname'])
    train_data = train_data.drop(columns=['CustomerId', 'Surname'])

    # Convert categorical variables to dummies
    train_data = pd.get_dummies(train_data, columns=['Geography', 'Gender'], drop_first=True)
    test_data = pd.get_dummies(test_data, columns=['Geography', 'Gender'], drop_first=True)

    for col in train_data.columns:
        if col not in test_data.columns:
            test_data[col] = 0
    
    test_data = test_data[train_data.columns.drop('Exited')]

    # Separate features and labels
    y = train_data['Exited']
    X = train_data.drop(columns=['Exited'])
    X_test = test_data

    # Feature normalization
    X_train_mean = X.mean()
    X_train_st_dev = X.std()

    X = (X - X_train_mean) / X_train_st_dev
    X_test = (X_test - X_train_mean) / X_train_st_dev  # Use train mean and std for test data

    return X.values, y.values, X_test.values

In [65]:
def compute_roc_auc(y_true, y_pred):

    sorted = np.argsort(y_pred)[::-1]
    y_true = y_true[sorted]

    # Compute the true positive and true negative rate. 
    cumpos = np.cumsum(y_true)
    cumneg = np.cumsum(1 - y_true)
    total_pos = np.sum(y_true)
    total_neg = len(y_true) - total_pos
    tpr = cumpos / total_pos
    fpr = cumneg / total_neg

    # Compute ROC AUC
    return np.trapz(tpr, fpr)

#   Define cross validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    fold_size = len(X) // n_splits
    indices = np.arange(len(X))
    np.random.shuffle(indices)

    roc_auc_scores = []

    for fold in range(n_splits):
        # Splitd the data into Training and Validation using fold size
        X_val = X[fold * fold_size:(fold + 1) * fold_size]
        y_val = y[fold * fold_size:(fold + 1) * fold_size]

        X_train = np.concatenate([X[:fold * fold_size], X[(fold + 1) * fold_size:]])
        y_train = np.concatenate([y[:fold * fold_size], y[(fold + 1) * fold_size:]])

        # Fit the Model
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)

        # Calculate the roc auc
        ROC_AUC = compute_roc_auc(y_val, y_pred)
        roc_auc_scores.append(ROC_AUC)

    # Return avg score.
    return np.mean(roc_auc_scores)

In [66]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning
# Tune hyperparameters
# Try k-values from 1 to 20 and both euclidean and manhattan distance metrics
k_values = range(1, 13)

best_k = 1
distance_metric = 'euclidean'
best_score = 0

for k in k_values:
    knn = KNN(k=k, distance_metric=distance_metric)
    score = cross_validate(X, y, knn)
    print(f"k={k} with a ROC-AUC score of {score}")

    # Find the best hyperparameters
    if score > best_score:
        best_k = k
        best_score = score

print(f"Best hyperparameters with score {best_score:.4f}: k={best_k}")

knn = KNN(k=best_k, distance_metric=distance_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)


  return np.trapz(tpr, fpr)


Cross-validation scores: 0.8678493268229162
k=1 with a ROC-AUC score of 0.7485188253573817
k=2 with a ROC-AUC score of 0.8132938901409886
k=3 with a ROC-AUC score of 0.8403929756025121
k=4 with a ROC-AUC score of 0.8574542771727298
k=5 with a ROC-AUC score of 0.8678493268229162
k=6 with a ROC-AUC score of 0.8737839376532784
k=7 with a ROC-AUC score of 0.8799767258676944
k=8 with a ROC-AUC score of 0.8841370752574136
k=9 with a ROC-AUC score of 0.8871678055958888
k=10 with a ROC-AUC score of 0.8888738442119472
k=11 with a ROC-AUC score of 0.890509932148943
k=12 with a ROC-AUC score of 0.8914064037946178
Best hyperparameters with score 0.8914: k=12
