In [63]:
import numpy as np
import pandas as pd

In [64]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        if distance_metric not in ['euclidean', 'manhattan']:
            raise ValueError(f"Unsupported distance metric: {distance_metric}")
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict_proba(self, X):
        probas = []
        for x in X:
            distances = self.compute_distance(x, self.X_train)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            proba = np.mean(k_nearest_labels)
            probas.append(proba)
        return np.array(probas)

    def predict(self, X):
        return (self.predict_proba(X) > 0.5).astype(int)

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2)**2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        

In [65]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Combine train and test data for preprocessing
    all_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

    # Handle categorical variables
    categorical_columns = ['Geography', 'Gender']
    all_data = pd.get_dummies(all_data, columns=categorical_columns, drop_first=True)

    # Drop unnecessary columns
    columns_to_drop = ['id', 'CustomerId', 'Surname']
    all_data = all_data.drop(columns=columns_to_drop)

    # Scale numerical features
    numerical_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    all_data[numerical_columns] = (all_data[numerical_columns] - all_data[numerical_columns].mean()) / all_data[numerical_columns].std()

    # Split back into train and test
    X_train = all_data[:len(train_data)].drop('Exited', axis=1)
    y_train = train_data['Exited']
    X_test = all_data[len(train_data):].drop('Exited', axis=1)

    return X_train.values, y_train.values, X_test.values

In [66]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    np.random.seed(42)
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    
    fold_size = len(X) // n_splits
    scores = []
    
    for i in range(n_splits):
        start = i * fold_size
        end = (i + 1) * fold_size if i < n_splits - 1 else len(X)
        val_indices = indices[start:end]
        train_indices = np.concatenate([indices[:start], indices[end:]])
        
        X_train, X_val = X[train_indices], X[val_indices]
        y_train, y_val = y[train_indices], y[val_indices]
        
        knn.fit(X_train, y_train)
        y_pred_proba = knn.predict_proba(X_val)
        
        score = roc_auc_score(y_val, y_pred_proba)
        scores.append(score)
    
    return np.mean(scores)


# Compute ROC AUC scores
def roc_auc_score(y_true, y_score):
    # Simple implementation of ROC AUC score
    positive = y_score[y_true == 1]
    negative = y_score[y_true == 0]
    
    n_pos = len(positive)
    n_neg = len(negative)
    
    if n_pos == 0 or n_neg == 0:
        return 0.5  # Default score when there's only one class
    
    auc_score = 0
    for pos in positive:
        auc_score += np.sum(pos > negative)
    
    auc_score /= (n_pos * n_neg)
    return auc_score





In [67]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# TODO: hyperparamters tuning
k_values = [3, 5, 7, 9, 11]
distance_metrics = ['euclidean', 'manhattan']

best_score = 0
best_k = 0
best_metric = ''

for k in k_values:
    for metric in distance_metrics:
        knn = KNN(k=k, distance_metric=metric)
        score = cross_validate(X, y, knn)
        
        print(f"k={k}, metric={metric}, score={score}")
        
        if score > best_score:
            best_score = score
            best_k = k
            best_metric = metric

print(f"Best hyperparameters: k={best_k}, distance_metric={best_metric}")
print(f"Best cross-validation score: {best_score}")

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
test_predictions = knn.predict_proba(X_test)

# Save test predictions
submission = pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions})
submission.to_csv('submissions.csv', index=False)

print("Predictions saved to 'submissions.csv'")

k=3, metric=euclidean, score=0.7653154647307125
k=3, metric=manhattan, score=0.7628247399743523
k=5, metric=euclidean, score=0.8213106370931668
k=5, metric=manhattan, score=0.8232270895109549
k=7, metric=euclidean, score=0.8489785657062546
k=7, metric=manhattan, score=0.8457478343180359
k=9, metric=euclidean, score=0.8646901679878567
k=9, metric=manhattan, score=0.8611632490505434
k=11, metric=euclidean, score=0.876304203322599
k=11, metric=manhattan, score=0.8713427404691391
Best hyperparameters: k=11, distance_metric=euclidean
Best cross-validation score: 0.876304203322599
Predictions saved to 'submissions.csv'
