In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score

In [14]:
import numpy as np
from collections import Counter

class KNN:
    """Brute-force k-nearest-neighbours model that scores by averaging labels.

    ``predict`` returns, for every query row, the mean of the labels of its
    ``k`` closest training rows. With 0/1 labels that mean is the fraction of
    positive neighbours, so it can be used directly as a ranking score.
    """

    # Cap on the number of elements in the temporary
    # (batch_rows, n_train, n_features) difference array built while computing
    # distances. Without a cap the full (n_test, n_train, n_features) tensor is
    # materialised at once, which quickly reaches gigabytes.
    _MAX_DIFF_ELEMENTS = 8_000_000

    def __init__(self, k=3, distance_metric='euclidean'):
        """Configure the model.

        Args:
            k: Number of neighbours to average over; must be a positive integer.
            distance_metric: ``'euclidean'`` or ``'manhattan'``. The value is
                checked when distances are computed, not here.

        Raises:
            ValueError: If ``k`` is not a positive integer.
        """
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Store a copy of the training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like with one label per row of ``X``.

        Raises:
            ValueError: If ``X`` is not 2-D or ``X`` and ``y`` differ in length.
        """
        X_train = np.array(X, dtype=float)
        y_train = np.array(y)
        if X_train.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X_train) != len(y_train):
            raise ValueError("X and y must contain the same number of samples")
        self.X_train = X_train
        self.y_train = y_train

    def compute_distances(self, X):
        """Compute distances between query rows and all training samples.

        Args:
            X: 2-D array-like of shape (n_queries, n_features).

        Returns:
            Array of shape (n_queries, n_train) where entry ``[i, j]`` is the
            distance from query row ``i`` to training row ``j``.

        Raises:
            ValueError: If the metric is unsupported or ``X`` is not 2-D.
        """
        if self.distance_metric not in ('euclidean', 'manhattan'):
            raise ValueError("Unsupported distance metric")

        # Accept lists / DataFrames, mirroring what fit() accepts.
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")

        n_queries = X.shape[0]
        n_train, n_features = self.X_train.shape

        # Number of query rows whose difference tensor fits under the cap.
        batch_rows = max(1, self._MAX_DIFF_ELEMENTS // max(1, n_train * n_features))

        distances = np.empty((n_queries, n_train), dtype=float)
        for start in range(0, n_queries, batch_rows):
            stop = start + batch_rows
            # (rows, 1, features) - (n_train, features) -> (rows, n_train, features)
            diff = X[start:stop, np.newaxis, :] - self.X_train
            if self.distance_metric == 'euclidean':
                distances[start:stop] = np.sqrt((diff ** 2).sum(axis=2))
            else:
                distances[start:stop] = np.abs(diff).sum(axis=2)
        return distances

    def predict(self, X):
        """Predict a score for each query row.

        Args:
            X: 2-D array-like of shape (n_queries, n_features).

        Returns:
            1-D array holding, per query row, the mean label of its ``k``
            nearest training samples. If ``k`` exceeds the number of training
            samples, all training samples are averaged.
        """
        distances = self.compute_distances(X)
        # Sort each row by distance and keep the k closest training indices.
        nearest_indices = np.argsort(distances, axis=1)[:, :self.k]
        nearest_labels = self.y_train[nearest_indices]
        return np.mean(nearest_labels, axis=1)


In [15]:
def preprocess_data(train_path, test_path):
    """Load the train/test CSVs and turn them into scaled feature matrices.

    The identifier columns (``id``, ``CustomerId``, ``Surname``) are dropped,
    ``Geography`` and ``Gender`` are label-encoded, and every feature is
    standardised. Encoders and the scaler are fitted on the training frame
    only and then applied to the test frame.

    Returns:
        A tuple ``(X_train_scaled, y_train, X_test_scaled, test_ids)`` where
        ``y_train`` is the ``Exited`` column of the training file and
        ``test_ids`` is the ``id`` column of the test file.
    """
    train_frame = pd.read_csv(train_path)
    test_frame = pd.read_csv(test_path)

    identifier_columns = ['id', 'CustomerId', 'Surname']
    target = train_frame['Exited']
    train_features = train_frame.drop(identifier_columns + ['Exited'], axis=1)
    test_features = test_frame.drop(identifier_columns, axis=1)

    # One encoder per categorical column: learn the mapping from the training
    # values, then reuse that same mapping for the test values.
    for column in ('Geography', 'Gender'):
        encoder = LabelEncoder()
        train_features[column] = encoder.fit_transform(train_features[column])
        test_features[column] = encoder.transform(test_features[column])

    # KNN is distance based, so put all features on a comparable scale.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_features)
    test_scaled = scaler.transform(test_features)

    return train_scaled, target, test_scaled, test_frame['id']

In [16]:
def cross_validate(X, y, knn, n_splits=5):
    """Estimate ROC AUC with unshuffled, contiguous k-fold cross-validation.

    Every sample lands in exactly one validation fold: when ``len(X)`` is not
    divisible by ``n_splits`` the first ``len(X) % n_splits`` folds get one
    extra sample, rather than the remainder being left out of validation.

    Args:
        X: 2-D array-like of features, one row per sample.
        y: 1-D array-like of labels aligned with ``X``.
        knn: Model exposing ``fit(X, y)`` and ``predict(X)``. It is refitted
            on each fold, so it is left fitted on the last fold's training
            data when this function returns.
        n_splits: Number of folds; must be between 2 and ``len(X)``.

    Returns:
        ``(mean_score, fold_scores)`` - the mean ROC AUC and the list of
        per-fold ROC AUC values.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length or ``n_splits`` is out
            of range.
    """
    # Positional indexing below must not depend on pandas index labels.
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError("X and y must contain the same number of samples")
    if not 2 <= n_splits <= n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of samples ({n_samples})"
        )

    roc_auc_scores = []
    for val_idx in np.array_split(np.arange(n_samples), n_splits):
        # Everything outside the current validation fold is training data.
        train_mask = np.ones(n_samples, dtype=bool)
        train_mask[val_idx] = False

        knn.fit(X[train_mask], y[train_mask])
        y_prob = knn.predict(X[val_idx])

        roc_auc_scores.append(roc_auc_score(y[val_idx], y_prob))

    return np.mean(roc_auc_scores), roc_auc_scores

# Build the scaled feature matrices, the training labels and the test ids from the CSV files
X, y, X_test, test_ids = preprocess_data(train_path='train.csv', test_path='test.csv')

# Baseline model: 5 nearest neighbours under Euclidean distance
knn = KNN(5, 'euclidean')

# Cross-validate the baseline; the result is a (mean AUC, per-fold AUCs) tuple
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# Hyperparameter tuning: compare distance metrics at K=7 (a wider sweep of K was too slow to run)
def tune_hyperparameters(X, y, k_values=(7,), metrics=('euclidean', 'manhattan'), n_splits=5):
    """Grid-search K and the distance metric by cross-validated ROC AUC.

    With the defaults only K=7 is evaluated for each metric, which reproduces
    the original run (a wider sweep of K was too slow). Pass ``k_values`` to
    search over more neighbour counts.

    Args:
        X: Feature matrix passed through to ``cross_validate``.
        y: Labels passed through to ``cross_validate``.
        k_values: Iterable of neighbour counts to try.
        metrics: Iterable of distance metric names to try.
        n_splits: Number of cross-validation folds per combination.

    Returns:
        ``(best_k, best_score, best_metric)`` for the combination with the
        highest mean AUC; ties keep the first combination evaluated. If no
        combination is evaluated the placeholders ``(1, -inf, None)`` are
        returned.
    """
    # Materialise once so a generator argument is not exhausted after the
    # first metric.
    k_values = tuple(k_values)
    metrics = tuple(metrics)

    best_k = 1
    # Start below any reachable score so the first evaluated combination is
    # always recorded, even when its AUC is 0.
    best_score = float('-inf')
    best_metric = None

    for metric in metrics:
        print(f"Starting tuning with {metric} distance metric...")

        for k in k_values:
            # Fresh model for the current (K, metric) combination.
            knn = KNN(k=k, distance_metric=metric)

            mean_auc, _ = cross_validate(X, y, knn, n_splits=n_splits)

            print(f"K={k} with {metric} distance: AUC = {mean_auc:.4f}")

            # Keep the combination with the highest mean AUC seen so far.
            if mean_auc > best_score:
                best_score = mean_auc
                best_k = k
                best_metric = metric

    return best_k, best_score, best_metric


# Pick the best (K, metric) pair by cross-validated AUC
best_k, best_score, best_metric = tune_hyperparameters(X, y)
print(f"Best K: {best_k}, Best AUC: {best_score}, Best Metric: {best_metric}")

# Refit on the full training set with the selected hyperparameters and score the test set
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions. Reuse the ids already returned by preprocess_data
# instead of reading test.csv a second time.
submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
submission.to_csv('submissions.csv', index=False)

Cross-validation scores: (0.8742747202408111, [0.8748263586245969, 0.881579283089903, 0.8745085190039318, 0.8734393829671969, 0.8670200575184273])
Starting tuning with euclidean distance metric...
K=7 with euclidean distance: AUC = 0.8830
Starting tuning with manhattan distance metric...
K=7 with manhattan distance: AUC = 0.8861
Best K: 7, Best AUC: 0.8860520528225747, Best Metric: manhattan
