In [87]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score


In [88]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = np.array(y)

    def predict(self, X):
        predictions = []
        for x in X:
            distances = self.compute_distance(self.X_train, x)
            k_indices = distances.argsort()[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            unique, counts = np.unique(k_nearest_labels, return_counts=True)
            predictions.append(unique[counts.argmax()])
    
        return np.array(predictions)

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2)**2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")

In [89]:

def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    train_data = train_data.drop(['CustomerId', 'Surname'], axis=1)
    test_data = test_data.drop(['CustomerId', 'Surname'], axis=1)

    X_train = train_data.drop('Exited', axis=1)
    y_train = train_data['Exited']
    
    X_test = test_data.copy()

    categorical_cols = ['Geography', 'Gender']
    numerical_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(drop='first'), categorical_cols)
        ]
    )

    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    print("Shape of X_train_preprocessed:", X_train_preprocessed.shape)
    print("Shape of y_train:", y_train.shape)
    print("Shape of X_test_preprocessed:", X_test_preprocessed.shape)

    return X_train_preprocessed, X_test_preprocessed, y_train


In [90]:

def cross_validate(X, y, knn, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits)
    auc_scores = []

    for train_index, val_index in skf.split(X, y):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        knn.fit(X_train, y_train)
        
        y_pred = knn.predict(X_val)
        
        auc = roc_auc_score(y_val, y_pred)
        auc_scores.append(auc)

    return np.mean(auc_scores), auc_scores


In [91]:
X_train_preprocessed, X_test_preprocessed, y_train = preprocess_data('train.csv', 'test.csv')

# Create the KNN model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation on the preprocessed training data and labels
cv_scores, auc_scores = cross_validate(X_train_preprocessed, y_train, knn)

# Print cross-validation results
print("Cross-validation AUC scores:", auc_scores)
print("Mean cross-validation AUC:", cv_scores)

# Train the model on the full dataset and make predictions on the test set
knn.fit(X_train_preprocessed, y_train)
test_predictions = knn.predict(X_test_preprocessed)

# Save the test predictions
submission = pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions})
submission.to_csv('submissions.csv', index=False)

Shape of X_train_preprocessed: (15000, 11)
Shape of y_train: (15000,)
Shape of X_test_preprocessed: (10000, 11)
Cross-validation AUC scores: [np.float64(0.7774717321356196), np.float64(0.7678002762682283), np.float64(0.775059533193671), np.float64(0.7850646896391246), np.float64(0.7605619355189593)]
Mean cross-validation AUC: 0.7731916333511206
