In [3]:
import numpy as np
import pandas as pd

In [14]:
def preprocess_data(train_path, test_path):
    # Load the datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Drop non-informative columns and categorical columns (cannot convert to numerical)
    train_data.drop(['Surname', 'CustomerId'], axis=1, inplace=True)
    test_data.drop(['Surname', 'CustomerId'], axis=1, inplace=True)

    # Convert 'Gender' column to numeric values (0 for Female, 1 for Male)
    train_data['Gender'] = train_data['Gender'].map({'Female': 0, 'Male': 1})
    test_data['Gender'] = test_data['Gender'].map({'Female': 0, 'Male': 1})

    # Convert 'Geography' column to numeric using One-Hot Encoding
    train_data = pd.get_dummies(train_data, columns=['Geography'], drop_first=True)
    test_data = pd.get_dummies(test_data, columns=['Geography'], drop_first=True)

    # Handle missing values in the dataset by filling with median
    train_data.fillna(train_data.median(), inplace=True)
    test_data.fillna(test_data.median(), inplace=True)

    # Ensure that both train and test have the same columns after one-hot encoding
    test_data = test_data.reindex(columns=train_data.columns.drop('Exited'), fill_value=0)

    # Standardize the numerical columns manually
    numerical_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

    # Compute the mean and std from the training set
    means = train_data[numerical_columns].mean()
    stds = train_data[numerical_columns].std()

    # Apply the scaling to both train and test sets
    train_data[numerical_columns] = (train_data[numerical_columns] - means) / stds
    test_data[numerical_columns] = (test_data[numerical_columns] - means) / stds

    # Extract the target variable from the training data
    X_train = train_data.drop('Exited', axis=1).values
    y_train = train_data['Exited'].values
    X_test = test_data.values

    return X_train, y_train, X_test

In [7]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)

    def predict(self, X):
        predictions = []

        for i in range(len(X)):
            # Compute the distance between the current sample and all training samples
            distances = self.compute_distance(self.X_train, X[i])

            # Get the indices of the k-nearest neighbors
            k_neighbors_indices = np.argsort(distances)[:self.k]

            # Get the labels of the k-nearest neighbors
            k_neighbor_labels = self.y_train[k_neighbors_indices]

            # Perform majority voting
            unique, counts = np.unique(k_neighbor_labels, return_counts=True)
            majority_label = unique[np.argmax(counts)]

            predictions.append(majority_label)

        return np.array(predictions)


In [8]:
import numpy as np

def evaluate_model(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Calculate True Positives, False Positives, True Negatives, False Negatives
    TP = np.sum((y_true == 1) & (y_pred == 1))
    TN = np.sum((y_true == 0) & (y_pred == 0))
    FP = np.sum((y_true == 0) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == 0))

    accuracy = (TP + TN) / len(y_true)

    precision = TP / (TP + FP) if (TP + FP) != 0 else 0

    recall = TP / (TP + FN) if (TP + FN) != 0 else 0

    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    # AUC-ROC: Simplified as a comparison between actual positive and predicted positive rates (binary classification)
    pos_label = np.argsort(y_true)
    pos_prob = np.argsort(y_pred)
    auc = np.mean([1 if pred >= actual else 0 for pred, actual in zip(pos_prob, pos_label)])

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC-ROC: {auc:.4f}")

    return accuracy, precision, recall, f1, auc

In [9]:
def cross_validate(X, y, knn, n_splits=5):
    # Split the data into n_splits folds
    fold_size = len(X) // n_splits
    eval_metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'auc': []}

    for i in range(n_splits):
        # Define start and end indices of the validation set for the current fold
        start_idx = i * fold_size
        end_idx = start_idx + fold_size if i < n_splits - 1 else len(X)

        # Validation data
        X_val = X[start_idx:end_idx]
        y_val = y[start_idx:end_idx]

        X_train = np.concatenate((X[:start_idx], X[end_idx:]), axis=0)
        y_train = np.concatenate((y[:start_idx], y[end_idx:]), axis=0)

        knn.fit(X_train, y_train)

        y_val_pred = knn.predict(X_val)

        # Evaluate the metrics for this fold
        accuracy, precision, recall, f1, auc = evaluate_model(y_val, y_val_pred)

        # Store the results
        eval_metrics['accuracy'].append(accuracy)
        eval_metrics['precision'].append(precision)
        eval_metrics['recall'].append(recall)
        eval_metrics['f1'].append(f1)
        eval_metrics['auc'].append(auc)

    # Calculate average metrics across all folds
    print("\nAverage metrics across all folds:")
    for metric, values in eval_metrics.items():
        print(f"{metric.capitalize()}: {np.mean(values):.4f}")

    return eval_metrics

In [1]:
from google.colab import files
uploaded = files.upload()

Saving test.csv to test.csv
Saving train.csv to train.csv


In [19]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
knn = KNN(k=3, distance_metric='manhattan')

In [20]:
# Perform cross-validation
cross_validate(X, y, knn)

Accuracy: 0.7947
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
AUC-ROC: 0.5723
Accuracy: 0.6847
Precision: 0.2975
Recall: 0.4555
F1 Score: 0.3599
AUC-ROC: 0.5617
Accuracy: 0.7993
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
AUC-ROC: 0.5780
Accuracy: 0.7847
Precision: 0.4305
Recall: 0.2152
F1 Score: 0.2870
AUC-ROC: 0.5653
Accuracy: 0.8137
Precision: 0.8269
Recall: 0.1372
F1 Score: 0.2353
AUC-ROC: 0.6033

Average metrics across all folds:
Accuracy: 0.7754
Precision: 0.3110
Recall: 0.1616
F1: 0.1764
Auc: 0.5761


{'accuracy': [0.7946666666666666,
  0.6846666666666666,
  0.7993333333333333,
  0.7846666666666666,
  0.8136666666666666],
 'precision': [0,
  0.2975391498881432,
  0,
  0.4304635761589404,
  0.8269230769230769],
 'recall': [0.0,
  0.4554794520547945,
  0.0,
  0.2152317880794702,
  0.1371610845295056],
 'f1': [0, 0.35994587280108253, 0, 0.28697571743929357, 0.23529411764705885],
 'auc': [0.5723333333333334,
  0.5616666666666666,
  0.578,
  0.5653333333333334,
  0.6033333333333334]}

In [23]:
# Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=6, distance_metric='manhattan')
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

In [24]:
files.download('submissions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>