In [None]:
import numpy as np
import pandas as pd

In [None]:
# Colab-only helper: mount Google Drive so that files under
# '/content/drive/...' (the train/test CSVs read later) are visible to this
# runtime. NOTE(review): this import only resolves inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np

class KNN(BaseEstimator, ClassifierMixin):
    """Brute-force k-nearest-neighbours classifier with a scikit-learn style API.

    Every query row is compared against all stored training rows; the ``k``
    closest rows vote for their class.

    Parameters
    ----------
    k : int, default=3
        Number of neighbours that take part in the vote. Must be >= 1.
    distance_metric : {'euclidean', 'manhattan'}, default='euclidean'
        Metric used by :meth:`compute_distance`.

    Attributes (set by ``fit``)
    ---------------------------
    X_train : ndarray
        Dense copy/view of the training features.
    y_train : ndarray
        Training labels as given (converted to a NumPy array).
    classes_ : ndarray
        Sorted unique labels; columns of ``predict_proba`` follow this order.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    @staticmethod
    def _to_dense(X):
        """Return ``X`` as a NumPy array, densifying SciPy sparse input."""
        # Sparse matrices expose ``toarray``; the distance code below relies on
        # plain NumPy broadcasting, so convert once up front.
        if hasattr(X, 'toarray'):
            X = X.toarray()
        return np.asarray(X)

    def fit(self, X, y):
        """Memorise the training set.

        Raises
        ------
        ValueError
            If ``k`` is not a positive integer, if ``X`` and ``y`` have a
            different number of rows, or if the training set is empty.
        """
        if (isinstance(self.k, bool)
                or not isinstance(self.k, (int, np.integer))
                or self.k < 1):
            raise ValueError(f"k must be a positive integer, got {self.k!r}")

        X = self._to_dense(X)
        # np.asarray strips any pandas index so that neighbour positions can be
        # used for positional look-ups later on.
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        if X.shape[0] == 0:
            raise ValueError("Cannot fit KNN on an empty training set")

        self.X_train = X
        self.y_train = y
        # classes_ is sorted; ``encoded`` maps every training label to its
        # position in classes_, which lets us vote with np.bincount for any
        # label type (negative ints, strings, ...).
        self.classes_, encoded = np.unique(y, return_inverse=True)
        self._y_encoded = encoded.reshape(-1)
        return self

    def _neighbor_votes(self, X):
        """Count, per query row, how many of its k nearest neighbours fall in each class.

        Returns an integer array of shape ``(n_queries, n_classes)`` whose
        columns follow ``self.classes_``.
        """
        X = self._to_dense(X)
        n_classes = len(self.classes_)
        votes = np.zeros((len(X), n_classes), dtype=np.int64)
        for row, x in enumerate(X):
            distances = self.compute_distance(self.X_train, x)
            nearest = np.argsort(distances)[:self.k]
            votes[row] = np.bincount(self._y_encoded[nearest], minlength=n_classes)
        return votes

    def predict(self, X):
        """Return the majority class among the k nearest neighbours of each row.

        Ties are resolved in favour of the class that comes first in
        ``classes_`` (i.e. the smallest label).
        """
        return self.classes_[self._neighbor_votes(X).argmax(axis=1)]

    def predict_proba(self, X):
        """Return neighbour-vote fractions, one column per entry of ``classes_``.

        For binary 0/1 labels this is ``[P(class 0), P(class 1)]`` per row.
        """
        votes = self._neighbor_votes(X)
        # Each row sums to min(k, n_train) >= 1 thanks to the checks in fit().
        return votes / votes.sum(axis=1, keepdims=True)

    def _predict_single(self, x):
        """Predict the class label for a single feature vector ``x``."""
        return self.predict(self._to_dense(x).reshape(1, -1))[0]

    def compute_distance(self, X1, X2):
        """Distance from every row of ``X1`` to the single vector ``X2``.

        Raises
        ------
        ValueError
            If ``distance_metric`` is neither 'euclidean' nor 'manhattan'.
        """
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")

    def get_params(self, deep=True):
        """Return the constructor hyperparameters as a dict (``deep`` is unused)."""
        return {'k': self.k, 'distance_metric': self.distance_metric}

    def set_params(self, **params):
        """Set each given hyperparameter as an attribute and return ``self``."""
        for param, value in params.items():
            setattr(self, param, value)
        return self


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

def preprocess_data(train_path, test_path, target_col='Exited',
                    drop_cols=('id', 'CustomerId', 'Surname')):
    """Load the train/test CSVs and turn them into dense model-ready matrices.

    Numeric columns are standardised and all remaining columns are one-hot
    encoded. The transformer is fitted on the training data only and then
    re-applied to the test data.

    Parameters
    ----------
    train_path, test_path : str or path-like
        Locations of the training and test CSV files.
    target_col : str, default='Exited'
        Name of the label column (present in the training file only).
    drop_cols : sequence of str, default=('id', 'CustomerId', 'Surname')
        Identifier-like columns removed from both files before modelling.

    Returns
    -------
    X_train_processed : ndarray
        Transformed training features.
    y_train : ndarray
        Training labels.
    X_test_processed : ndarray
        Transformed test features.
    """
    # Load the data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and target in the train data
    drop_cols = list(drop_cols)
    X_train = train_data.drop(columns=drop_cols + [target_col])
    y_train = train_data[target_col]

    # Prepare the test data (dropping irrelevant columns)
    X_test = test_data.drop(columns=drop_cols)

    # Split columns by dtype. Selecting on the generic 'number' kind (rather
    # than only float64/int64/object) guarantees every column lands in exactly
    # one transformer; ColumnTransformer would silently drop the rest.
    numerical_cols = X_train.select_dtypes(include='number').columns
    categorical_cols = X_train.select_dtypes(exclude='number').columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            # Categories that only occur in the test file are encoded as
            # all-zeros instead of raising at transform time.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ],
        # Always return a dense array: the KNN distance code uses plain NumPy
        # arithmetic and cannot consume SciPy sparse matrices.
        sparse_threshold=0,
    )

    # Fit on train only, then reuse the fitted statistics for the test set
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Return preprocessed train, test data, and the target variable
    return X_train_processed, y_train.values, X_test_processed


In [None]:
from sklearn.model_selection import cross_val_score

def cross_validate(X, y, knn, n_splits=5):
    """Score ``knn`` with cross-validated ROC AUC and report the results.

    Parameters
    ----------
    X, y : array-like
        Features and labels forwarded to ``cross_val_score``.
    knn : estimator
        Model to evaluate.
    n_splits : int, default=5
        Passed to ``cross_val_score`` as ``cv``.

    Returns
    -------
    float
        Mean of the per-fold ROC AUC scores (the per-fold values are printed).
    """
    fold_aucs = cross_val_score(knn, X, y, scoring='roc_auc', cv=n_splits)
    mean_auc = fold_aucs.mean()

    # Report the individual folds first, then the aggregate.
    print(f"Cross-validation ROC AUC scores: {fold_aucs}")
    print(f"Mean ROC AUC: {mean_auc:.4f}")

    return mean_auc


In [None]:
# Data locations (defined once so training, ids and predictions stay in sync)
TRAIN_PATH = '/content/drive/My Drive/train.csv'
TEST_PATH = '/content/drive/My Drive/test.csv'

# Load and preprocess data
X, y, X_test = preprocess_data(TRAIN_PATH, TEST_PATH)

# Create and evaluate a baseline model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation (cross_validate returns the mean ROC AUC)
cv_scores = cross_validate(X, y, knn)

print("Cross-validation score:", cv_scores)

# Hyperparameter tuning
# Seed the search with the baseline's own hyperparameters so that
# best_k / best_metric always describe the model that produced best_score.
# (Seeding best_k with 1 would train the final model with k=1 whenever no grid
# point strictly beats the k=5 baseline.)
best_k = knn.k
best_metric = knn.distance_metric
best_score = cv_scores
distance_metrics = ['euclidean', 'manhattan']


for k in range(1, 21):  # Testing k from 1 to 20
    for metric in distance_metrics:
        knn = KNN(k=k, distance_metric=metric)

        scores = cross_validate(X, y, knn)

        # Update best hyperparameters if the current mean score is better
        if scores > best_score:
            best_score = scores
            best_k = k
            best_metric = metric

        print(f'k={k}, metric={metric}, Mean ROC AUC: {scores:.4f}')

print(f"Optimal k: {best_k}, Optimal distance metric: {best_metric}, Best ROC AUC: {best_score:.4f}")

# Train on full dataset with optimal hyperparameters and make predictions on the test set
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)


# Make predictions on the test set
test_predictions = knn.predict_proba(X_test)[:, 1]  # Probability of the positive class

# Save test predictions (only the id column is needed from the raw test file)
test_ids = pd.read_csv(TEST_PATH, usecols=['id'])['id']
pd.DataFrame({'id': test_ids, 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation ROC AUC scores: [0.87502068 0.88438506 0.8778005  0.8720785  0.86503641]
Mean ROC AUC: 0.8749
Cross-validation score: 0.87486422962723
Cross-validation ROC AUC scores: [0.75601132 0.74367644 0.76681438 0.75846459 0.74874066]
Mean ROC AUC: 0.7547
k=1, metric=euclidean, Mean ROC AUC: 0.7547
Cross-validation ROC AUC scores: [0.74691473 0.74656939 0.75212402 0.76276805 0.74583956]
Mean ROC AUC: 0.7508
k=1, metric=manhattan, Mean ROC AUC: 0.7508
Cross-validation ROC AUC scores: [0.81695128 0.81780255 0.82812824 0.8229353  0.80628012]
Mean ROC AUC: 0.8184
k=2, metric=euclidean, Mean ROC AUC: 0.8184
Cross-validation ROC AUC scores: [0.81386428 0.82174289 0.81973817 0.82181865 0.80801225]
Mean ROC AUC: 0.8170
k=2, metric=manhattan, Mean ROC AUC: 0.8170
Cross-validation ROC AUC scores: [0.84897061 0.85495091 0.85335558 0.84792204 0.83433043]
Mean ROC AUC: 0.8479
k=3, metric=euclidean, Mean ROC AUC: 0.8479
Cross-validation ROC AUC scores: [0.84484141 0.8568406  0.85010268 0.8517