In [6]:
import numpy as np
import pandas as pd

In [7]:
class KNN:
    """k-nearest-neighbours scorer.

    ``predict`` does not return hard class labels: for every query row it
    returns the mean label of the nearest training rows.  With 0/1 labels
    that is the fraction of positive neighbours, i.e. a score in [0, 1].
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        """Store the hyperparameters.

        Args:
            k: Number of nearest training rows used for each query row.
            distance_metric: ``'euclidean'`` or ``'manhattan'``.
        """
        self.k = k
        self.distance_metric = distance_metric

    @staticmethod
    def _as_array(data):
        """Return ``data`` as a NumPy array; pandas objects are unwrapped."""
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.values
        # Lists / tuples are accepted as well; arrays pass through uncopied.
        return np.asarray(data)

    def fit(self, X, y):
        """Memorise the training data (no model is actually fitted).

        Args:
            X: Training features, one row per sample (DataFrame, array or
                nested list).
            y: Training labels, one per row of ``X`` (Series, array or list).
        """
        self.X_train = self._as_array(X)
        # Flatten so that a single-column 2-D label container also works.
        self.y_train = self._as_array(y).ravel()

    def predict(self, X):
        """Score every row of ``X``.

        Args:
            X: Query features; a 1-D input is treated as a single row.

        Returns:
            1-D ``np.ndarray`` with one score per query row: the mean label
            of the (at most ``k``) closest training rows.

        Raises:
            ValueError: If ``distance_metric`` is not supported.
        """
        X = self._as_array(X)
        if X.ndim == 1:
            X = X.reshape(1, -1)

        sample_probabilities = []
        for sample in X:
            distances = self.compute_distance(sample, self.X_train)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            # Divide by the number of neighbours actually found: when k
            # exceeds the training-set size there are fewer than k labels.
            probability = np.sum(k_nearest_labels) / len(k_nearest_labels)
            sample_probabilities.append(probability)
        return np.array(sample_probabilities)

    def compute_distance(self, X1, X2):
        """Distance from one sample to every row of a matrix.

        Args:
            X1: A single sample (1-D) that broadcasts against ``X2``.
            X2: 2-D collection of samples.

        Returns:
            1-D array with one distance per row of ``X2``.

        Raises:
            ValueError: If ``distance_metric`` is neither ``'euclidean'``
                nor ``'manhattan'``.
        """
        if self.distance_metric not in ('euclidean', 'manhattan'):
            # Previously this fell through and returned None.
            raise ValueError(
                f"Unsupported distance_metric {self.distance_metric!r}; "
                "expected 'euclidean' or 'manhattan'"
            )
        # asarray avoids re-copying the training matrix for every query row
        # when it is already float64.
        X1 = np.asarray(X1, dtype=np.float64)
        X2 = np.asarray(X2, dtype=np.float64)
        difference = X1 - X2
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(difference ** 2, axis=1))
        return np.sum(np.abs(difference), axis=1)

In [8]:
class scalar:
    """Column-wise standardiser: subtract the mean, divide by the sample std.

    Attributes set by ``fit``:
        mean_: Per-column mean of the fitted data.
        scale_: Per-column standard deviation (``ddof=1``) of the fitted
            data, with zeros replaced by 1 so constant columns stay finite.
    """

    def __init__(self):
        # Both stay None until fit() has been called.
        self.mean_ = None
        self.scale_ = None

    def check_input(self, X):
        """Return ``X`` as a NumPy array (pandas objects are unwrapped)."""
        if isinstance(X, (pd.DataFrame, pd.Series)):
            return X.values
        return np.array(X)

    def fit(self, X):
        """Learn the per-column mean and scale from ``X``.

        Args:
            X: Data to learn from (DataFrame, Series, array or nested list).

        Returns:
            ``self``, so calls can be chained.
        """
        X = self.check_input(X)
        self.mean_ = np.mean(X, axis=0)
        scale = np.std(X, axis=0, ddof=1)
        # A constant column has zero deviation; dividing by it would turn the
        # whole column into NaN.  Scaling by 1 instead leaves it at 0.
        self.scale_ = np.where(scale == 0, 1.0, scale)
        return self

    def transform(self, X):
        """Standardise ``X`` with the statistics learned by ``fit``.

        Returns:
            NumPy array ``(X - mean_) / scale_``.
        """
        X = self.check_input(X)
        return (X - self.mean_) / self.scale_

    def transformv2(self, X):
        """Fit on ``X`` and return the standardised ``X`` (fit + transform)."""
        return self.fit(X).transform(X)

# Define data preprocessing function
def preprocess_data(train, test):
    """Load the train/test CSVs and turn them into model-ready matrices.

    Steps, applied in this order:
      1. Drop the identifier columns ``CustomerId``, ``Surname`` and ``id``.
      2. Cast ``HasCrCard`` and ``IsActiveMember`` to ``object`` so that they
         are not picked up as numeric features in step 4.
      3. One-hot encode ``Geography`` and ``Gender`` (first level dropped).
      4. Take every numeric training column except ``Exited`` as a feature.
      5. Remove training rows where any feature has ``|z-score| > 3``.
      6. Standardise the features with statistics learned on the training
         rows that remain, and apply the same transform to the test set.

    Args:
        train: Path (or anything ``pd.read_csv`` accepts) of the training CSV;
            must contain the ``Exited`` target column.
        test: Path (or anything ``pd.read_csv`` accepts) of the test CSV.

    Returns:
        Tuple ``(X, y, X_test)``: ``X`` is the float training feature frame,
        ``y`` the ``Exited`` column of the kept training rows, and ``X_test``
        the float test feature frame with exactly the columns of ``X``.
    """
    # Local import: SciPy is only needed for the z-score outlier filter.
    from scipy import stats

    dropped_columns = ['CustomerId', 'Surname', 'id']
    flag_columns = ['HasCrCard', 'IsActiveMember']
    categorical_columns = ['Geography', 'Gender']

    def _prepare(frame):
        """Apply the column-level steps shared by the train and test frames."""
        frame = frame.drop(columns=dropped_columns)
        for column in flag_columns:
            frame[column] = frame[column].astype('object')
        return pd.get_dummies(frame, columns=categorical_columns, drop_first=True)

    def _outlier_mask(df, columns, threshold=3):
        """Boolean mask of rows with ``|z| > threshold`` in any of ``columns``."""
        mask = np.zeros(df.shape[0], dtype=bool)
        for column in columns:
            z_scores = np.abs(np.asarray(stats.zscore(df[column])))
            mask |= z_scores > threshold
        return mask

    train_data = _prepare(pd.read_csv(train))
    test_data = _prepare(pd.read_csv(test))

    features = train_data.select_dtypes(include=[np.number]).columns.tolist()
    if 'Exited' in features:
        features.remove('Exited')

    # .copy() makes the filtered frame independent, so the column assignment
    # below does not write into a slice of the unfiltered frame.
    train_data = train_data[~_outlier_mask(train_data, features)].copy()

    scaler = scalar()
    train_data[features] = scaler.transformv2(train_data[features])
    test_data[features] = scaler.transform(test_data[features])

    X = train_data.drop('Exited', axis=1).astype('float')
    y = train_data['Exited']
    # The two frames were one-hot encoded independently, so force the test
    # frame onto the training columns (same set, same order); a dummy column
    # missing from the test data is filled with 0.
    X_test = test_data.reindex(columns=X.columns, fill_value=0).astype('float')

    return X, y, X_test

In [9]:
class StratifiedKFold:
    """K-fold splitter that keeps each class spread evenly across the folds."""

    def __init__(self, n_splits=5, shuffle=True, state=None):
        """Store the split configuration.

        Args:
            n_splits: Number of folds (at least 2).
            shuffle: Whether to shuffle the samples of each class before
                dealing them out to the folds.
            state: Seed for the shuffle; ``None`` gives a fresh random order.
        """
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.state = state

    def split(self, X, y):
        """Yield ``(train_indices, validation_indices)`` for every fold.

        Args:
            X: Unused; present so the call looks like ``split(X, y)``.
            y: Class labels, one per sample.

        Yields:
            Two integer position arrays per fold.  For each class, the first
            ``len(class) % n_splits`` folds receive one extra sample.

        Raises:
            ValueError: If ``n_splits`` is smaller than 2 (raised when
                iteration starts, because this is a generator).
        """
        if self.n_splits < 2:
            raise ValueError(f"n_splits must be at least 2, got {self.n_splits}")

        # A private generator yields the same shuffles as np.random.seed(state)
        # followed by np.random.shuffle, without resetting the global NumPy
        # random state for the rest of the notebook.
        rng = np.random.RandomState(self.state)

        y = np.asarray(y).ravel()
        unique_classes, y_indices = np.unique(y, return_inverse=True)

        # folds[i] collects one index chunk per class.
        folds = [[] for _ in range(self.n_splits)]

        for class_id in range(len(unique_classes)):
            class_indices = np.where(y_indices == class_id)[0]
            if self.shuffle:
                rng.shuffle(class_indices)
            fsizes = np.full(self.n_splits, len(class_indices) // self.n_splits, dtype=int)
            fsizes[:len(class_indices) % self.n_splits] += 1
            current = 0
            for fold, fsize in enumerate(fsizes):
                folds[fold].append(class_indices[current:current + fsize])
                current += fsize

        # Concatenating integer chunks keeps an integer dtype even for an
        # empty fold, so the result is always usable as an index array.
        empty = np.empty(0, dtype=np.intp)
        fold_indices = [np.concatenate(chunks) if chunks else empty for chunks in folds]

        for fold in range(self.n_splits):
            value_indices = fold_indices[fold]
            train_indices = np.concatenate(
                [fold_indices[other] for other in range(self.n_splits) if other != fold]
            )
            yield train_indices, value_indices

def roc_auc_score(y_true, y_scores):
    """Area under the ROC curve for binary labels.

    Args:
        y_true: Ground-truth labels, expected to be 0 (negative) or
            1 (positive).  Any array-like is accepted.
        y_scores: Predicted scores, higher meaning "more positive"; same
            length as ``y_true``.

    Returns:
        The area under the ROC curve, obtained with the trapezoidal rule over
        one ROC point per distinct score value.

    Raises:
        ValueError: If the inputs are empty, differ in length, or contain only
            one class (the ROC curve is undefined in that case).
    """
    # Work on plain positional arrays so that lists and pandas Series with an
    # arbitrary index behave the same as NumPy arrays.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_scores = np.asarray(y_scores, dtype=float).ravel()

    if y_true.shape != y_scores.shape:
        raise ValueError("y_true and y_scores must have the same length")
    if y_true.size == 0:
        raise ValueError("roc_auc_score requires at least one sample")
    positives = y_true.sum()
    if positives == 0 or positives == y_true.size:
        raise ValueError("ROC AUC is undefined when only one class is present in y_true")

    # Rank samples from the highest to the lowest score.
    score_indices = np.argsort(-y_scores)
    y_true = y_true[score_indices]
    y_scores = y_scores[score_indices]

    # One threshold after the last sample of every run of equal scores, so
    # tied scores contribute a single ROC point.
    indices = np.where(np.diff(y_scores))[0]
    threshold_indices = np.r_[indices, y_true.size - 1]

    truepos = np.cumsum(y_true)[threshold_indices]
    falsepos = 1 + threshold_indices - truepos

    # Start the curve at the origin, then normalise the counts to rates.
    truepos = np.r_[0, truepos]
    falsepos = np.r_[0, falsepos]
    falsepos = falsepos / falsepos[-1]
    truepos = truepos / truepos[-1]

    # NumPy 2.0 renamed trapz to trapezoid; use whichever is available.
    trapezoid = getattr(np, "trapezoid", None)
    if trapezoid is None:
        trapezoid = np.trapz
    auc = trapezoid(truepos, falsepos)
    return auc

# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5, random_state=43):
    """Estimate the ROC AUC of ``knn`` with stratified k-fold cross-validation.

    The model is refitted on every training fold, so ``knn`` is left fitted
    on the last training fold when the function returns.  The mean score is
    printed as ``ROC AUC: <mean>``.

    Args:
        X: Feature ``DataFrame`` (rows are selected with ``.iloc``).
        y: Label ``Series`` aligned with ``X`` (rows are selected with
            ``.iloc``).
        knn: Model exposing ``fit(X, y)`` and ``predict(X)``; ``predict`` must
            return one score per row.
        n_splits: Number of folds.
        random_state: Seed for the fold shuffle.  The default, 43, is the
            value that used to be hard-coded.

    Returns:
        List with one ROC AUC value per fold.
    """
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, state=random_state)
    scores = []
    for train_index, value_index in kfold.split(X, y):
        X_train = X.iloc[train_index].reset_index(drop=True)
        X_value = X.iloc[value_index].reset_index(drop=True)
        y_train = y.iloc[train_index].reset_index(drop=True)
        y_value = y.iloc[value_index].reset_index(drop=True)

        knn.fit(X_train, y_train)
        y_probability = knn.predict(X_value)
        scores.append(roc_auc_score(y_value.values, y_probability))

    mean = np.mean(scores)
    print(f"ROC AUC: {mean}")
    return scores

In [10]:
# Load both CSVs and build the training features, training labels and test features.
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Baseline model: k=20 with Euclidean distance.
knn = KNN(k=20, distance_metric='euclidean')

# Cross-validate the baseline (cross_validate defaults to 5 folds and prints the mean ROC AUC itself).
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# Hyperparameter tuning: grid search over k and the distance metric.
# best_score starts at 0, so a configuration is only kept if its mean ROC AUC is strictly greater than 0.
k_values = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
metrics = ['euclidean', 'manhattan']
best_k = 0
best_metric = None
best_score = 0

# Evaluate every (k, metric) pair; each cross_validate call prints one "ROC AUC: ..." line.
# NOTE: the (k=20, euclidean) baseline from above is evaluated a second time by this grid.
for k in k_values:
    for metric in metrics:
        knn = KNN(k=k, distance_metric=metric)
        scores = cross_validate(X, y, knn)
        mean = np.mean(scores)
        
        if mean > best_score:
            best_score = mean
            best_k = k
            best_metric = metric

print(f"k = {best_k}, distance_metric = {best_metric}")
print(f"Best ROC AUC: {best_score:}")

# Refit on the full training set with the best hyperparameters and score the test set.
# preprocess_data dropped the 'id' column, so the ids are re-read from test.csv for the submission file.
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

ROC AUC: 0.9126458108796077
Cross-validation scores: [0.9151673093846362, 0.9174517603794778, 0.9046717533211177, 0.9167977532923416, 0.9091404780204653]
ROC AUC: 0.9000723624081737
ROC AUC: 0.8999531647859598
ROC AUC: 0.9126458108796077
ROC AUC: 0.9105320277871305
ROC AUC: 0.9163043467691783
ROC AUC: 0.914595867124169
ROC AUC: 0.9174686985836585
ROC AUC: 0.9167111666476029
ROC AUC: 0.917574210951301
ROC AUC: 0.9177262369715871
ROC AUC: 0.9175624311006118
ROC AUC: 0.9176991206703274
ROC AUC: 0.9174661509261476
ROC AUC: 0.917643374782853
ROC AUC: 0.9169539226344694
ROC AUC: 0.9179266103322907
ROC AUC: 0.9168891943369338
ROC AUC: 0.9175222136740728
ROC AUC: 0.9163376731141633
ROC AUC: 0.9174410452144237
k = 80, distance_metric = manhattan
Best ROC AUC: 0.9179266103322907
