In [10]:
import numpy as np
import pandas as pd

In [11]:
class KNN:
    def __init__(self, k=3, distance_metric='euclidean', weighted=False):
        self.k = k
        self.distance_metric = distance_metric
        self.weighted = weighted

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = []
        for x in X:
            distances = self.compute_distance(x, self.X_train)
            indices = np.argsort(distances)[:self.k]
            k_labels = self.y_train[indices]
            if self.weighted:
                k_distances = distances[indices]
                k_distances = np.where(k_distances == 0, 1e-5, k_distances)
                weights = 1 / k_distances
                predicted_prob = np.average(k_labels, weights=weights)
            else:
                predicted_prob = np.mean(k_labels)
            predictions.append(predicted_prob)
        return np.array(predictions)

    def compute_distance(self, x, X_train):
        if self.distance_metric == 'euclidean':
            distances = np.sqrt(np.sum((X_train - x) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            distances = np.sum(np.abs(X_train - x), axis=1)
        else:
            raise ValueError('Unsupported distance metric')
        return distances

In [12]:
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Drop unnecessary columns
    train_data = train_data.drop(['CustomerId', 'Surname'], axis=1)
    test_data = test_data.drop(['CustomerId', 'Surname'], axis=1)

    # Store 'id' column from test_data
    test_ids = test_data['id']

    # Drop 'id' column from train and test data
    train_data = train_data.drop(['id'], axis=1)
    test_data = test_data.drop(['id'], axis=1)

    # Map 'Gender' to numerical values
    gender_mapping = {'Male': 0, 'Female': 1}
    train_data['Gender'] = train_data['Gender'].map(gender_mapping)
    test_data['Gender'] = test_data['Gender'].map(gender_mapping)

    # One-hot encode 'Geography'
    train_data = pd.get_dummies(train_data, columns=['Geography'], drop_first=True)
    test_data = pd.get_dummies(test_data, columns=['Geography'], drop_first=True)

    # Align test data columns with train data
    missing_cols = set(train_data.columns) - set(test_data.columns) - set(['Exited'])
    for c in missing_cols:
        test_data[c] = 0
    test_data = test_data[train_data.drop('Exited', axis=1).columns]

    # Separate features and target
    X_train = train_data.drop('Exited', axis=1)
    y_train = train_data['Exited']

    # Combine features for scaling
    features = pd.concat([X_train, test_data], axis=0)

    # Convert all data to float
    features = features.astype(float)

    # Min-Max Scaling
    feature_min = features.min()
    feature_max = features.max()
    features = (features - feature_min) / (feature_max - feature_min)

    # Split back into train and test data
    X_train = features.iloc[:len(X_train), :]
    X_test = features.iloc[len(X_train):, :]

    # Convert to numpy arrays
    X_train = X_train.values
    y_train = y_train.values
    X_test = X_test.values

    return X_train, y_train, X_test, test_ids


In [13]:
def roc_auc_score(y_true, y_scores):
    # Ensure numpy arrays
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)

    # Get positive and negative scores
    pos = y_scores[y_true == 1]
    neg = y_scores[y_true == 0]

    # Total number of positive and negative samples
    n_pos = len(pos)
    n_neg = len(neg)

    # Handle cases with no positive or negative samples
    if n_pos == 0 or n_neg == 0:
        return 0.0

    # Calculate ranks
    combined_scores = np.concatenate([pos, neg])
    combined_labels = np.concatenate([np.ones(n_pos), np.zeros(n_neg)])
    sorted_indices = np.argsort(combined_scores)
    sorted_labels = combined_labels[sorted_indices]

    # Calculate rank sums for positive samples
    rank = np.arange(1, n_pos + n_neg + 1)
    rank_sum = np.sum(rank[sorted_labels == 1])

    # Compute AUC using Mann-Whitney U statistic
    auc = (rank_sum - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)
    return auc


In [14]:
def train_validation_split(X, y, test_size=0.2, random_state=None):
    if random_state is not None:
        np.random.seed(random_state)
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    split_idx = int(len(X) * (1 - test_size))
    train_indices = indices[:split_idx]
    val_indices = indices[split_idx:]
    X_train = X[train_indices]
    X_val = X[val_indices]
    y_train = y[train_indices]
    y_val = y[val_indices]
    return X_train, X_val, y_train, y_val

In [15]:
import random

def random_search(X, y, param_distributions, n_iter=10):
    best_auc = 0
    best_params = None
    np.random.seed(42)
    random.seed(42)

    # Split data into training and validation sets
    X_train_rs, X_val_rs, y_train_rs, y_val_rs = train_validation_split(
        X, y, test_size=0.2, random_state=42
    )

    for _ in range(n_iter):
        # Randomly sample hyperparameters
        params = {key: random.choice(values) for key, values in param_distributions.items()}

        # Create and train the model
        knn = KNN(k=params['k'], distance_metric=params['distance_metric'], weighted=params['weighted'])
        knn.fit(X_train_rs, y_train_rs)

        # Predict on validation set
        y_scores = knn.predict(X_val_rs)

        # Compute ROC AUC
        auc = roc_auc_score(y_val_rs, y_scores)
        print(f"Params: {params}, AUC: {auc}")

        if auc > best_auc:
            best_auc = auc
            best_params = params

    print(f"\nBest AUC: {best_auc} with params: {best_params}")
    return best_params


In [16]:
def stratified_k_fold(X, y, n_splits):
    from collections import defaultdict
    # Group indices by class
    indices_by_class = defaultdict(list)
    for idx, label in enumerate(y):
        indices_by_class[label].append(idx)

    folds = [[] for _ in range(n_splits)]
    # Distribute indices to folds
    for label, indices in indices_by_class.items():
        np.random.shuffle(indices)
        for i, idx in enumerate(indices):
            folds[i % n_splits].append(idx)
    return folds

def hyperparameter_tuning(X, y, param_grid, max_evals=10):
    from itertools import product

    # Generate all combinations of hyperparameters
    all_params = list(product(*param_grid.values()))
    np.random.shuffle(all_params)  # Shuffle for randomness

    best_auc = 0
    best_params = None
    evaluations = 0

    for params in all_params:
        if evaluations >= max_evals:
            break
        param_dict = dict(zip(param_grid.keys(), params))
        auc = cross_validate(X, y, **param_dict)
        print(f"Params: {param_dict}, AUC: {auc}")
        if auc > best_auc:
            best_auc = auc
            best_params = param_dict
        evaluations += 1

    print(f"\nBest AUC: {best_auc} with params: {best_params}")
    return best_params

def cross_validate(X, y, k, distance_metric, weighted, n_splits=5):
    np.random.seed(42)
    folds = stratified_k_fold(X, y, n_splits)
    auc_scores = []

    for i in range(n_splits):
        val_indices = folds[i]
        train_indices = [idx for fold in folds[:i] + folds[i+1:] for idx in fold]

        X_train, y_train = X[train_indices], y[train_indices]
        X_val, y_val = X[val_indices], y[val_indices]

        # Fit the model
        knn = KNN(k=k, distance_metric=distance_metric, weighted=weighted)
        knn.fit(X_train, y_train)

        # Predict probabilities on validation set
        y_scores = knn.predict(X_val)

        # Compute ROC AUC
        auc = roc_auc_score(y_val, y_scores)
        auc_scores.append(auc)

    mean_auc = np.mean(auc_scores)
    return mean_auc

In [17]:
# Sample a subset of the data for hyperparameter tuning
def sample_data(X, y, sample_size=1000):
    np.random.seed(42)
    indices = np.random.choice(len(X), size=sample_size, replace=False)
    return X[indices], y[indices]


In [18]:
X, y, X_test, test_ids = preprocess_data('train.csv', 'test.csv')

# Define hyperparameter distributions
param_distributions = {
    'k': list(range(3, 20, 2)),  # Odd values between 3 and 19
    'distance_metric': ['euclidean', 'manhattan'],
    'weighted': [True, False]
}

# Perform random search
best_params = random_search(X, y, param_distributions, n_iter=20)

# Train on full dataset with optimal hyperparameters
knn = KNN(k=best_params['k'], distance_metric=best_params['distance_metric'], weighted=best_params['weighted'])
knn.fit(X, y)

# Make predictions on test set
test_predictions = knn.predict(X_test)

# Save test predictions
submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
submission.to_csv('submissions.csv', index=False)

Params: {'k': 5, 'distance_metric': 'euclidean', 'weighted': False}, AUC: 0.8978633351404054
Params: {'k': 9, 'distance_metric': 'euclidean', 'weighted': True}, AUC: 0.9085972077004341
Params: {'k': 5, 'distance_metric': 'euclidean', 'weighted': False}, AUC: 0.8978633351404054
Params: {'k': 3, 'distance_metric': 'euclidean', 'weighted': True}, AUC: 0.8954994405766166
Params: {'k': 9, 'distance_metric': 'euclidean', 'weighted': True}, AUC: 0.9085972077004341
Params: {'k': 19, 'distance_metric': 'euclidean', 'weighted': False}, AUC: 0.9143189995650814
Params: {'k': 9, 'distance_metric': 'manhattan', 'weighted': False}, AUC: 0.9106639191933448
Params: {'k': 3, 'distance_metric': 'euclidean', 'weighted': False}, AUC: 0.8872163106010901
Params: {'k': 13, 'distance_metric': 'manhattan', 'weighted': True}, AUC: 0.9143719225777035
Params: {'k': 9, 'distance_metric': 'manhattan', 'weighted': True}, AUC: 0.9154358108314409
Params: {'k': 5, 'distance_metric': 'manhattan', 'weighted': True}, AUC: 

In [19]:
import pandas as pd

# Load training and test data separately
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Print column names
print("Training Data Columns:")
print(train_data.columns.tolist())

print("\nTest Data Columns:")
print(test_data.columns.tolist())

Training Data Columns:
['id', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited']

Test Data Columns:
['id', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
