In [1]:
import numpy as np 
import pandas as pd


In [2]:
# Define the KNN class
class KNN:
    """Brute-force k-nearest-neighbours classifier implemented with NumPy.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training points consulted for each query row.
    distance_metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Metric used by :meth:`compute_distance`.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Memorise the training set.

        Inputs are converted with ``np.asarray`` so array-likes (lists,
        pandas objects) support the integer fancy indexing used at
        prediction time.
        """
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)

    def predict(self, X, batch_size=100):
        """Return the majority label among the k nearest neighbours per row.

        Queries are processed ``batch_size`` rows at a time so the distance
        matrix never exceeds ``batch_size x n_train`` entries. Labels must be
        non-negative integers (integer-valued floats are accepted); the
        result is an integer array of length ``X.shape[0]``.
        """
        X = np.asarray(X)
        n_samples = X.shape[0]
        predictions = np.zeros(n_samples, dtype=int)
        for start in range(0, n_samples, batch_size):
            end = min(start + batch_size, n_samples)
            k_nearest_labels = self._k_nearest_labels(X[start:end])
            predictions[start:end] = self._mode(k_nearest_labels)
        return predictions

    def compute_distance(self, X1, X2):
        """Pairwise distances between the rows of ``X1`` and ``X2``.

        Returns an array of shape ``(len(X2), len(X1))``: row ``i`` holds the
        distances from ``X2[i]`` to every row of ``X1``.

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not 'euclidean' or 'manhattan'.
        """
        X1 = np.asarray(X1, dtype=float)
        X2 = np.asarray(X2, dtype=float)
        if self.distance_metric == 'euclidean':
            # ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2, evaluated for all pairs.
            X1_sq = np.sum(X1 ** 2, axis=1).reshape(-1, 1)
            X2_sq = np.sum(X2 ** 2, axis=1).reshape(1, -1)
            sq_dists = X1_sq - 2 * np.dot(X1, X2.T) + X2_sq
            # Rounding can push (near-)zero squared distances slightly below
            # zero, which would make sqrt return NaN; clamp them first.
            np.maximum(sq_dists, 0.0, out=sq_dists)
            distances = np.sqrt(sq_dists).T
        elif self.distance_metric == 'manhattan':
            # One query row at a time keeps the temporary at
            # (n_train, n_features) instead of (n_train, batch, n_features).
            distances = np.empty((X2.shape[0], X1.shape[0]))
            for i, query in enumerate(X2):
                distances[i] = np.abs(X1 - query).sum(axis=1)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")
        return distances

    def predict_proba(self, X, batch_size=100):
        """Return the mean label of the k nearest neighbours for each row.

        With 0/1 labels this is the fraction of positive neighbours, i.e. a
        score in [0, 1]. Queries are processed in batches of ``batch_size``.
        """
        X = np.asarray(X)
        n_samples = X.shape[0]
        scores = np.zeros(n_samples)
        for start in range(0, n_samples, batch_size):
            end = min(start + batch_size, n_samples)
            k_nearest_labels = self._k_nearest_labels(X[start:end])
            scores[start:end] = np.mean(k_nearest_labels, axis=1)
        return scores

    def _k_nearest_labels(self, X_batch):
        """Labels of the k closest training rows for every row of ``X_batch``."""
        distances = self.compute_distance(self.X_train, X_batch)
        k_indices = np.argsort(distances, axis=1)[:, :self.k]
        return self.y_train[k_indices]

    def _mode(self, data):
        """Row-wise most frequent label of a 2-D label array.

        Ties resolve to the smallest label (``np.argmax`` returns the first
        maximum).

        Raises
        ------
        ValueError
            If the labels are not integer-valued.
        """
        labels = np.asarray(data)
        if not np.issubdtype(labels.dtype, np.integer):
            # np.bincount rejects floats; accept integer-valued floats/bools.
            as_int = labels.astype(np.int64)
            if not np.array_equal(as_int, labels):
                raise ValueError("Class labels must be non-negative integers")
            labels = as_int
        # A common bin count keeps every row's histogram the same length,
        # which np.apply_along_axis requires; at least 2 bins as before.
        n_bins = max(2, int(labels.max()) + 1) if labels.size else 2
        counts = np.apply_along_axis(
            lambda row: np.bincount(row, minlength=n_bins), axis=1, arr=labels
        )
        return np.argmax(counts, axis=1)

In [3]:
# Define data preprocessing function
def preprocess_data(train_path, test_path, drop_cols=None):
    """Load the train/test CSVs and return standardised feature matrices.

    Both files are concatenated so that one-hot encoding and scaling are
    applied consistently, then split back by row position.

    Parameters
    ----------
    train_path, test_path : str or path-like
        CSV files. The training file must contain the target column
        ``'Exited'``; the test file may omit it.
    drop_cols : str or iterable of str, optional
        Columns removed before encoding/scaling (for example identifier or
        free-text columns that should not act as features). Names that are
        absent are ignored and ``'Exited'`` is never dropped. The default
        ``None`` keeps every column.

    Returns
    -------
    X : ndarray, shape (n_train, n_features)
    y : ndarray, shape (n_train,)
        Target values with the dtype read from the training CSV.
    X_test : ndarray, shape (n_test, n_features)
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    print(f"Train data shape: {train_data.shape}")
    print(f"Test data shape: {test_data.shape}")

    # Combine train and test data for consistent preprocessing
    combined = pd.concat([train_data, test_data], sort=False, ignore_index=True)
    print(f"Combined data shape: {combined.shape}")

    if drop_cols is not None:
        if isinstance(drop_cols, str):
            drop_cols = [drop_cols]
        removable = [col for col in drop_cols if col != 'Exited']
        combined = combined.drop(columns=removable, errors='ignore')

    # One-hot encode categorical variables; float dummies keep the scaling
    # arithmetic below purely numeric.
    categorical_cols = combined.select_dtypes(include=['object', 'category']).columns
    combined = pd.get_dummies(combined, columns=categorical_cols, dtype=float)

    # Standardise every feature column (the target is left untouched).
    feature_cols = combined.columns.drop('Exited', errors='ignore')
    means = combined[feature_cols].mean()
    # A constant column has std 0 (or NaN with fewer than two values); dividing
    # by it would turn the whole column into NaN/inf, so divide by 1 instead.
    stds = combined[feature_cols].std().replace(0, 1.0).fillna(1.0)
    combined[feature_cols] = (combined[feature_cols] - means) / stds

    # Split back into train and test sets
    train_processed = combined.iloc[:len(train_data)].copy()
    test_processed = combined.iloc[len(train_data):].copy()
    print(f"Train processed shape: {train_processed.shape}")
    print(f"Test processed shape: {test_processed.shape}")

    # Separate features and target
    X = train_processed.drop('Exited', axis=1).values
    # Read the target from the raw training frame: in `combined` it is upcast
    # to float when the test rows contribute NaN for the missing column.
    y = train_data['Exited'].values
    X_test = test_processed.drop('Exited', axis=1).values

    return X, y, X_test

In [4]:
# Implement ROC AUC score calculation
def compute_roc_auc(y_true, y_scores):
    """Area under the ROC curve via the rank-sum (Mann-Whitney U) identity.

    Tied scores receive the average of the ranks they span (midranks), so a
    positive/negative pair with equal scores contributes 0.5 and the result
    does not depend on the order in which tied samples happen to be sorted.

    Parameters
    ----------
    y_true : array-like
        Binary labels; entries equal to 1 are positives, everything else is
        treated as negative.
    y_scores : array-like
        Scores where larger means "more likely positive".

    Returns
    -------
    float
        AUC in [0, 1], or NaN when only one class is present (the AUC is
        undefined in that case).

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    y_true = np.asarray(y_true).ravel()
    y_scores = np.asarray(y_scores).ravel()
    if y_true.shape[0] != y_scores.shape[0]:
        raise ValueError("y_true and y_scores must have the same length")

    is_positive = y_true == 1
    pos_count = int(np.sum(is_positive))
    neg_count = int(y_true.shape[0] - pos_count)
    if pos_count == 0 or neg_count == 0:
        return float('nan')

    # Midranks: for each distinct score, the group occupies ranks
    # (last - count + 1) .. last, whose average is last - (count - 1) / 2.
    _, inverse, counts = np.unique(y_scores, return_inverse=True, return_counts=True)
    last_rank = np.cumsum(counts)
    mid_ranks = last_rank - (counts - 1) / 2.0
    sum_pos_ranks = mid_ranks[inverse][is_positive].sum()

    # U statistic of the positives, normalised by the number of pos/neg pairs.
    u_stat = sum_pos_ranks - pos_count * (pos_count + 1) / 2.0
    return float(u_stat / (pos_count * neg_count))

# Implement Stratified K-Fold Cross-Validation
def stratified_k_fold(X, y, n_splits=5, shuffle=True, random_state=None):
    """Partition sample indices into ``n_splits`` class-balanced folds.

    The members of every class are split into ``n_splits`` nearly equal
    chunks (``np.array_split``) and fold ``i`` is the concatenation of the
    ``i``-th chunk of each class.

    Parameters
    ----------
    X : array-like
        Unused; kept so the signature mirrors the usual ``(X, y)`` convention.
    y : array-like, shape (n_samples,)
        Class labels to stratify on.
    n_splits : int, default 5
        Number of folds.
    shuffle : bool, default True
        Randomise the order in which samples are dealt into folds.
    random_state : int, optional
        Seed for a private generator. When ``None`` the global NumPy RNG is
        used; when given, the global RNG state is left untouched.

    Returns
    -------
    list of ndarray
        ``n_splits`` index arrays that refer to positions in the *original*
        ``X``/``y`` passed in, so callers can index their arrays directly.
    """
    y = np.asarray(y).ravel()
    n_samples = len(y)

    if shuffle:
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        order = rng.permutation(n_samples)
    else:
        order = np.arange(n_samples)

    # `order` maps shuffled position -> original index; class membership is
    # determined on the shuffled labels and translated back through `order`
    # so the folds index the caller's arrays, not a shuffled copy.
    classes, y_indices = np.unique(y[order], return_inverse=True)
    class_folds = [
        np.array_split(order[y_indices == cls], n_splits)
        for cls in range(len(classes))
    ]

    return [
        np.hstack([chunks[fold_idx] for chunks in class_folds])
        for fold_idx in range(n_splits)
    ]


# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5, batch_size=100):
    """Score ``knn`` with k-fold cross-validation and return per-fold AUCs.

    Folds come from ``stratified_k_fold`` (shuffled, fixed seed 42). Each fold
    serves once as the validation set while the model is refit on the
    remaining folds; validation scores from ``predict_proba`` are evaluated
    with ``compute_roc_auc``.

    Returns a list with one AUC value per fold, in fold order.
    """
    folds = stratified_k_fold(X, y, n_splits=n_splits, shuffle=True, random_state=42)

    def _score_fold(held_out):
        # Everything except the held-out fold forms the training split.
        val_idx = folds[held_out]
        train_idx = np.hstack(
            [fold for position, fold in enumerate(folds[:n_splits]) if position != held_out]
        )

        knn.fit(X[train_idx], y[train_idx])
        fold_scores = knn.predict_proba(X[val_idx], batch_size=batch_size)

        # Compute ROC AUC score
        return compute_roc_auc(y[val_idx], fold_scores)

    return [_score_fold(fold_number) for fold_number in range(n_splits)]


   


: 

In [None]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')



# Create and evaluate a baseline model
knn = KNN(k=5, distance_metric='manhattan')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)
print("Cross-validation scores:", cv_scores)
print("Mean AUC Score:", np.mean(cv_scores))

# Hyperparameter tuning: pick the k with the highest mean cross-validated AUC
best_k = None
# Start below any attainable score so the first valid result is always kept
# (a start value of 0 would leave best_k unset if every mean AUC were 0).
best_score = -np.inf
for k in range(1, 16, 2):  # Trying odd values of k
    knn = KNN(k=k, distance_metric='euclidean')
    cv_scores = cross_validate(X, y, knn)
    mean_score = np.mean(cv_scores)
    print(f"k={k}, Mean AUC={mean_score:.4f}")
    if mean_score > best_score:
        best_score = mean_score
        best_k = k

# NaN scores never compare greater, so best_k can still be None here; fail
# loudly instead of building KNN(k=None), which would use every training row.
if best_k is None:
    raise RuntimeError("Hyperparameter search produced no valid AUC score")

print(f"Best k: {best_k} with Mean AUC: {best_score:.4f}")

# Train on the full dataset with the selected k and score the test set
knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(X, y)

test_scores = knn.predict_proba(X_test, batch_size=500)

# Ensure probabilities are between 0 and 1
test_scores = np.clip(test_scores, 0, 1)


# Save test predictions
test_ids = pd.read_csv('test.csv')['id']

print(f"Length of X_test: {len(X_test)}")
print(f"Length of test_ids: {len(test_ids)}")
print("Predicted probabilities summary:")
print(f"Min: {test_scores.min()}, Max: {test_scores.max()}")
print(f"Mean: {test_scores.mean()}, Std: {test_scores.std()}")
submission = pd.DataFrame({'id': test_ids, 'Exited': test_scores})
submission.to_csv('submissions.csv', index=False)

Train data shape: (15000, 14)
Test data shape: (10000, 13)
Combined data shape: (25000, 14)
Train processed shape: (15000, 855)
Test processed shape: (10000, 855)
