In [182]:
import numpy as np
import pandas as pd

In [183]:
# Define the KNN class
class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Every prediction computes the distance from the query point to all
    stored training points, takes the ``k`` closest ones and returns the
    most frequent label among them. Ties between labels are resolved in
    favour of the smallest label value.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that vote. If ``k`` exceeds the number of
        training samples, all training samples vote.
    distance_metric : {'euclidean', 'manhattan', 'minkowski'}
        Distance used to rank neighbours.
    p : float, default 5
        Order of the Minkowski distance; only used when
        ``distance_metric == 'minkowski'``.

    Raises
    ------
    ValueError
        If ``k`` is not a positive integer, ``p`` is not positive, or the
        metric name is not supported.
    """

    SUPPORTED_METRICS = ('euclidean', 'manhattan', 'minkowski')

    def __init__(self, k=3, distance_metric='euclidean', p=5):
        # Fail fast: otherwise a bad configuration only surfaces deep inside
        # predict(), after the (possibly expensive) data preparation.
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError("k must be a positive integer")
        if distance_metric not in self.SUPPORTED_METRICS:
            raise ValueError("Unsupported distance metric")
        if p <= 0:
            raise ValueError("p must be positive")
        self.k = k
        self.distance_metric = distance_metric
        self.p = p

    def fit(self, X, y):
        """Store the training set.

        ``X`` is converted to a 2-D float array and ``y`` to a 1-D array so
        that later positional indexing is correct even when pandas objects
        with a non-default index are passed in.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        # np.asarray makes row iteration work for DataFrames as well
        # (iterating a DataFrame directly yields column names, not rows).
        X = np.asarray(X, dtype=float)
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        """Predict the label of a single sample by majority vote."""
        distances = self.compute_distance(x, self.X_train)
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = self.y_train[k_indices]
        # np.unique returns the labels sorted, so argmax picks the smallest
        # label among ties (same rule as bincount().argmax()), while also
        # supporting negative or non-integer labels.
        labels, counts = np.unique(k_nearest_labels, return_counts=True)
        return labels[np.argmax(counts)]

    def compute_distance(self, X1, X2):
        """Distance from the sample ``X1`` to every row of ``X2``.

        Returns a 1-D array with one distance per row of ``X2``.
        Raises ``ValueError`` for an unsupported ``distance_metric``.
        """
        abs_diff = np.abs(np.asarray(X2, dtype=float) - np.asarray(X1, dtype=float))
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(abs_diff ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(abs_diff, axis=1)
        elif self.distance_metric == 'minkowski':
            return np.power(np.sum(abs_diff ** self.p, axis=1), 1 / self.p)
        else:
            raise ValueError("Unsupported distance metric")

In [184]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    """Load the train/test CSV files and build weighted, standardised features.

    The two files are concatenated so that one-hot encoding, imputation and
    scaling use the same statistics for both. NOTE: those statistics are
    computed over train and test rows combined.

    Parameters
    ----------
    train_path, test_path : str or path-like
        CSV files. Both must contain the raw feature columns plus ``id`` and
        ``CustomerId``; the training file must also contain ``Exited``.

    Returns
    -------
    X : ndarray, shape (n_train, n_features)
        Weighted feature matrix for the training rows.
    y : ndarray of int, shape (n_train,)
        Training labels taken from the ``Exited`` column.
    X_test : ndarray, shape (n_test, n_features)
        Weighted feature matrix for the test rows.
    test_ids : ndarray, shape (n_test, 2)
        ``id`` and ``CustomerId`` of the test rows.
    features : list of str
        Column names matching the columns of ``X`` / ``X_test``.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Ensure we're working with all test data rows
    print(f"Number of rows in test data: {len(test_data)}")

    n_train = len(train_data)
    # ignore_index avoids duplicate index labels (0..n-1 appearing twice),
    # which make later index-aligned column assignments fragile.
    all_data = pd.concat([train_data, test_data], axis=0, sort=False, ignore_index=True)

    # Handle categorical variables; float dtype keeps the dummies numeric
    # regardless of the pandas version (newer versions default to bool).
    all_data = pd.get_dummies(
        all_data, columns=['Geography', 'Gender'], drop_first=False, dtype=float
    )

    # Feature engineering (+1 in the denominators avoids division by zero
    # for the common case of a zero denominator value).
    all_data['BalanceToSalary'] = all_data['Balance'] / (all_data['EstimatedSalary'] + 1)
    all_data['ProductsPerTenure'] = all_data['NumOfProducts'] / (all_data['Tenure'] + 1)
    all_data['CreditScoreToAge'] = all_data['CreditScore'] / (all_data['Age'] + 1)
    all_data['BalancePerProduct'] = all_data['Balance'] / (all_data['NumOfProducts'] + 1)

    # AgeGroup is built before the generic imputation below, so missing ages
    # are filled here; the open-ended last bin and include_lowest keep every
    # non-negative age inside a bin so the int cast cannot hit NaN.
    age_for_bins = all_data['Age'].fillna(all_data['Age'].median()).clip(lower=0)
    all_data['AgeGroup'] = pd.cut(
        age_for_bins,
        bins=[0, 30, 45, 60, np.inf],
        labels=[0, 1, 2, 3],
        include_lowest=True,
    ).astype(int)

    # Select features and assign weights
    features_and_weights = {
        'Age': 2.5, 'Tenure': 4.0, 'Balance': 2.0, 'NumOfProducts': 3.5,
        'IsActiveMember': 5.0, 'CreditScore': 2.0, 'EstimatedSalary': 1.5,
        'HasCrCard': 1.5, 'Geography_France': 1.0, 'Geography_Germany': 1.0,
        'Geography_Spain': 1.0, 'Gender_Male': 1.0, 'BalanceToSalary': 3.0,
        'ProductsPerTenure': 4.0, 'CreditScoreToAge': 2.5, 'BalancePerProduct': 3.0,
        'AgeGroup': 2.0,
    }

    features = list(features_and_weights.keys())

    # A category absent from both files produces no dummy column; add it as
    # all zeros so the feature layout stays fixed.
    for feature in features:
        is_dummy = feature.startswith(('Geography_', 'Gender_'))
        if is_dummy and feature not in all_data.columns:
            all_data[feature] = 0.0

    # Impute, scale and weight
    for feature, weight in features_and_weights.items():
        if feature == 'AgeGroup':
            # Ordinal bucket: weighted but intentionally not standardised.
            all_data[feature] = all_data[feature] * weight
            continue

        column = all_data[feature].astype(float)
        # Treat infinities from the ratio features as missing values.
        column = column.replace([np.inf, -np.inf], np.nan)
        column = column.fillna(column.median())

        std = column.std()
        if not np.isfinite(std) or std == 0:
            # Constant column: centre it without dividing by zero.
            std = 1.0
        all_data[feature] = (column - column.mean()) / std * weight

    # Split back into train and test
    feature_matrix = all_data[features].to_numpy(dtype=float)
    X = feature_matrix[:n_train]
    y = train_data['Exited'].astype(int).values
    X_test = feature_matrix[n_train:]

    return X, y, X_test, test_data[['id', 'CustomerId']].values, features


    # def mutual_info_score(X, y):
    #     y = y.reshape(-1, 1)
    #     y_counts = np.bincount(y.ravel())
    #     y_probs = y_counts / len(y)
    #     y_entropy = -np.sum(y_probs * np.log2(y_probs + 1e-10))

    #     mi_scores = []
    #     for col in X.T:
    #         bins = np.histogram_bin_edges(col, bins='auto')
    #         xy_counts = np.histogram2d(col, y.ravel(), bins=(bins, [0, 1, 2]))[0]
    #         xy_probs = xy_counts / len(y)
    #         x_probs = xy_probs.sum(axis=1)
    #         y_probs = xy_probs.sum(axis=0)

    #         mi = np.sum(xy_probs * np.log2(xy_probs / (x_probs[:, None] * y_probs[None, :]) + 1e-10))
    #         mi_scores.append(mi)

    #     return np.array(mi_scores)

    # mi_scores = mutual_info_score(X, y)
    # selected_features = [features[i] for i in np.argsort(mi_scores)[-8:]]  # Select top 8 features

    # # Split back into train and test
    # X = all_data[selected_features].iloc[:len(train_data)].values
    # y = train_data['Exited'].astype(int).values  # Convert to int
    # X_test = all_data[selected_features].iloc[len(train_data):].values

    # return X, y, X_test, all_data['CustomerId'].iloc[len(train_data):].values, selected_features

    # pass

In [185]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    """Estimate a classifier's accuracy with shuffled k-fold cross-validation.

    The samples are shuffled with a fixed seed (42), split into ``n_splits``
    contiguous folds (the last fold absorbs the remainder), and each fold is
    used once as the validation set while the model is refit on the rest.

    Note: the score is plain accuracy (fraction of correct hard
    predictions), not ROC AUC.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,)
    knn : object
        Any estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refit
        on every fold, so its state after the call reflects the last fold.
    n_splits : int, default 5
        Number of folds; must satisfy ``2 <= n_splits <= n_samples``.

    Returns
    -------
    float
        Mean validation accuracy over the folds.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``n_splits`` is out of range.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)

    if len(y) != n_samples:
        raise ValueError("X and y must contain the same number of samples")
    if n_splits < 2 or n_splits > n_samples:
        # Fewer than 2 folds leaves no training data; more folds than
        # samples produces empty validation folds (NaN scores).
        raise ValueError("n_splits must be between 2 and the number of samples")

    # A private RandomState yields the same permutation as
    # np.random.seed(42) + np.random.permutation, without reseeding the
    # global generator behind the caller's back.
    rng = np.random.RandomState(42)
    indices = rng.permutation(n_samples)
    fold_size = n_samples // n_splits

    scores = []
    for i in range(n_splits):
        start = i * fold_size
        end = (i + 1) * fold_size if i < n_splits - 1 else n_samples

        val_indices = indices[start:end]
        train_indices = np.concatenate([indices[:start], indices[end:]])

        knn.fit(X[train_indices], y[train_indices])
        predictions = knn.predict(X[val_indices])

        scores.append(np.mean(predictions == y[val_indices]))

    return np.mean(scores)

In [186]:
# Load and preprocess data
X, y, X_test, test_ids, selected_features = preprocess_data('/content/train.csv', '/content/test.csv')

# Hyperparameter tuning: evaluate every (k, metric) pair with
# cross-validation and keep the best-scoring combination.
k_values = [28]
distance_metrics = ['euclidean', 'manhattan']

# Start from -inf / None rather than 0 so that a run in which no candidate
# beats the sentinel cannot silently fall through to an invalid k=0 model.
best_score = float('-inf')
best_k = None
best_metric = None

for k in k_values:
    for metric in distance_metrics:
        knn = KNN(k=k, distance_metric=metric)
        score = cross_validate(X, y, knn)
        print(f"k={k}, metric={metric}, score={score}")

        if score > best_score:
            best_score = score
            best_k = k
            best_metric = metric

if best_k is None:
    # Only reachable when every score was NaN or the grid was empty.
    raise RuntimeError("Hyperparameter search produced no valid cross-validation score")

print(f"Best hyperparameters: k={best_k}, metric={best_metric}, score={best_score}")

# Train on full dataset with optimal hyperparameters and predict the test set
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Create submissions DataFrame
submissions_df = pd.DataFrame({'id': test_ids[:, 0], 'CustomerId': test_ids[:, 1], 'Exited': test_predictions})
submissions_df = submissions_df.sort_values('id')

# Report how many predictions were produced and how many ids are distinct
print(f"Number of predictions: {len(submissions_df)}")
print(f"Number of unique IDs: {submissions_df['id'].nunique()}")

# Save test predictions
submissions_df[['id', 'Exited']].to_csv('submissions.csv', index=False)

print(f"Saved {len(submissions_df)} unique predictions to submissions.csv")

Number of rows in test data: 10000
k=28, metric=euclidean, score=0.8836666666666666
k=28, metric=manhattan, score=0.8862666666666665
Best hyperparameters: k=28, metric=manhattan, score=0.8862666666666665
Number of predictions: 10000
Number of unique IDs: 10000
Saved 10000 unique predictions to submissions.csv
