In [1]:
import numpy as np
import pandas as pd

In [2]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        # TODO: Implement the predict method
        predictions = []
        
        for example in X:
            # Calculate distances from the current example to all training points
            distances = []
            for i in range(len(self.X_train)):
                distance = self.compute_distance(self.X_train[i], example)
                distances.append((i, distance))

            # Sort the distances and get the indices of the k nearest neighbors
            distances.sort(key=lambda x: x[1])
            k_neighbors = distances[:self.k]

            # Retrieve the labels of the k nearest neighbors
            k_y_values = [self.y_train[neighbor[0]] for neighbor in k_neighbors]

            # Perform majority voting for classification
            prediction = max(set(k_y_values), key=k_y_values.count)
            predictions.append(prediction)

        return np.array(predictions)

    def compute_distance(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2))
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2))

In [3]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.
    
    # Drop irrelevant columns
    drop_columns = ['id', 'CustomerId', 'Surname']
    train_data = train_data.drop(columns=drop_columns)
    test_data = test_data.drop(columns=drop_columns)

    # Fill missing values for numerical columns with mean
    numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
    for feature in numeric_features:
        train_data[feature].fillna(train_data[feature].mean(), inplace=True)
        test_data[feature].fillna(train_data[feature].mean(), inplace=True)  # Use train mean for test

    # One-hot encoding for categorical variables ('Geography' and 'Gender')
    train_data = pd.get_dummies(train_data, columns=['Geography', 'Gender'], drop_first=True)
    test_data = pd.get_dummies(test_data, columns=['Geography', 'Gender'], drop_first=True)

    # Align train and test data columns
    test_data = test_data.reindex(columns=train_data.columns.drop('Exited'), fill_value=0)

    # Separate features and target labels (for train only)
    X_train = train_data.drop('Exited', axis=1)
    y_train = train_data['Exited']
    X_test = test_data.copy()  # Assuming test doesn't have 'Exited'

    # Manual Min-Max Scaling
    for feature in numeric_features:
        min_value = X_train[feature].min()
        max_value = X_train[feature].max()
        X_train[feature] = (X_train[feature] - min_value) / (max_value - min_value)
        # Scale test data using the same min and max from the train data
        X_test[feature] = (X_test[feature] - min_value) / (max_value - min_value)

    return X_train.values, y_train.values, X_test.values

In [4]:
#helper functions
# Accuracy calculation
def accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)

# Precision calculation
def precision(y_true, y_pred):
    true_positive = np.sum((y_true == 1) & (y_pred == 1))
    predicted_positive = np.sum(y_pred == 1)
    return true_positive / predicted_positive if predicted_positive > 0 else 0

# Recall calculation
def recall(y_true, y_pred):
    true_positive = np.sum((y_true == 1) & (y_pred == 1))
    actual_positive = np.sum(y_true == 1)
    return true_positive / actual_positive if actual_positive > 0 else 0

# F1-Score calculation
def f1_score(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * (p * r) / (p + r) if (p + r) > 0 else 0

# ROC AUC calculation (simplified, assuming binary classification)
def roc_auc(y_true, y_scores):
    # Sort true labels by predicted scores
    sorted_indices = np.argsort(y_scores)[::-1]
    y_true_sorted = y_true[sorted_indices]
    
    # Calculate true positive rate (TPR) and false positive rate (FPR)
    tpr = np.cumsum(y_true_sorted) / np.sum(y_true_sorted)  # True positive rate
    fpr = np.cumsum(1 - y_true_sorted) / np.sum(1 - y_true_sorted)  # False positive rate
    
    # Compute AUC using the trapezoidal rule
    auc = np.trapz(tpr, fpr)
    return auc

In [5]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    X, y = X[indices], y[indices]
    
    fold_size = len(X) // n_splits
    metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1_score': [], 'roc_auc': []}
    
    for i in range(n_splits):
        # Create train and validation sets
        X_val = X[i * fold_size: (i + 1) * fold_size]
        y_val = y[i * fold_size: (i + 1) * fold_size]
        X_train = np.concatenate((X[:i * fold_size], X[(i + 1) * fold_size:]), axis=0)
        y_train = np.concatenate((y[:i * fold_size], y[(i + 1) * fold_size:]), axis=0)
        
        # Fit the KNN model
        knn.fit(X_train, y_train)
        
        # Predict class labels for validation set
        y_pred = knn.predict(X_val)
        
        # Compute performance metrics
        metrics['accuracy'].append(accuracy(y_val, y_pred))
        metrics['precision'].append(precision(y_val, y_pred))
        metrics['recall'].append(recall(y_val, y_pred))
        metrics['f1_score'].append(f1_score(y_val, y_pred))
        metrics['roc_auc'].append(roc_auc(y_val, y_pred))  # ROC AUC from binary labels

    # Return the average of each metric across all folds
    return {key: np.mean(value) for key, value in metrics.items()}

In [None]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning
best_k = None
best_score = 0
for k in range(1, 21):  # Try k from 1 to 20
    knn = KNN(k=k, distance_metric='euclidean')
    cv_scores = cross_validate(X, y, knn)
    
    if cv_scores['accuracy'] > best_score:
        best_score = cv_scores['accuracy']
        best_k = k

print(f"Best k: {best_k} with Accuracy: {best_score}")


# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data[feature].fillna(train_data[feature].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data[feature].fillna(train_data[feature].mean(), inplace=True)  # Use train mean for test
  auc = np.trapz(tpr, fpr)
