In [74]:
import numpy as np
import pandas as pd

In [158]:
# Define the KNN class
class KNN:
    """k-nearest-neighbours model offering majority-vote and distance-weighted prediction.

    Parameters
    ----------
    k : int
        Number of neighbours consulted for each prediction (must be >= 1).
    distance_metric : str
        Either 'euclidean' or 'manhattan'. Any other value raises ValueError
        the first time a distance is computed.
    """

    def __init__(self, k, distance_metric):
        # k < 1 would select no neighbours and only fail (or yield NaN) deep
        # inside prediction, so reject it up front.
        if k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store the training samples and labels; no model parameters are learned.

        Raises
        ------
        ValueError
            If X and y do not contain the same number of rows.
        """
        X = np.array(X, dtype=float)  # Ensure float type
        y = np.array(y)
        if len(X) != len(y):
            raise ValueError("X and y must have the same length")
        self.X_train = X
        self.y_train = y

    def predictTrain(self, X):
        """Return the majority label among the k nearest neighbours of each row of X.

        When several labels are equally frequent, the smallest one wins
        (np.unique returns sorted labels and np.argmax picks the first maximum).
        """
        X = np.array(X, dtype=float)  # Ensure float type
        predictions = []
        for x in X:
            k_indices, _ = self._k_nearest(x)
            unique_labels, counts = np.unique(self.y_train[k_indices], return_counts=True)
            predictions.append(unique_labels[np.argmax(counts)])

        return np.array(predictions)

    def predictTest(self, X):
        """Return the distance-weighted mean neighbour label for each sample in X.

        X may be any array-like; a 1-D input is treated as a single sample.
        With 0/1 labels the result can be read as a score for class 1.
        """
        # Coerce first so lists / DataFrames work the same way as in predictTrain.
        X = np.array(X, dtype=float)
        if X.ndim == 1:
            X = X.reshape(1, -1)
        return np.array([self._predictTest(x) for x in X])

    def _predictTest(self, x):
        """Distance-weighted average of the k nearest labels for one sample."""
        k_indices, k_nearest_distances = self._k_nearest(x)
        k_nearest_labels = self.y_train[k_indices]
        # Weighted voting: weight inversely proportional to distance; the small
        # constant keeps the weight finite for an exact match (distance 0).
        weights = 1 / (k_nearest_distances + 1e-5)
        return np.sum(weights * k_nearest_labels) / np.sum(weights)

    def _k_nearest(self, x):
        """Return (indices, distances) of the k training rows closest to x."""
        distances = self.compute_distance(self.X_train, x)
        k_indices = np.argsort(distances)[:self.k]
        return k_indices, distances[k_indices]

    def compute_distance(self, X1, X2):
        """Row-wise distance between each row of X1 and X2 (broadcast).

        Raises
        ------
        ValueError
            If the configured distance metric is not supported.
        """
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError("Unsupported distance metric")

In [151]:
def _impute_with_train_median(train_data, test_data):
    """Fill missing numeric values in both frames (in place) with the training median."""
    for column in train_data.columns:
        # A median only exists for numeric data; text columns are left untouched.
        if not pd.api.types.is_numeric_dtype(train_data[column]):
            continue
        in_test = column in test_data.columns  # train-only columns are skipped for test
        has_missing = train_data[column].isnull().any() or (
            in_test and test_data[column].isnull().any()
        )
        if not has_missing:
            continue
        median_value = train_data[column].median()
        # Assign the result back: fillna(inplace=True) on a column selection is
        # chained assignment and does not reliably update the parent frame.
        train_data[column] = train_data[column].fillna(median_value)
        if in_test:
            test_data[column] = test_data[column].fillna(median_value)


def _credit_score_bucket(credit_score):
    """Map a credit score to 'Low' / 'Medium' / 'High' / 'Very High'."""
    if credit_score <= 500:
        return 'Low'
    elif 500 < credit_score <= 700:
        return 'Medium'
    elif 700 < credit_score <= 850:
        return 'High'
    else:
        return 'Very High'


def _age_group(age):
    """Map an age to one of four coarse age bands."""
    if age < 30:
        return 'Under 30'
    elif 30 <= age < 40:
        return '30-40'
    elif 40 <= age < 50:
        return '40-50'
    else:
        return 'Over 50'


def _engineer_features(frame, drop_first):
    """Return a copy of ``frame`` with encoded and derived feature columns added."""
    frame = frame.copy()
    eps = 1e-5  # Avoid division by zero

    # Map Gender
    frame['Gender'] = frame['Gender'].map({'Male': 1, 'Female': 0})

    # Ratio / interaction features
    frame['Age_Tenure_Ratio'] = frame['Age'] / (frame['Tenure'] + eps)
    frame['Balance_Salary_Ratio'] = frame['Balance'] / (frame['EstimatedSalary'] + eps)
    frame['CreditScore_Tenure'] = frame['CreditScore'] * frame['Tenure']
    # eps is applied to the salary as well so a zero salary cannot produce inf/NaN.
    frame['Income_Stability'] = frame['Balance'] / (
        (frame['EstimatedSalary'] + eps) * (frame['Tenure'] + eps)
    )

    # Bucketed features
    frame['CreditScore_Bucket'] = frame['CreditScore'].apply(_credit_score_bucket)
    frame['BalanceGroup'] = pd.cut(
        frame['Balance'],
        bins=[-1, 20000, 50000, 100000, 150000, 100000000],
        labels=[0, 1, 2, 3, 4],
    )

    # Age group and number-of-products interaction
    frame['Age_Group'] = frame['Age'].apply(_age_group)
    frame['AgeGroup_NumOfProducts_Interaction'] = (
        frame['Age_Group'] + '_' + frame['NumOfProducts'].astype(str)
    )

    # One-hot encode every categorical feature in a single pass
    return pd.get_dummies(
        frame,
        columns=[
            'Geography',
            'BalanceGroup',
            'CreditScore_Bucket',
            'AgeGroup_NumOfProducts_Interaction',
        ],
        drop_first=drop_first,
    )


def preprocess_data(train_path, test_path, feature_columns=None):
    """Load the train/test CSV files and return standardised feature matrices.

    Parameters
    ----------
    train_path, test_path : str
        Paths of the CSV files. Both must contain the 'id', 'CustomerId' and
        'Surname' columns plus the raw feature columns; the training file must
        also contain the 'Exited' target.
    feature_columns : sequence of str, optional
        Columns kept in the returned matrices. Defaults to the eight numeric
        features listed below. Engineered one-hot columns can be requested by
        name; every requested column must be convertible to float.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train_scaled, y_train, X_test_scaled)``. Both feature matrices are
        standardised with the training mean and standard deviation.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Handle missing values by imputation (training median for numerical columns)
    _impute_with_train_median(train_data, test_data)

    # Drop identifier columns and separate the target
    id_columns = ['id', 'CustomerId', 'Surname']
    y_train = train_data['Exited']
    X_train = _engineer_features(
        train_data.drop(columns=id_columns + ['Exited']), drop_first=True
    )
    # The test set keeps every dummy column and is then aligned to the training
    # columns, so the dropped baseline category is always the one chosen on the
    # training data. Columns missing from the test set are filled with 0.
    X_test = _engineer_features(test_data.drop(columns=id_columns), drop_first=False)
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    # Select the features used by the model. The engineered one-hot columns are
    # NOT part of the default selection; pass feature_columns to include them.
    if feature_columns is None:
        feature_columns = [
            'CreditScore',
            'Gender',
            'Age',
            'Tenure',
            'NumOfProducts',
            'Age_Tenure_Ratio',
            'Balance_Salary_Ratio',
            'CreditScore_Tenure',
        ]
    feature_columns = list(feature_columns)

    X_train = X_train[feature_columns].astype(float)
    X_test = X_test[feature_columns].astype(float)

    # Standard scaling with training statistics only. A constant column has
    # std 0 (NaN for a single row); dividing by 1 instead keeps it finite.
    train_mean = X_train.mean()
    train_std = X_train.std().replace(0, 1.0).fillna(1.0)
    X_train_scaled = (X_train - train_mean) / train_std
    X_test_scaled = (X_test - train_mean) / train_std

    return X_train_scaled.values, y_train.values, X_test_scaled.values


In [161]:
def cross_validate(X, y, knn, n_splits=5, random_state=None):
    """Perform k-fold cross-validation and return the per-fold accuracy.

    Parameters
    ----------
    X, y : array-like
        Samples and labels; they must have the same length.
    knn : object
        Model exposing ``fit(X, y)`` and ``predictTrain(X)``. It is refitted on
        every fold, so it is left fitted on the last training split.
    n_splits : int, default 5
        Number of folds. The last fold absorbs the ``len(X) % n_splits``
        leftover samples.
    random_state : int, optional
        Seed for the shuffle. When omitted, the global NumPy random state is
        used (so ``np.random.seed`` still controls the split).

    Returns
    -------
    numpy.ndarray
        Accuracy on the held-out part of each fold, shape ``(n_splits,)``.

    Raises
    ------
    ValueError
        If X and y differ in length, or n_splits is not in ``[2, len(X)]``.
    """
    # Convert X and y to NumPy arrays if they are not already
    X = np.array(X)
    y = np.array(y)

    # Check if X and y have the same length
    if len(X) != len(y):
        raise ValueError("X and y must have the same length")

    n_samples = len(X)
    # Fewer than 2 folds leaves no training data; more folds than samples
    # produces empty validation sets (and NaN scores).
    if n_splits < 2 or n_splits > n_samples:
        raise ValueError("n_splits must be between 2 and the number of samples")

    fold_size = n_samples // n_splits
    indices = np.arange(n_samples)
    if random_state is None:
        np.random.shuffle(indices)  # Shuffle the indices
    else:
        np.random.RandomState(random_state).shuffle(indices)
    scores = []

    for fold in range(n_splits):
        start = fold * fold_size
        # The last fold runs to the end so the remainder samples are validated.
        stop = start + fold_size if fold < n_splits - 1 else n_samples

        # Use the same boundaries for both sets so they can never overlap.
        val_indices = indices[start:stop]
        train_indices = np.concatenate([indices[:start], indices[stop:]])

        X_train, X_val = X[train_indices], X[val_indices]
        y_train, y_val = y[train_indices], y[val_indices]

        # Fit the model and predict
        knn.fit(X_train, y_train)
        y_val_pred = knn.predictTrain(X_val)

        # Accuracy on the held-out fold
        scores.append(np.mean(y_val_pred == y_val))

    return np.array(scores)

In [163]:
# Load and preprocess data (paths are relative to the working directory)
# NOTE(review): preprocess_data selects 8 feature columns by default, so the
# shapes printed here should end in 8 on a fresh run — confirm any saved
# output showing a different width is not stale.
X, y, X_test = preprocess_data('train.csv', 'test.csv')
print(X.shape, y.shape, X_test.shape)

# Disabled experiment: choose k by mean cross-validated accuracy.
# best_k = None
# best_score = 0
# for k in range(7, 12):  # Test k values from 7 to 11
#     knn = KNN(k=k, distance_metric='euclidean')
#     cv_scores = cross_validate(X, y, knn)
#     mean_score = np.mean(cv_scores)
#     print(k, mean_score)


#     if mean_score > best_score:
#         best_score = mean_score
#         best_k = k

# print(best_k, best_score)


# Disabled experiment: evaluate a fixed k and write a submission file.
# knn = KNN(k=11, distance_metric='euclidean')
# cv_scores = cross_validate(X, y, knn)

# # Save test predictions
# NOTE(review): KNN defines predictTrain / predictTest but no predict(); the
# call below must be switched to one of those before this block is re-enabled.
# test_predictions = knn.predict(X_test)
# pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

(15000, 10) (15000,) (10000, 10)
