In [82]:
import numpy as np
import pandas as pd

In [83]:
class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples consulted for each query.
    distance_metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Metric used to rank training samples.

    Attributes
    ----------
    X_train : numpy.ndarray or None
        Training features as a 2-D float array; ``None`` until ``fit`` is called.
    y_train : numpy.ndarray or None
        Training labels; ``None`` until ``fit`` is called.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    @staticmethod
    def _as_feature_matrix(X):
        """Coerce ``X`` to a float array with one row per sample."""
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            # A flat vector is treated as samples with a single feature each.
            return X.reshape(-1, 1)
        if X.ndim > 2:
            # Flatten every sample so distances sum over all of its elements.
            return X.reshape(X.shape[0], int(np.prod(X.shape[1:])))
        return X

    def fit(self, X, y):
        """Store the training data (k-NN has no real training step).

        Inputs are converted to numpy arrays so that later positional
        indexing is well defined even for pandas objects with a custom index.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        X = self._as_feature_matrix(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples.")
        self.X_train = X
        self.y_train = y

    def _distances_to_train(self, x):
        """Return the distance from one sample ``x`` to every training row."""
        diff = self.X_train - x
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(diff ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(diff), axis=1)
        else:
            raise ValueError("Unsupported distance metric.")

    def _k_nearest_labels(self, x):
        """Return the labels of the ``k`` training samples closest to ``x``."""
        distances = self._distances_to_train(x)
        # A stable sort makes tie-breaking between equidistant rows deterministic
        # (the earlier training row wins).
        nearest = np.argsort(distances, kind='stable')[:self.k]
        return self.y_train[nearest]

    def _check_ready(self):
        """Raise if the model cannot produce predictions yet."""
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNN must be fitted before calling predict.")
        if self.k < 1:
            raise ValueError("k must be a positive integer.")

    def predict(self, X):
        """Predict a label for each row of ``X`` by majority vote.

        Ties between labels are resolved in favour of the smallest label.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If ``k`` is smaller than 1 or the distance metric is unsupported.
        """
        self._check_ready()
        predictions = []
        for x in self._as_feature_matrix(X):
            labels = self._k_nearest_labels(x)
            # np.unique handles float, negative or string labels, which
            # np.bincount cannot; argmax returns the first (smallest) label on ties.
            values, counts = np.unique(labels, return_counts=True)
            predictions.append(values[np.argmax(counts)])
        return np.array(predictions)

    def predict_proba(self, X):
        """Return the mean label of the ``k`` nearest neighbours per row.

        With 0/1 labels this mean is the fraction of neighbours labelled 1.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If ``k`` is smaller than 1 or the distance metric is unsupported.
        """
        self._check_ready()
        probabilities = [
            np.mean(self._k_nearest_labels(x))
            for x in self._as_feature_matrix(X)
        ]
        return np.array(probabilities)

    def compute_distance(self, X1, X2):
        """Return the scalar distance between two samples under the configured metric.

        Raises
        ------
        ValueError
            If ``distance_metric`` is neither 'euclidean' nor 'manhattan'.
        """
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2))
        else:
            raise ValueError("Unsupported distance metric.")

In [84]:
def preprocess_data(train_data, test_data):
    """Turn the raw train/test frames into numeric feature arrays.

    Every statistic used for a transformation (imputation medians,
    categorical levels, scaling mean and standard deviation) is computed on
    the training frame only and then applied to both frames, so train and
    test rows end up in the same feature space.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain 'Exited', 'id', 'CustomerId', 'Surname', 'Geography',
        'Gender' and the numerical feature columns listed below.
    test_data : pandas.DataFrame
        Same columns as ``train_data`` except 'Exited'.

    Returns
    -------
    X_train : numpy.ndarray
        Float feature matrix for the training rows.
    y_train : numpy.ndarray
        Values of the 'Exited' column.
    test_data : numpy.ndarray
        Float feature matrix for the test rows, with the same column order
        as ``X_train``.

    Raises
    ------
    KeyError
        If a required column is missing from either frame.
    """
    id_columns = ['id', 'CustomerId', 'Surname']
    categorical_features = ['Geography', 'Gender']
    numerical_features = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

    y_train = train_data['Exited']
    X_train = train_data.drop(columns=['Exited'] + id_columns)
    test_data = test_data.drop(columns=id_columns)

    # Fill missing numeric values with the *training* medians in both frames.
    train_medians = X_train.median(numeric_only=True)
    X_train = X_train.fillna(train_medians)
    test_data = test_data.fillna(train_medians)

    # Pin the category levels to those seen in training. Encoding each frame
    # independently with drop_first=True would drop a different level (or
    # lose a column) whenever the test frame lacks a category.
    for column in categorical_features:
        categories = pd.Categorical(X_train[column]).categories
        X_train[column] = pd.Categorical(X_train[column], categories=categories)
        test_data[column] = pd.Categorical(test_data[column], categories=categories)

    X_train = pd.get_dummies(X_train, columns=categorical_features, drop_first=True, dtype=float)
    test_data = pd.get_dummies(test_data, columns=categorical_features, drop_first=True, dtype=float)

    # Give the test frame exactly the training columns, in the same order.
    test_data = test_data.reindex(columns=X_train.columns, fill_value=0)

    # Standardise with the training mean/std; a zero or undefined std is
    # replaced by 1 so constant columns do not produce NaN/inf.
    train_mean = X_train[numerical_features].mean()
    train_std = X_train[numerical_features].std().replace(0, 1).fillna(1)
    X_train[numerical_features] = (X_train[numerical_features] - train_mean) / train_std
    test_data[numerical_features] = (test_data[numerical_features] - train_mean) / train_std

    # Force a homogeneous float array; non-numeric leftovers become NaN.
    X_train = X_train.apply(pd.to_numeric, errors='coerce').to_numpy(dtype=float)
    test_data = test_data.apply(pd.to_numeric, errors='coerce').to_numpy(dtype=float)

    return X_train, y_train.values, test_data

In [85]:
def cross_validate(X, y, knn, n_splits=5):
    """Estimate accuracy with k-fold cross-validation.

    Folds are contiguous slices taken in the existing row order (no
    shuffling). When ``len(X)`` is not a multiple of ``n_splits`` the leading
    folds receive one extra sample, so every sample is validated exactly once.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Labels aligned with ``X``.
    knn : object
        Model exposing ``fit(X, y)`` and ``predict(X)``; it is refitted on
        every fold, so it is left fitted on the last training split.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    float
        Mean validation accuracy across the folds.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, or ``n_splits`` is smaller than 2
        or larger than the number of samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError("X and y must contain the same number of samples.")
    if n_splits < 2 or n_splits > n_samples:
        raise ValueError("n_splits must be between 2 and the number of samples.")

    # array_split distributes the remainder instead of discarding it.
    folds = np.array_split(np.arange(n_samples), n_splits)
    fold_accuracies = []

    for i, val_idx in enumerate(folds):
        train_idx = np.concatenate(folds[:i] + folds[i + 1:])

        # Train on every other fold and predict the held-out one.
        knn.fit(X[train_idx], y[train_idx])
        predictions = knn.predict(X[val_idx])

        fold_accuracies.append(np.mean(predictions == y[val_idx]))

    return np.mean(fold_accuracies)

In [86]:
# Load the datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Preprocess the data (the raw frames are not modified, so test_data keeps its 'id' column)
X, y, X_test = preprocess_data(train_data, test_data)

# Sanity-check the shapes and container types of the preprocessed arrays
print(f"X_train shape: {X.shape}, X_train type: {type(X)}")
print(f"y_train shape: {y.shape}, y_train type: {type(y)}")
print(f"X_test shape: {X_test.shape}, X_test type: {type(X_test)}")

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation; cross_validate returns one number, the mean over the folds
cv_scores = cross_validate(X, y, knn)
print("Cross-validation scores:", cv_scores)

# Train on full dataset and make predictions on test set
knn.fit(X, y)
test_probabilities = knn.predict_proba(X_test)

# Save test predictions to CSV, reusing the ids from the already-loaded test frame
# instead of reading test.csv a second time
submission = pd.DataFrame({'id': test_data['id'], 'Exited': test_probabilities})
submission.to_csv('submissions.csv', index=False)

X_train shape: (15000, 11), X_train type: <class 'numpy.ndarray'>
y_train shape: (15000,), y_train type: <class 'numpy.ndarray'>
X_test shape: (10000, 11), X_test type: <class 'numpy.ndarray'>


  predictions.append(np.bincount(k_nearest_labels).argmax())  # Majority vote


Cross-validation scores: 0.8768666666666667
