In [65]:
import numpy as np
import pandas as pd

import pandas as pd

# Example for checking columns in the train dataset
train_data = pd.read_csv('/Users/anamuuenishi/Desktop/knn_customer_data/train.csv')
print(train_data.columns)


Index(['id', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender',
       'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')


In [66]:
import numpy as np

class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # Ensure only numeric data is retained
        self.X_train = X.select_dtypes(include=[np.number])
        self.y_train = y

    

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)

    def predict_proba(self, X):
        # Ensure only numeric data is used for prediction
        X = X.select_dtypes(include=[np.number])
        probabilities = []
        for x in X.to_numpy():  # Convert to numpy array for computation
            # Compute distances from x to all points in X_train
            distances = self.compute_distance(x, self.X_train.to_numpy())
            # Get indices of the k nearest neighbors
            k_indices = np.argsort(distances)[:self.k]
            # Retrieve the classes of the nearest neighbors
            k_nearest_labels = self.y_train.iloc[k_indices]
            # Estimate probabilities based on the presence of each class in nearest neighbors
            class_count = np.bincount(k_nearest_labels, minlength=2)
            total = class_count.sum()
            probabilities.append(class_count / total if total > 0 else [0.5, 0.5])
        return np.array(probabilities)



    
 
    

In [67]:
def preprocess_data(train_path, test_path):
    # Load the datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    
    # Confirm the 'Exited' column is in the train data
    if 'Exited' not in train_data.columns:
        raise ValueError("The 'Exited' column is missing from the training data.")

    # Split the training data into features and target
    X = train_data.drop(columns=['Exited'])
    y = train_data['Exited']
    
    # Optionally, handle missing values here
    # For example, fill numeric columns with the mean
    numeric_cols = X.select_dtypes(include=['number']).columns
    for col in numeric_cols:
        mean_val = X[col].mean()
        X[col].fillna(mean_val, inplace=True)
        if col in test_data.columns:
            test_data[col].fillna(mean_val, inplace=True)

    X_test = test_data
    
    return X, y, X_test



In [68]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import numpy as np

def cross_validate(X, y, knn, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    auc_scores = []
    
    for train_index, test_index in kf.split(X):
        # Correct indexing of rows using .iloc for pandas DataFrames
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train the model
        knn.fit(X_train, y_train)
        
        # Predict probabilities for the positive class
        probs = knn.predict_proba(X_test)[:, 1]
        
        # Calculate ROC AUC score
        auc_score = roc_auc_score(y_test, probs)
        auc_scores.append(auc_score)
    
    # Return the average ROC AUC score
    return np.mean(auc_scores)


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold

# Load and preprocess data
X, y, X_test = preprocess_data('/Users/anamuuenishi/Desktop/knn_customer_data/train.csv', '/Users/anamuuenishi/Desktop/knn_customer_data/test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)
print("Cross-validation scores:", cv_scores)

# TODO: Hyperparameter Tuning
# Split the data to have a validation set for hyperparameter tuning
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the best parameters
best_k = 5
best_metric = 'euclidean'
best_score = 0

# Iterate over various values of k and distance metrics to find the best parameters
for k in range(1, 20, 2):  # Testing odd values of k
    for metric in ['euclidean', 'manhattan']:
        knn = KNN(k=k, distance_metric=metric)
        knn.fit(X_train, y_train)
        probs = knn.predict_proba(X_val)[:, 1]  # Assuming predict_proba is implemented
        score = roc_auc_score(y_val, probs)
        print(f'K: {k}, Metric: {metric}, ROC AUC: {score}')
        if score > best_score:
            best_score = score
            best_k = k
            best_metric = metric

print(f"Best K: {best_k}, Best Metric: {best_metric}, Best ROC AUC: {best_score}")

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
test_predictions = knn.predict_proba(X_test)[:, 1]  # Assuming predict_proba is implemented

# Save test predictions
test_ids = pd.read_csv('/Users/anamuuenishi/Desktop/knn_customer_data/test.csv')['id']
submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
submission.to_csv('submissions.csv', index=False)
print("Submission file saved.")



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(mean_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data[col].fillna(mean_val, inplace=True)
