In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cs-506-predicting-customer-churn-using-knn/sample_submission.csv
/kaggle/input/cs-506-predicting-customer-churn-using-knn/train.csv
/kaggle/input/cs-506-predicting-customer-churn-using-knn/test.csv


In [2]:
# Data Preprocessing section

# Read Data
train_df = pd.read_csv('/kaggle/input/cs-506-predicting-customer-churn-using-knn/train.csv')
test_df = pd.read_csv('/kaggle/input/cs-506-predicting-customer-churn-using-knn/test.csv')

# Separate features and target in the training set
X_train = train_df.drop(['CustomerId', 'Surname', 'Exited'], axis=1)
y_train = train_df['Exited']

# Preprocess the training data
# Fill missing values with median
# X_train.fillna(X_train.median(), inplace=True)

# One-hot encode categorical features
X_train = pd.get_dummies(X_train, columns=['Geography', 'Gender'], drop_first=True)

# Apply the same preprocessing steps to the test data
test_ids = test_df['id']
X_test = test_df.drop(['CustomerId', 'Surname'], axis=1)
# X_test.fillna(X_test.median(), inplace=True)
X_test = pd.get_dummies(X_test, columns=['Geography', 'Gender'], drop_first=True)

# Ensure both train and test have the same columns after one-hot encoding
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Now, let's scale the features using NumPy and Pandas instead of StandardScaler
# Calculate the mean and standard deviation for scaling using training data
train_mean = X_train.mean()
train_std = X_train.std()

# Scale the training and test datasets manually
X_train_scaled = (X_train - train_mean) / train_std
X_test_scaled = (X_test - train_mean) / train_std

# Optionally, convert to DataFrame for better handling if needed
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_train.columns)

# Convert to NumPy arrays for further processing
X_train_np = X_train_scaled_df.to_numpy()
y_train_np = y_train.to_numpy()
X_test_np = X_test_scaled_df.to_numpy()

# Display a preview of the scaled datasets
print(X_train_scaled_df.head())
print(X_test_scaled_df.head())

         id  CreditScore       Age    Tenure   Balance  NumOfProducts  \
0 -1.731878     0.170812 -0.578192  1.066915  0.990459      -1.102312   
1 -1.731647     0.060597 -1.069157  0.709002 -0.727682       0.773857   
2 -1.731416     0.708111 -0.700933 -1.438477 -0.727682       0.773857   
3 -1.731185     0.377466  1.753888  1.424828  0.650193      -1.102312   
4 -1.730954     0.225920  0.526477  0.351089  1.440448      -1.102312   

   HasCrCard  IsActiveMember  EstimatedSalary  Geography_Germany  \
0   0.529829        1.022213         1.296669           2.085476   
1  -1.887275       -0.978204         0.976132          -0.479475   
2   0.529829       -0.978204        -1.598851          -0.479475   
3  -1.887275       -0.978204         0.174050          -0.479475   
4   0.529829        1.022213        -1.018246          -0.479475   

   Geography_Spain  Gender_Male  
0        -0.520118     0.870535  
1        -0.520118     0.870535  
2        -0.520118     0.870535  
3        -0.5201

In [3]:
# Model Building section

 # Custom function to calculate Euclidean distance
def euclidean_distance(row1, row2):
    return np.sqrt(np.sum((row1 - row2) ** 2))

# Custom function to get the K nearest neighbors
def get_k_neighbors(X_train, y_train, test_instance, k):
    distances = []
    # Calculate the distance between the test_instance and all training data
    for i in range(len(X_train)):
        dist = euclidean_distance(X_train[i], test_instance)
        distances.append((dist, y_train[i]))  # Store distance and the corresponding label
    # Sort the distances by distance
    distances.sort(key=lambda x: x[0])
    # Select the top k neighbors
    neighbors = distances[:k]
    return [neighbor[1] for neighbor in neighbors]

# Custom function to predict churn probability using KNN
def predict_knn_prob(X_train, y_train, X_test, k):
    probabilities = []
    # For each test instance, find the k nearest neighbors
    for test_instance in X_test:
        neighbors = get_k_neighbors(X_train, y_train, test_instance, k)
        # Estimate probability as the proportion of class 1 (churn) among neighbors
        churn_prob = sum(neighbors) / len(neighbors)
        probabilities.append(churn_prob)
    return np.array(probabilities)


# Predict the churn probabilities for the test set
k = 45  # You can adjust the number of neighbors based on tuning
y_test_pred_prob = predict_knn_prob(X_train_np, y_train_np, X_test_np, k)

# Create submission DataFrame
submission_df_custom = pd.DataFrame({
    'id': test_ids,
    'Exited': y_test_pred_prob
})

# Save the submission file in the correct format
submission_df_custom.to_csv('knn_churn_predictions_custom3.csv', index=False)
print("Prediction results saved to 'knn_churn_predictions_custom3.csv'")


Prediction results saved to 'knn_churn_predictions_custom3.csv'


In [4]:
from sklearn.metrics import roc_auc_score

# Split data for validation (optional for evaluating)
from sklearn.model_selection import train_test_split
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_np, y_train_np, test_size=0.2, random_state=42)

# Predict probabilities on the validation set
n_neighbors_values = range(1, 50)  # Testing for neighbors from 1 to 50
for n_neighbors in n_neighbors_values:
    y_val_pred_prob = predict_knn_prob(X_train_split, y_train_split, X_val_split, n_neighbors)
    roc_auc = roc_auc_score(y_val_split, y_val_pred_prob)
    print(f'Validation ROC AUC: {roc_auc:.4f} with n_neighbors={n_neighbors}.' )
    
# Evaluate using ROC AUC
# roc_auc = roc_auc_score(y_val_split, y_val_pred_prob)
# print(f'Validation ROC AUC: {roc_auc:.4f}')


Validation ROC AUC: 0.7263 with n_neighbors=1.
Validation ROC AUC: 0.7942 with n_neighbors=2.
Validation ROC AUC: 0.8244 with n_neighbors=3.
Validation ROC AUC: 0.8396 with n_neighbors=4.
Validation ROC AUC: 0.8555 with n_neighbors=5.
Validation ROC AUC: 0.8629 with n_neighbors=6.
Validation ROC AUC: 0.8645 with n_neighbors=7.
Validation ROC AUC: 0.8653 with n_neighbors=8.
Validation ROC AUC: 0.8703 with n_neighbors=9.
Validation ROC AUC: 0.8751 with n_neighbors=10.
Validation ROC AUC: 0.8767 with n_neighbors=11.
Validation ROC AUC: 0.8762 with n_neighbors=12.
Validation ROC AUC: 0.8787 with n_neighbors=13.
Validation ROC AUC: 0.8814 with n_neighbors=14.
Validation ROC AUC: 0.8844 with n_neighbors=15.
Validation ROC AUC: 0.8855 with n_neighbors=16.
Validation ROC AUC: 0.8874 with n_neighbors=17.
Validation ROC AUC: 0.8885 with n_neighbors=18.
Validation ROC AUC: 0.8900 with n_neighbors=19.
Validation ROC AUC: 0.8909 with n_neighbors=20.
Validation ROC AUC: 0.8923 with n_neighbors=21.
V