In [22]:
import numpy as np
import pandas as pd
from sklearn.calibration import LabelEncoder

In [23]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        prediction_arr = [self._predict(x) for x in X]
        return np.array(prediction_arr)
    
    def _predict(self, x):
        distances = [self.compute_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_label = [self.y_train.iloc[i] for i in k_indices]
        most_frequent = np.bincount(k_nearest_label).argmax()
        return most_frequent
    
    def compute_distance(self, X, Y):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X - Y) ** 2))
        
        else:
            raise ValueError("Incorrect distance metric")
            

In [24]:
# Define data preprocessing function
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.
    concatenated = pd.concat([train_data, test_data], axis=0)
    encoders = {}
    
    for col in concatenated.select_dtypes(include=['object']).columns:
        label_encoder = LabelEncoder()
        concatenated[col] = label_encoder.fit_transform(concatenated[col])
        encoders[col] = label_encoder
    
    l = len(train_data)
    train_data = concatenated.iloc[:l]
    test_data = concatenated.iloc[l:]
    
    # Separet the features and the target
    X_train = train_data.drop(columns=['Exited'])
    y_train = train_data['Exited']
    X_test = test_data.drop(columns=['Exited'])
    
    feature_eng = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    X_train_with_features = feature_eng.fit_transform(X_train)
    X_test_with_features = feature_eng.transform(X_test)
    
    # Standard Scaler to scale features
    scaler = StandardScaler()
    X_train_with_features = scaler.fit_transform(X_train_with_features)
    X_test_with_features = scaler.transform(X_test_with_features)
    
    return X_train_with_features, y_train, X_test_with_features      

In [25]:
# Define cross-validation function
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold


def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    score_list = []
    
    for tr_idx, val_idx in kf.split(X):
        # Separate the data into test & training sets based on their current indices. 
        X_train, X_val = X[tr_idx], X[val_idx]
        y_train, y_val = y.iloc[tr_idx], y.iloc[val_idx]
        
        # Fit the training data to the KNN Model
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)
        
        # Estimate the target values for the validation set
        score = roc_auc_score(y_val, y_pred)
        score_list.append(score)
    return score_list

In [26]:
from imblearn.over_sampling import SMOTE

# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Balance the dataset using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Define a range of k values to search for the best one
k_values = range(1, 5)
best_k = 13
best_score = 0

# Search for the best k value using cross-validation
print("Starting KNN hyperparameter tuning - Finding the best k")

# for k in k_values:
#     knn = KNN(k=k)
#     scores = cross_validate(X_resampled, y_resampled, knn, n_splits=5)
#     avg_score = np.mean(scores)
#     print(f"Average ROC AUC score for k={k}: {avg_score}")
    
#     if avg_score > best_score:
#         best_score = avg_score
#         best_k = k

print(f"Best k value: {best_k}")
print(f"Best cross-validation score: {best_score}")

# Train the final model using the best k value
best_knn = KNN(k=best_k)
best_knn.fit(X_resampled, y_resampled)
test_predictions = best_knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)


Starting KNN hyperparameter tuning - Finding the best k
Best k value: 13
Best cross-validation score: 0


  most_frequent = np.bincount(k_nearest_label).argmax()
