In [18]:
import numpy as np
import pandas as pd

In [20]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        # TODO: Implement the predict method
        predictions = []
        for x in X:
            # Compute the distances
            distances = self.compute_distance(self.X_train, x)
            
            # Get k nearest neighboring nodes
            k_index = np.argsort(distances)[:self.k]
            
            # Obtain the k nearest labels
            k_nearest_labels = self.y_train[k_index]
            
            # Determine the most frequent labels:
            prediction = np.argmax(np.bincount(k_nearest_labels))
        
        return np.array(predictions)

    def compute_distance(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(X1 - X2)**2, axis=1)
        else:
            raise ValueError("Invalid distance metric given")
            

In [21]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.
    for col in ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']:
        train_data[col].fillna(train_data[col].mean(), inplace=True)
        test_data[col].fillna(train_data[col].mean(), inplace=True)
        
    for col in ['Geography', 'Gender']:
        train_data[col].fillna(train_data[col].mode()[0], inplace = True)
        test_data[col].fillna(train_data[col].mode()[0], inplace=True)
    
    # One hot encoding for the Geography col    
    train_data = pd.get_dummies(train_data, columns=['Geography'], drop_first=True)
    test_data = pd.get_dummies(test_data, columns=['Geography'], drop_first=True)

    # Convert 'Gender' to binary
    train_data['Gender'] = train_data['Gender'].apply(lambda x: 1 if x == 'Male' else 0)
    test_data['Gender'] = test_data['Gender'].apply(lambda x: 1 if x == 'Male' else 0)
    
    # Scaling numerical columns
    numerical_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

    for col in numerical_columns:
        min_val = train_data[col].min()
        max_val = train_data[col].max()
        
        train_data[col] = (train_data[col] - min_val) / (max_val - min_val)
        test_data[col] = (test_data[col] - min_val) / (max_val - min_val)
        
    # Drop unwanted columns
    train_data.drop(['CustomerId', 'Surname'], axis=1, inplace=True) 
    test_data.drop(['CustomerId', 'Surname'], axis=1, inplace=True)
     
    X_train = train_data.drop('Exited', axis=1).values # Features
    y_train = train_data['Exited'].values # Labels
    
    X_test = test_data.values
    
    return X_train, y_train, X_test

In [13]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    pass

In [17]:
# Load and preprocess data
X, y, X_test = preprocess_data('/path/of/train.csv', '/path/of/test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning


# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = ...
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('/path/of/test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

TypeError: cross_validate() got an unexpected keyword argument 'k'