In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [20]:
# Load a CSV dataset and split it into features (X) and target (y).

#     Args:
#         file_path (str): Path to the CSV file containing the raw dataset.
#         target_column (str): The name of the target column (default: "target").

#     Returns:
#         X (DataFrame): Features (every column except the target).
#         y (Series): Target.

In [2]:
def load_data(file_path, target_column="target"):
    """Read a CSV file and separate it into features and target.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.
        target_column: Name of the column holding the labels.

    Returns:
        A ``(X, y)`` tuple: ``X`` is the frame without ``target_column`` and
        ``y`` is that column on its own.

    Raises:
        KeyError: If ``target_column`` is not a column of the loaded frame.
    """
    frame = pd.read_csv(file_path)
    # Drop first so a missing target column is reported by ``drop``.
    features = frame.drop(columns=[target_column])
    return features, frame[target_column]

In [22]:
# Scale features using StandardScaler (fitted on the training set only).
#     Args:
#         X_train (DataFrame): Training features.
#         X_test (DataFrame): Testing features.

#     Returns:
#         X_train_scaled (ndarray): Scaled training features.
#         X_test_scaled (ndarray): Scaled testing features.

In [3]:
def preprocess_data(X_train, X_test):
    """Standardize both splits with a StandardScaler fitted on the training data.

    Args:
        X_train: Training features; the scaler is fitted on these only.
        X_test: Test features; transformed with the already-fitted scaler.

    Returns:
        A ``(X_train_scaled, X_test_scaled)`` tuple holding the scaler's
        output for each split.
    """
    # Fit on the training split only, then apply the same fitted scaler to both.
    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

In [24]:
# Train and evaluate a Logistic Regression model.

In [4]:
def train_logistic_regression(X_train, y_train, X_test, y_test):
    """Fit a default LogisticRegression and report its test-set performance.

    Prints the accuracy and a classification report for the test split.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: True labels for the test features.

    Returns:
        The accuracy score of the fitted model on the test split.
    """
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    predictions = lr.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print("Logistic Regression Accuracy:", accuracy)
    # zero_division=0 matches train_knn: a class that is never predicted is
    # reported as 0 without emitting an UndefinedMetricWarning.
    print(classification_report(y_test, predictions, zero_division=0))
    return accuracy

In [26]:
# Train and evaluate a Decision Tree model.

In [5]:
def train_decision_tree(X_train, y_train, X_test, y_test):
    """Fit a DecisionTreeClassifier and report its test-set performance.

    The tree is built with ``random_state=42``. Prints the accuracy and a
    classification report for the test split.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: True labels for the test features.

    Returns:
        The accuracy score of the fitted model on the test split.
    """
    dt = DecisionTreeClassifier(random_state=42)
    dt.fit(X_train, y_train)
    predictions = dt.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print("Decision Tree Accuracy:", accuracy)
    # zero_division=0 matches train_knn: a class that is never predicted is
    # reported as 0 without emitting an UndefinedMetricWarning.
    print(classification_report(y_test, predictions, zero_division=0))
    return accuracy

In [None]:
# Train and evaluate a K-Nearest Neighbors (KNN) model.
# Grid search with cross-validation (GridSearchCV) selects the best K value and weighting.

In [None]:
def train_knn(X_train, y_train, X_test, y_test):
    """Tune a KNN classifier with grid search and report its test-set performance.

    Runs a 5-fold GridSearchCV over ``n_neighbors`` (1 up to at most 10),
    ``weights`` and a fixed euclidean metric, then evaluates the best
    estimator on the test split. Prints the best parameters, the accuracy
    and a classification report.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: True labels for the test features.

    Returns:
        The accuracy score of the best estimator on the test split.
    """
    n_splits = 5
    max_k = 10
    n_samples = len(X_train)

    # During cross-validation each model is fitted on the training set minus
    # one validation fold, so n_neighbors must not exceed that smaller size
    # (capping by len(X_train) alone lets the large k candidates fail on small
    # datasets). The largest validation fold is taken as ceil(n_samples / n_splits).
    largest_val_fold = -(-n_samples // n_splits)
    k_upper = max(1, min(max_k, n_samples - largest_val_fold))

    param_grid = {
        'n_neighbors': range(1, k_upper + 1),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean']
    }
    # The estimator's own n_neighbors is irrelevant: the grid always overrides it.
    grid_search = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid=param_grid,
        cv=n_splits,
        scoring='accuracy',
        verbose=1,
    )
    grid_search.fit(X_train, y_train)
    best_knn = grid_search.best_estimator_
    print("Best Parameters:", grid_search.best_params_)

    predictions = best_knn.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print("KNN Accuracy:", accuracy)
    print(classification_report(y_test, predictions, zero_division=0))
    return accuracy


In [None]:
# TODO: Use an ensemble to stack the KNN and SVM models.
# TODO: Add precision, recall, and other evaluation checks.

# TODO: Review the assignments, then add more models and further enhancements.