In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score


In [2]:
def extract_X_Y(df):
    """
    Split a DataFrame into its feature columns and its target column.

    Parameters:
    - df (pd.DataFrame): Frame containing at least the columns 'x', 'y'
      and 'result'

    Returns:
    - tuple: (features, target) where features is the two-column frame
      df[['x', 'y']] and target is the column df['result']
    """
    feature_columns = ['x', 'y']
    target_column = 'result'
    return df[feature_columns], df[target_column]


In [6]:
def retrain_classifier(data_path, classifier_type='knn', k=15,
                       test_size=0.33, random_state=42):
    """
    Train a KNN or baseline (dummy) classifier on a stratified split and
    score it on the held-out part.

    The CSV at ``data_path`` is loaded, separated into features and target
    with ``extract_X_Y``, and split into train/test sets with
    ``stratify=Y`` so both sets keep the class proportions of the full data.

    Parameters:
    - data_path (str): Path to the CSV file
    - classifier_type (str): 'knn' or 'dummy'
    - k (int): Number of neighbors if KNN is used (ignored for 'dummy')
    - test_size (float): Passed to train_test_split as the held-out share
      (default 0.33)
    - random_state (int): Seed passed to train_test_split so the split is
      reproducible (default 42)

    Returns:
    - dict: test-set scores under the keys "accuracy", "recall", "precision"

    Raises:
    - ValueError: if classifier_type is neither 'knn' nor 'dummy'
    """
    df = pd.read_csv(data_path)
    X, Y = extract_X_Y(df)

    # Stratified split
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, stratify=Y, random_state=random_state
    )

    # Choose classifier
    if classifier_type == 'dummy':
        # Baseline: always predicts the most frequent training class.
        clf = DummyClassifier(strategy='most_frequent')
    elif classifier_type == 'knn':
        clf = KNeighborsClassifier(n_neighbors=k)
    else:
        raise ValueError("classifier_type must be 'knn' or 'dummy'")

    # Train and evaluate
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # When a classifier never predicts the positive class (e.g. the
    # most-frequent baseline on a majority-negative dataset) precision is
    # undefined. scikit-learn then reports 0.0 and emits an
    # UndefinedMetricWarning; zero_division=0 keeps the same 0.0 value
    # while making the choice explicit and silencing the warning.
    return {
        "accuracy": accuracy_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "precision": precision_score(y_test, y_pred, zero_division=0)
    }


In [7]:
# CSV files to evaluate, in the order their results are printed.
datasets = ["first.csv", "second.csv", "third.csv", "fourth.csv"]

# (printed label, classifier_type) pairs: the KNN model first, then the
# most-frequent baseline it is compared against.
classifier_runs = (("KNN:", "knn"), ("Dummy:", "dummy"))

for file in datasets:
    print(f"=== Results for {file} ===")
    for label, classifier_name in classifier_runs:
        scores = retrain_classifier(file, classifier_type=classifier_name)
        print(label, scores)
    print()


=== Results for first.csv ===
KNN: {'accuracy': 0.9090909090909091, 'recall': 0.8982035928143712, 'precision': 0.9202453987730062}
Dummy: {'accuracy': 0.5060606060606061, 'recall': 1.0, 'precision': 0.5060606060606061}

=== Results for second.csv ===
KNN: {'accuracy': 0.9121212121212121, 'recall': 0.926829268292683, 'precision': 0.8994082840236687}
Dummy: {'accuracy': 0.503030303030303, 'recall': 0.0, 'precision': 0.0}

=== Results for third.csv ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KNN: {'accuracy': 0.9606060606060606, 'recall': 0.8484848484848485, 'precision': 0.9491525423728814}
Dummy: {'accuracy': 0.8, 'recall': 0.0, 'precision': 0.0}

=== Results for fourth.csv ===
KNN: {'accuracy': 0.9606060606060606, 'recall': 0.9135802469135802, 'precision': 0.925}
Dummy: {'accuracy': 0.7545454545454545, 'recall': 0.0, 'precision': 0.0}



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
