# Code for testing whether over sampling minority class with SMOTE helps

## Auth

In [1]:
from dotenv import load_dotenv

load_dotenv()

from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()

## Download Data

In [None]:
!kaggle competitions download -c playground-series-s3e22
!unzip -o playground-series-s3e22.zip
!kaggle datasets download yasserh/horse-survival-dataset
!unzip -o horse-survival-dataset.zip
!rm -rf playground-series-s3e22.zip horse-survival-dataset.zip

## Globals

In [2]:
_TRAIN_FILE = "train.csv"
_TEST_FILE = "test.csv"
_ORIGINAL_FILE = "horse.csv"
_SAMPLE_SUBMISSION_FILE = "sample_submission.csv"

_TOP_FEATURES = [
    "rectal_temp",
    "pulse",
    "respiratory_rate",
    "nasogastric_reflux_ph",
    "packed_cell_volume",
    "total_protein",
    "abdomo_protein",
    "lesion_1",
    "surgery_no",
    "surgery_yes",
    "temp_of_extremities_cool",
    "peripheral_pulse_reduced",
    "capillary_refill_time_more_3_sec",
    "pain_depressed",
    "pain_mild_pain",
    "pain_severe_pain",
    "peristalsis_absent",
    "abdominal_distention_moderate",
    "nasogastric_reflux_more_1_liter",
    "rectal_exam_feces_absent",
    "abdomen_distend_large",
    "abdomo_appearance_serosanguious",
    "surgical_lesion_no",
    "surgical_lesion_yes",
    "cp_data_no",
    "mucous_membrane_normal_pink",
    "abdomo_appearance_cloudy",
    "capillary_refill_time_less_3_sec",
    "peripheral_pulse_normal",
    "nasogastric_tube_slight",
    "mucous_membrane_pale_pink",
    "pain_extreme_pain",
    "mucous_membrane_pale_cyanotic",
    "abdomen_distend_small",
    "cp_data_yes",
    "abdominal_distention_slight",
    "temp_of_extremities_normal",
    "mucous_membrane_bright_red",
    "abdominal_distention_severe",
    "abdomo_appearance_clear",
    "rectal_exam_feces_decreased",
    "peristalsis_hypomotile",
    "age_young",
    "nasogastric_reflux_less_1_liter",
    "rectal_exam_feces_normal",
    "temp_of_extremities_cold",
    "abdomen_firm",
    "pain_alert",
    "nasogastric_tube_significant",
    "mucous_membrane_dark_cyanotic",
    "peristalsis_normal",
    "abdomen_normal",
    "mucous_membrane_bright_pink",
    "age_adult",
    "peripheral_pulse_absent",
    "rectal_exam_feces_increased",
]

_TARGET = "outcome"

_SEED = 42

## Data Preprocessing

In [3]:
from sklearn.cluster import KMeans


class KMeansClusterer:
    def __init__(self):
        self.kmeans = None
        self.X_columns = None

    def fit(self, X_train, y_train, n_clusters=8):
        # Initialize X_columns
        self.X_columns = X_train.columns

        # Initialize the KNN model
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=_SEED)

        # Train the KNN model
        self.kmeans.fit(X_train, y_train)

    def get_cluster_numbers(self, X):
        if not self.kmeans:
            raise Exception("KMeansClusterer not initialised!")

        knn_cluster_numbers = self.kmeans.predict(X)
        return knn_cluster_numbers

In [4]:
from sklearn.impute import SimpleImputer, KNNImputer
import pandas as pd


def impute(
    df: pd.DataFrame,
    strategy: str = "mean",
    fill_value=None,
    n_neighbors=5,
    columns=None,
) -> pd.DataFrame:
    """
    Generalized function to handle various imputation strategies.

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        strategy (str): Imputation strategy - "mean", "median", "most_frequent", "constant", or "knn".
        fill_value: Value to use for constant imputation (if strategy="constant").
        n_neighbors (int): Number of neighbors to use for KNN Imputation (if strategy="knn").
        columns (list, optional): Specific columns to impute. If None, applies to all columns.

    Returns:
        pd.DataFrame: DataFrame with imputed values.
    """
    if strategy == "knn":
        imputer = KNNImputer(n_neighbors=n_neighbors)
    elif strategy == "constant":
        imputer = SimpleImputer(strategy="constant", fill_value=fill_value)
    else:
        imputer = SimpleImputer(strategy=strategy)

    # Select columns for imputation
    target_columns = columns or df.columns
    df[target_columns] = pd.DataFrame(
        imputer.fit_transform(df[target_columns]),
        columns=target_columns,
        index=df.index,
    )
    return df


# Specific functions for convenience
def mean_imputation(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    return impute(df, strategy="mean", columns=columns)


def median_imputation(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    return impute(df, strategy="median", columns=columns)


def most_frequent_imputation(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    return impute(df, strategy="most_frequent", columns=columns)


def constant_imputation(df: pd.DataFrame, fill_value=0, columns=None) -> pd.DataFrame:
    return impute(df, strategy="constant", fill_value=fill_value, columns=columns)


def knn_imputation(df: pd.DataFrame, n_neighbors=5, columns=None) -> pd.DataFrame:
    return impute(df, strategy="knn", n_neighbors=n_neighbors, columns=columns)

In [11]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, LabelEncoder


def preprocess_data(
    df: pd.DataFrame,
    kmeans_clusterer: KMeansClusterer,
    train=True,
    standard_scaler: StandardScaler = None,
    label_encoder: LabelEncoder = None
):
    # Separate features and target
    if train:
        cols_to_drop = ["outcome", "id"]
    else:
        cols_to_drop = ["id"]

    X = df.drop(columns=cols_to_drop)

    if "outcome" in df.columns:
        y = df["outcome"]

        if label_encoder:
            y_encoded = label_encoder.fit_transform(y)
            y = pd.Series(y_encoded, index=y.index)
    else:
        y = None

    # One hot encoding
    X = pd.get_dummies(X, drop_first=False)

    # Only use top features
    X = X[_TOP_FEATURES]

    # Imputation for NA values
    X = mean_imputation(X)

    # Oversampling minority class
    if train:
        smote = SMOTE(random_state=_SEED)
        X, y = smote.fit_resample(X, y)

    # Generate new features
    if train:
        kmeans_clusterer.fit(X, y, n_clusters=6) # best is 6 for this combination

    cluster_numbers = kmeans_clusterer.get_cluster_numbers(X)
    X["kmeans_cluster_numbers"] = cluster_numbers


    if standard_scaler:
        if train:
            standard_scaler.fit(X)
            
        X_scaled = standard_scaler.transform(X)
        X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

    return X, y

## Experiment

In [6]:
import pandas as pd


train_df = pd.read_csv(_TRAIN_FILE)
test_df = pd.read_csv(_TEST_FILE)
original_df = pd.read_csv(_ORIGINAL_FILE)

train_df = pd.concat((train_df, original_df), axis=0)

In [8]:
from sklearn.model_selection import KFold, cross_val_score


def run(
    df: pd.DataFrame,
    experiment_name: str,
    model,
    kmeans_clusterer: KMeansClusterer,
    standard_scaler: StandardScaler = None,
    label_encoder: LabelEncoder = None,
):
    print(f"========== {experiment_name} ==========")

    X, y = preprocess_data(
        df,
        kmeans_clusterer=kmeans_clusterer,
        standard_scaler=standard_scaler,
        label_encoder=label_encoder,
    )

    k_folds = KFold(n_splits=20)

    scores = cross_val_score(model, X, y, cv=k_folds, scoring="f1_micro")

    print("F1 Score (Micro-Averaged):", scores.mean())

    return scores.mean()

In [13]:
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier


def generate_base_models():
    base_models = [
        ("random_forest", RandomForestClassifier(random_state=_SEED)),
        ("knn", KNeighborsClassifier(n_neighbors=5)),
        ("logistic", LogisticRegression(random_state=_SEED)),
        ("svm", SVC(probability=True, random_state=_SEED)),  # SVM with probability output
        ("naive_bayes", GaussianNB()),
        ("xgboost",XGBClassifier(random_state=_SEED, use_label_encoder=False, eval_metric="logloss")),
        ("lightgbm", LGBMClassifier(random_state=_SEED)),
        ("decision_tree", DecisionTreeClassifier(random_state=_SEED)),
        ("extra_trees", ExtraTreesClassifier(random_state=_SEED)),
        ("neural_net", MLPClassifier(random_state=_SEED, max_iter=500)),
    ]

    return base_models


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder


results = []

for model_name, model in generate_base_models():
    standard_scaler = StandardScaler()
    kmeans_clusterer = KMeansClusterer()
    label_encoder = LabelEncoder()

    score = run(
        train_df,
        model_name,
        model=model,
        standard_scaler=standard_scaler,
        kmeans_clusterer=kmeans_clusterer,
        label_encoder=label_encoder,
    )

    results.append({"model": model_name, "f1 score (micro-averaged)": score})

In [None]:
from sklearn.ensemble import StackingClassifier

standard_scaler = StandardScaler()
kmeans_clusterer = KMeansClusterer()

# Define the meta-model (e.g., Logistic Regression or XGBoost)
meta_model = LogisticRegression()

# Create the StackingClassifier with all base models and the meta-model
stacking_model = StackingClassifier(
    estimators=generate_base_models(),
    final_estimator=meta_model,
    cv=5,
)

score = run(
    train_df,
    "ensemble_all",
    model=stacking_model,
    standard_scaler=standard_scaler,
    kmeans_clusterer=kmeans_clusterer,
)

results.append({"model": "ensemble_all", "f1 score (micro-averaged)": score})

In [None]:
from sklearn.ensemble import StackingClassifier

standard_scaler = StandardScaler()
kmeans_clusterer = KMeansClusterer()

top_3_models = [
    ("xgboost", XGBClassifier(random_state=_SEED, use_label_encoder=False, eval_metric="logloss")),
    ("extra_trees", ExtraTreesClassifier(random_state=_SEED)),
    ("neural_net", MLPClassifier(random_state=_SEED, max_iter=500)),
]

# Define the meta-model (e.g., Logistic Regression or XGBoost)
meta_model = LogisticRegression()

# Create the StackingClassifier with all base models and the meta-model
stacking_model = StackingClassifier(
    estimators=top_3_models,
    final_estimator=meta_model,
    cv=5,
)

score = run(
    train_df,
    "ensemble_top_3",
    model=stacking_model,
    standard_scaler=standard_scaler,
    kmeans_clusterer=kmeans_clusterer,
)

results.append({"model": "ensemble_top_3", "f1 score (micro-averaged)": score})

In [18]:
pd.DataFrame(results).sort_values(by="f1 score (micro-averaged)", ascending=False)

Unnamed: 0,model,f1 score (micro-averaged)
10,ensemble_all,0.855736
11,ensemble_top_3,0.851304
9,neural_net,0.835817
8,extra_trees,0.834012
5,xgboost,0.828722
0,random_forest,0.827793
6,lightgbm,0.827394
3,svm,0.806937
1,knn,0.740558
2,logistic,0.719062


## Submit

In [None]:
import pandas as pd

standard_scaler = StandardScaler()
kmeans_clusterer = KMeansClusterer()
label_encoder = LabelEncoder()


X, y = preprocess_data(
    train_df,
    train=True,
    kmeans_clusterer=kmeans_clusterer,
    standard_scaler=standard_scaler,
    label_encoder=label_encoder,
)

# Choose model
# Define the meta-model (e.g., Logistic Regression or XGBoost)
meta_model = LogisticRegression()

# Create the StackingClassifier with all base models and the meta-model
model = StackingClassifier(
    estimators=generate_base_models(),
    final_estimator=meta_model,
    cv=5,
)

model.fit(X, y)

X_submit, _ = preprocess_data(
    test_df,
    train=False,
    kmeans_clusterer=kmeans_clusterer,
    standard_scaler=standard_scaler,
    label_encoder=label_encoder,
)
X_submit = X_submit.reindex(columns=X.columns, fill_value=0)

y_pred_submit = model.predict(X_submit)
y_pred_submit = label_encoder.inverse_transform(y_pred_submit)

save_df = pd.DataFrame({"id": test_df["id"], "outcome": y_pred_submit})
save_df.to_csv("submission.csv", index=False, header=True)

In [23]:
!kaggle competitions submit -c playground-series-s3e22 -f submission.csv -m "ensemble_top_3 + top features + feature_scaling + k_cluster_feature + mean_imputation + smote + random forest + original data"

100%|██████████████████████████████████████| 9.15k/9.15k [00:01<00:00, 6.07kB/s]
Successfully submitted to Predict Health Outcomes of Horses