# Code for testing whether over sampling minority class with SMOTE helps

## Auth

In [1]:
from dotenv import load_dotenv

load_dotenv()

from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()

## Download Data

In [None]:
!kaggle competitions download -c playground-series-s3e22
!unzip -o playground-series-s3e22.zip
!kaggle datasets download yasserh/horse-survival-dataset
!unzip -o horse-survival-dataset.zip
!rm -rf playground-series-s3e22.zip horse-survival-dataset.zip

## Globals

In [4]:
_TRAIN_FILE = "train.csv"
_TEST_FILE = "test.csv"
_ORIGINAL_FILE = "horse.csv"
_SAMPLE_SUBMISSION_FILE = "sample_submission.csv"

_TOP_FEATURES = [
    "rectal_temp",
    "pulse",
    "respiratory_rate",
    "nasogastric_reflux_ph",
    "packed_cell_volume",
    "total_protein",
    "abdomo_protein",
    "lesion_1",
    "surgery_no",
    "surgery_yes",
    "temp_of_extremities_cool",
    "peripheral_pulse_reduced",
    "capillary_refill_time_more_3_sec",
    "pain_depressed",
    "pain_mild_pain",
    "pain_severe_pain",
    "peristalsis_absent",
    "abdominal_distention_moderate",
    "nasogastric_reflux_more_1_liter",
    "rectal_exam_feces_absent",
    "abdomen_distend_large",
    "abdomo_appearance_serosanguious",
    "surgical_lesion_no",
    "surgical_lesion_yes",
    "cp_data_no",
    "mucous_membrane_normal_pink",
    "abdomo_appearance_cloudy",
    "capillary_refill_time_less_3_sec",
    "peripheral_pulse_normal",
    "nasogastric_tube_slight",
    "mucous_membrane_pale_pink",
    "pain_extreme_pain",
    "mucous_membrane_pale_cyanotic",
    "abdomen_distend_small",
    "cp_data_yes",
    "abdominal_distention_slight",
    "temp_of_extremities_normal",
    "mucous_membrane_bright_red",
    "abdominal_distention_severe",
    "abdomo_appearance_clear",
    "rectal_exam_feces_decreased",
    "peristalsis_hypomotile",
    "age_young",
    "nasogastric_reflux_less_1_liter",
    "rectal_exam_feces_normal",
    "temp_of_extremities_cold",
    "abdomen_firm",
    "pain_alert",
    "nasogastric_tube_significant",
    "mucous_membrane_dark_cyanotic",
    "peristalsis_normal",
    "abdomen_normal",
    "mucous_membrane_bright_pink",
    "age_adult",
    "peripheral_pulse_absent",
    "rectal_exam_feces_increased",
]

_TARGET = "outcome"

_SEED = 42

## Data Preprocessing

In [5]:
from sklearn.cluster import KMeans


class KMeansClusterer:
    def __init__(self):
        self.kmeans = None
        self.X_columns = None

    def fit(self, X_train, y_train, n_clusters=8):
        # Initialize X_columns
        self.X_columns = X_train.columns

        # Initialize the KNN model
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=_SEED)

        # Train the KNN model
        self.kmeans.fit(X_train, y_train)

    def get_cluster_numbers(self, X):
        if not self.kmeans:
            raise Exception("KMeansClusterer not initialised!")

        knn_cluster_numbers = self.kmeans.predict(X)
        return knn_cluster_numbers

In [6]:
from sklearn.impute import SimpleImputer, KNNImputer
import pandas as pd


def impute(
    df: pd.DataFrame,
    strategy: str = "mean",
    fill_value=None,
    n_neighbors=5,
    columns=None,
) -> pd.DataFrame:
    """
    Generalized function to handle various imputation strategies.

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        strategy (str): Imputation strategy - "mean", "median", "most_frequent", "constant", or "knn".
        fill_value: Value to use for constant imputation (if strategy="constant").
        n_neighbors (int): Number of neighbors to use for KNN Imputation (if strategy="knn").
        columns (list, optional): Specific columns to impute. If None, applies to all columns.

    Returns:
        pd.DataFrame: DataFrame with imputed values.
    """
    if strategy == "knn":
        imputer = KNNImputer(n_neighbors=n_neighbors)
    elif strategy == "constant":
        imputer = SimpleImputer(strategy="constant", fill_value=fill_value)
    else:
        imputer = SimpleImputer(strategy=strategy)

    # Select columns for imputation
    target_columns = columns or df.columns
    df[target_columns] = pd.DataFrame(
        imputer.fit_transform(df[target_columns]),
        columns=target_columns,
        index=df.index,
    )
    return df


# Specific functions for convenience
def mean_imputation(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    return impute(df, strategy="mean", columns=columns)


def median_imputation(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    return impute(df, strategy="median", columns=columns)


def most_frequent_imputation(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    return impute(df, strategy="most_frequent", columns=columns)


def constant_imputation(df: pd.DataFrame, fill_value=0, columns=None) -> pd.DataFrame:
    return impute(df, strategy="constant", fill_value=fill_value, columns=columns)


def knn_imputation(df: pd.DataFrame, n_neighbors=5, columns=None) -> pd.DataFrame:
    return impute(df, strategy="knn", n_neighbors=n_neighbors, columns=columns)

In [121]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler


def preprocess_data(
    df: pd.DataFrame,
    kmeans_clusterer: KMeansClusterer,
    train=True,
    standard_scaler: StandardScaler = None,
):
    # Separate features and target
    if train:
        cols_to_drop = ["outcome", "id"]
    else:
        cols_to_drop = ["id"]

    X = df.drop(columns=cols_to_drop)

    if "outcome" in df.columns:
        y = df["outcome"]
    else:
        y = None

    # One hot encoding
    X = pd.get_dummies(X, drop_first=False)

    # Only use top features
    X = X[_TOP_FEATURES]

    # Imputation for NA values
    X = mean_imputation(X)

    # Oversampling minority class
    if train:
        smote = SMOTE(random_state=_SEED)
        X, y = smote.fit_resample(X, y)

    # Generate new features
    if train:
        kmeans_clusterer.fit(X, y, n_clusters=6) # best is 6 for this combination

    cluster_numbers = kmeans_clusterer.get_cluster_numbers(X)
    X["kmeans_cluster_numbers"] = cluster_numbers

    # Format for test
    # if not train:
    #     # Reindex test columns to include all categorical features encoded during training
    #     X = X.reindex(columns=kmeans_clusterer.X_columns, fill_value=0)
    
    if standard_scaler:
        if train:
            standard_scaler.fit(X)
            
        X_scaled = standard_scaler.transform(X)
        X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

    return X, y

## Experiment

In [108]:
import pandas as pd


train_df = pd.read_csv(_TRAIN_FILE)
test_df = pd.read_csv(_TEST_FILE)
original_df = pd.read_csv(_ORIGINAL_FILE)

train_df = pd.concat((train_df, original_df), axis=0)

In [109]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score


def run(
    df: pd.DataFrame,
    experiment_name: str,
    kmeans_clusterer: KMeansClusterer,
    standard_scaler: StandardScaler = None,
):
    print(f"========== {experiment_name} ==========")

    X, y = preprocess_data(
        df,
        kmeans_clusterer=kmeans_clusterer,
        standard_scaler=standard_scaler,
    )

    forest = RandomForestClassifier(random_state=_SEED)

    k_folds = KFold(n_splits=20)

    scores = cross_val_score(forest, X, y, cv=k_folds, scoring="f1_micro")

    print("F1 Score (Micro-Averaged):", scores.mean())

    return scores.mean()

In [110]:
results = []

In [113]:
from sklearn.preprocessing import StandardScaler

standard_scaler = StandardScaler()
kmeans_clusterer = KMeansClusterer()

score = run(
    train_df,
    "Standard Scaler",
    standard_scaler=standard_scaler,
    kmeans_clusterer=kmeans_clusterer,
)

results.append({"method": "Standard Scaler", "f1 score (micro-averaged)": score})

F1 Score (Micro-Averaged): 0.8277931415929203


In [114]:
from sklearn.preprocessing import StandardScaler

standard_scaler = StandardScaler()
kmeans_clusterer = KMeansClusterer()

score = run(
    train_df,
    "No Standard Scaler",
    standard_scaler=None,
    kmeans_clusterer=kmeans_clusterer,
)
results.append({"method": "No Standard Scaler", "f1 score (micro-averaged)": score})

F1 Score (Micro-Averaged): 0.827350663716814


In [115]:
pd.DataFrame(results).sort_values(by="f1 score (micro-averaged)", ascending=False)

Unnamed: 0,method,f1 score (micro-averaged)
0,Standard Scaler,0.827793
1,No Standard Scaler,0.827351


## Submit

In [None]:
import pandas as pd

standard_scaler = StandardScaler()
kmeans_clusterer = KMeansClusterer()


X, y = preprocess_data(train_df, train=True, kmeans_clusterer=kmeans_clusterer, standard_scaler=standard_scaler)
forest = RandomForestClassifier(random_state=_SEED)
forest.fit(X, y)

X_submit, _ = preprocess_data(
    test_df,
    train=False,
    kmeans_clusterer=kmeans_clusterer,
    standard_scaler=standard_scaler,
)
X_submit = X_submit.reindex(columns=X.columns, fill_value=0)

y_pred_submit = forest.predict(X_submit)

save_df = pd.DataFrame({"id": test_df["id"], "outcome": y_pred_submit})
save_df.to_csv("submission.csv", index=False, header=True)

In [None]:
!kaggle competitions submit -c playground-series-s3e22 -f submission.csv -m "top features + feature_scaling + k_cluster_feature + mean_imputation + smote + random forest + original data"

100%|██████████████████████████████████████| 9.21k/9.21k [00:00<00:00, 11.6kB/s]
Successfully submitted to Predict Health Outcomes of Horses