# Code for testing whether over sampling minority class with SMOTE helps

## Auth

In [1]:
from dotenv import load_dotenv

load_dotenv()

from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()

## Download Data

In [None]:
!kaggle competitions download -c playground-series-s3e22
!unzip -o playground-series-s3e22.zip
!kaggle datasets download yasserh/horse-survival-dataset
!unzip -o horse-survival-dataset.zip
!rm -rf playground-series-s3e22.zip horse-survival-dataset.zip

## Globals

In [89]:
_TRAIN_FILE = "train.csv"
_TEST_FILE = "test.csv"
_ORIGINAL_FILE = "horse.csv"
_SAMPLE_SUBMISSION_FILE = "sample_submission.csv"

_TOP_FEATURES = [
    "rectal_temp",
    "pulse",
    "respiratory_rate",
    "nasogastric_reflux_ph",
    "packed_cell_volume",
    "total_protein",
    "abdomo_protein",
    "lesion_1",
    "surgery_no",
    "surgery_yes",
    "temp_of_extremities_cool",
    "peripheral_pulse_reduced",
    "capillary_refill_time_more_3_sec",
    "pain_depressed",
    "pain_mild_pain",
    "pain_severe_pain",
    "peristalsis_absent",
    "abdominal_distention_moderate",
    "nasogastric_reflux_more_1_liter",
    "rectal_exam_feces_absent",
    "abdomen_distend_large",
    "abdomo_appearance_serosanguious",
    "surgical_lesion_no",
    "surgical_lesion_yes",
    "cp_data_no",
    "mucous_membrane_normal_pink",
    "abdomo_appearance_cloudy",
    "capillary_refill_time_less_3_sec",
    "peripheral_pulse_normal",
    "nasogastric_tube_slight",
    "mucous_membrane_pale_pink",
    "pain_extreme_pain",
    "mucous_membrane_pale_cyanotic",
    "abdomen_distend_small",
    "cp_data_yes",
    "abdominal_distention_slight",
    "temp_of_extremities_normal",
    "mucous_membrane_bright_red",
    "abdominal_distention_severe",
    "abdomo_appearance_clear",
    "rectal_exam_feces_decreased",
    "peristalsis_hypomotile",
    "age_young",
    "nasogastric_reflux_less_1_liter",
    "rectal_exam_feces_normal",
    "temp_of_extremities_cold",
    "abdomen_firm",
    "pain_alert",
    "nasogastric_tube_significant",
    "mucous_membrane_dark_cyanotic",
    "peristalsis_normal",
    "abdomen_normal",
    "mucous_membrane_bright_pink",
    "age_adult",
    "peripheral_pulse_absent",
    "rectal_exam_feces_increased",
]

_TARGET = "outcome"

_SEED = 42

## Data Preprocessing

In [95]:
import pandas as pd
from imblearn.over_sampling import SMOTE


def preprocess_data(df: pd.DataFrame, impute_function, columns):
    cols_to_drop = ["outcome", "id"]

    if not impute_function:
        df = df.dropna()

    X = df.drop(columns=cols_to_drop)
    y = df["outcome"]

    # One hot encoding
    X = pd.get_dummies(X, drop_first=False)

    X = X[_TOP_FEATURES]

    if impute_function:
        X = impute_function(X, columns=columns)

    smote = SMOTE(random_state=_SEED)
    X, y = smote.fit_resample(X, y)

    return X, y

## Experiment

In [92]:
import pandas as pd


train_df = pd.read_csv(_TRAIN_FILE)
test_df = pd.read_csv(_TEST_FILE)
original_df = pd.read_csv(_ORIGINAL_FILE)

train_df = pd.concat((train_df, original_df), axis=0)

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score


def run(df: pd.DataFrame, experiment_name: str, impute_function, columns=None):
    print(f"========== {experiment_name} ==========")

    X, y = preprocess_data(df, impute_function=impute_function, columns=columns)

    forest = RandomForestClassifier(random_state=_SEED)

    k_folds = KFold(n_splits=20)

    scores = cross_val_score(forest, X, y, cv=k_folds, scoring="f1_micro")

    print("F1 Score (Micro-Averaged):", scores.mean())

    return scores.mean()

In [32]:
from sklearn.impute import SimpleImputer, KNNImputer
import pandas as pd


def impute(
    df: pd.DataFrame,
    strategy: str = "mean",
    fill_value=None,
    n_neighbors=5,
    columns=None,
) -> pd.DataFrame:
    """
    Generalized function to handle various imputation strategies.

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        strategy (str): Imputation strategy - "mean", "median", "most_frequent", "constant", or "knn".
        fill_value: Value to use for constant imputation (if strategy="constant").
        n_neighbors (int): Number of neighbors to use for KNN Imputation (if strategy="knn").
        columns (list, optional): Specific columns to impute. If None, applies to all columns.

    Returns:
        pd.DataFrame: DataFrame with imputed values.
    """
    if strategy == "knn":
        imputer = KNNImputer(n_neighbors=n_neighbors)
    elif strategy == "constant":
        imputer = SimpleImputer(strategy="constant", fill_value=fill_value)
    else:
        imputer = SimpleImputer(strategy=strategy)

    # Select columns for imputation
    target_columns = columns or df.columns
    df[target_columns] = pd.DataFrame(
        imputer.fit_transform(df[target_columns]),
        columns=target_columns,
        index=df.index,
    )
    return df


# Specific functions for convenience
def mean_imputation(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    return impute(df, strategy="mean", columns=columns)


def median_imputation(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    return impute(df, strategy="median", columns=columns)


def most_frequent_imputation(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    return impute(df, strategy="most_frequent", columns=columns)


def constant_imputation(df: pd.DataFrame, fill_value=0, columns=None) -> pd.DataFrame:
    return impute(df, strategy="constant", fill_value=fill_value, columns=columns)


def knn_imputation(df: pd.DataFrame, n_neighbors=5, columns=None) -> pd.DataFrame:
    return impute(df, strategy="knn", n_neighbors=n_neighbors, columns=columns)

In [103]:
results = []

In [104]:
score = run(train_df, "No Imputation", impute_function=None)

results.append({"method": "No Imputation", "f1 score (micro-averaged)": score})

F1 Score (Micro-Averaged): 0.8090895341802783


In [105]:
score = run(train_df, "KNN Imputation", impute_function=knn_imputation)

results.append({"method": "KNN Imputation", "f1 score (micro-averaged)": score})

F1 Score (Micro-Averaged): 0.8127251896333755


In [106]:
score = run(
    train_df, "Most Frequent Inputation", impute_function=most_frequent_imputation
)

results.append(
    {"method": "Most Frequent Imputation", "f1 score (micro-averaged)": score}
)

F1 Score (Micro-Averaged): 0.8140605246523389


In [107]:
score = run(train_df, "Mean Imputation", impute_function=mean_imputation)

results.append({"method": "Mean Imputation", "f1 score (micro-averaged)": score})

F1 Score (Micro-Averaged): 0.8184932048040455


In [108]:
score = run(train_df, "Median Imputation", impute_function=median_imputation)

results.append({"method": "Median Imputation", "f1 score (micro-averaged)": score})

F1 Score (Micro-Averaged): 0.8136061946902654


In [109]:
score = run(train_df, "Constant Imputation", impute_function=constant_imputation)

results.append({"method": "Constant Imputation", "f1 score (micro-averaged)": score})

F1 Score (Micro-Averaged): 0.8145148546144123


In [110]:
pd.DataFrame(results).sort_values(by="f1 score (micro-averaged)", ascending=False)

Unnamed: 0,method,f1 score (micro-averaged)
3,Mean Imputation,0.818493
5,Constant Imputation,0.814515
2,Most Frequent Imputation,0.814061
4,Median Imputation,0.813606
1,KNN Imputation,0.812725
0,No Imputation,0.80909


## KNN Hyperparameter

In [None]:
knn_results = []


for n_neighbors in range(1, 30):
    def _knn_imputation(df: pd.DataFrame, columns=None):
        return knn_imputation(df, n_neighbors=n_neighbors, columns=columns)

    score = run(train_df, f"KNN Imputation ({n_neighbors} neighbors)", impute_function=_knn_imputation)

    knn_results.append(
        {
            "n_neighbors": n_neighbors,
            "f1 score (micro-averaged)": score,
        }
    )

In [112]:
results_df = pd.DataFrame(knn_results).sort_values(
    "f1 score (micro-averaged)", ascending=False
)
results_df

Unnamed: 0,n_neighbors,f1 score (micro-averaged)
10,11,0.816265
13,14,0.815372
12,13,0.814495
8,9,0.813176
16,17,0.813156
4,5,0.812725
5,6,0.812713
7,8,0.812283
11,12,0.812267
15,16,0.810959


## Submit

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE

df = pd.read_csv(_TRAIN_FILE)
test_df = pd.read_csv(_TEST_FILE)
submit_df = test_df

cols_to_drop = ["id"]

X = df.drop(columns=cols_to_drop)
y = df["outcome"]

# One hot encoding
X = pd.get_dummies(X, drop_first=False)
X = X[_TOP_FEATURES]

X = mean_imputation(X)

smote = SMOTE(random_state=_SEED)
X, y = smote.fit_resample(X, y)

forest = RandomForestClassifier(random_state=_SEED)
forest.fit(X, y)

test_df = test_df.drop(columns=cols_to_drop)
test_df = pd.get_dummies(test_df, drop_first=False)
test_df = test_df.reindex(columns=X.columns, fill_value=0)
test_df = test_df[_TOP_FEATURES]
y_pred_submit = forest.predict(test_df)

save_df = pd.DataFrame({"id": submit_df["id"], "outcome": y_pred_submit})
save_df.to_csv("submission.csv", index=False, header=True)

In [None]:
!kaggle competitions submit -c playground-series-s3e22 -f submission.csv -m "top features + mean_imputation + smote + random forest + original data"

100%|██████████████████████████████████████| 9.25k/9.25k [00:00<00:00, 11.7kB/s]
Successfully submitted to Predict Health Outcomes of Horses