# Code for testing whether over sampling minority class with SMOTE helps

## Auth

In [1]:
from dotenv import load_dotenv

load_dotenv()

from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()

## Download Data

In [None]:
!kaggle competitions download -c playground-series-s3e22
!unzip -o playground-series-s3e22.zip
!kaggle datasets download yasserh/horse-survival-dataset
!unzip -o horse-survival-dataset.zip
!rm -rf playground-series-s3e22.zip horse-survival-dataset.zip

## Globals

In [2]:
_TRAIN_FILE = "train.csv"
_TEST_FILE = "test.csv"
_ORIGINAL_FILE = "horse.csv"
_SAMPLE_SUBMISSION_FILE = "sample_submission.csv"

_SEED = 42

## Data Preprocessing

In [26]:
import pandas as pd
from imblearn.over_sampling import SMOTE


def preprocess_data(df: pd.DataFrame, oversample_minority: bool):
    cols_to_drop = ["outcome", "id"]

    # Simple handling of NA values: drop rows with missing values
    df = df.dropna()

    X = df.drop(columns=cols_to_drop)
    y = df["outcome"]

    # One hot encoding
    X = pd.get_dummies(X, drop_first=True)

    if oversample_minority:
        smote = SMOTE(random_state=_SEED)
        X, y = smote.fit_resample(X, y)

    return X, y

## Experiment

In [13]:
import pandas as pd


train_df = pd.read_csv(_TRAIN_FILE)
test_df = pd.read_csv(_TEST_FILE)
original_df = pd.read_csv(_ORIGINAL_FILE)

train_df = pd.concat((train_df, original_df), axis=0)

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score


def run(df: pd.DataFrame, experiment_name: str, oversample_minority: bool):
    print(f"========== {experiment_name} ==========")

    X, y = preprocess_data(df, oversample_minority=oversample_minority)

    forest = RandomForestClassifier(random_state=_SEED)

    k_folds = KFold(n_splits=20)

    scores = cross_val_score(forest, X, y, cv=k_folds, scoring="f1_micro")

    print("F1 Score (Micro-Averaged):", scores.mean())

In [27]:
run(train_df, "Oversample Minority", oversample_minority=True)

F1 Score (Micro-Averaged): 0.8065638233514821


In [28]:
run(train_df, "No Oversample Minority", oversample_minority=False)

F1 Score (Micro-Averaged): 0.7148448043184885
