# Testing whether augmenting the competition data with the original dataset improves model performance

Competition data: https://www.kaggle.com/competitions/playground-series-s3e22/data

Original data: https://www.kaggle.com/datasets/yasserh/horse-survival-dataset

## Auth

In [1]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the Kaggle credentials and therefore
# has to run before the kaggle import below — confirm before reordering.
load_dotenv()

from kaggle.api.kaggle_api_extended import KaggleApi

# Create the Kaggle API client and authenticate it.
api = KaggleApi()
api.authenticate()

## Download Data

In [None]:
# Download both archives, unpack them in place (-o overwrites existing files), then delete the zips.
!kaggle competitions download -c playground-series-s3e22
!unzip -o playground-series-s3e22.zip
!kaggle datasets download yasserh/horse-survival-dataset
!unzip -o horse-survival-dataset.zip
!rm -rf playground-series-s3e22.zip horse-survival-dataset.zip

## Globals

In [4]:
# CSV file names used by the notebook; they are relative, so they are resolved
# against the current working directory.
_TRAIN_FILE = "train.csv"
_TEST_FILE = "test.csv"
_ORIGINAL_FILE = "horse.csv"
_SAMPLE_SUBMISSION_FILE = "sample_submission.csv"

# Seed handed to the random forest so repeated runs give the same model.
_SEED = 42

## Data Preprocessing

In [19]:
import pandas as pd


def preprocess_data(df: pd.DataFrame, train: bool = True):
    """Split a raw frame into one-hot encoded features and the target.

    Args:
        df: Raw data. May or may not contain the ``id`` and ``outcome``
            columns.
        train: When True the frame must contain the ``outcome`` target.

    Returns:
        A tuple ``(X, y)``. ``X`` is the feature frame with ``id`` and
        ``outcome`` removed and the remaining categorical columns one-hot
        encoded (first level of each dropped). ``y`` is the ``outcome``
        column, or ``None`` when the frame has no such column.

    Raises:
        KeyError: If ``train`` is True and ``df`` has no ``outcome`` column.
    """
    has_target = "outcome" in df.columns
    if train and not has_target:
        raise KeyError("'outcome' column is required when train=True")

    y = df["outcome"] if has_target else None

    # Always strip the target and the row identifier when present:
    # - a frame without "id" (e.g. the original dataset on its own) must not
    #   raise, hence errors="ignore";
    # - the target must never stay in X while also being returned as y,
    #   regardless of the ``train`` flag.
    X = df.drop(columns=["outcome", "id"], errors="ignore")

    # One hot encoding.
    # NOTE(review): the dummy columns are derived from this frame only, so two
    # frames encoded separately can end up with different columns.
    X = pd.get_dummies(X, drop_first=True)

    return X, y

## Experiment

In [7]:
import pandas as pd


# Read the competition train/test CSVs and the original dataset, in that order.
train_df, test_df, original_df = (
    pd.read_csv(csv_name)
    for csv_name in (_TRAIN_FILE, _TEST_FILE, _ORIGINAL_FILE)
)

In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score


def run(df: pd.DataFrame, experiment_name: str):
    """Cross-validate a random forest on ``df`` and print its mean score.

    The frame is passed through ``preprocess_data`` and a
    ``RandomForestClassifier`` seeded with ``_SEED`` is scored with 20-fold
    cross-validation using the ``f1_micro`` metric. A header containing
    ``experiment_name`` and the mean of the fold scores are printed; nothing
    is returned.

    NOTE(review): ``KFold`` is used without shuffling, so folds follow the row
    order of ``df``. For a frame built by concatenating two sources, the later
    folds will consist mostly of rows from the second source — confirm this is
    the intended comparison.
    """
    print(f"========== {experiment_name} ==========")

    features, target = preprocess_data(df)

    fold_scores = cross_val_score(
        RandomForestClassifier(random_state=_SEED),
        features,
        target,
        cv=KFold(n_splits=20),
        scoring="f1_micro",
    )

    print("F1 Score (Micro-Averaged):", fold_scores.mean())

In [50]:
# Baseline: cross-validate on the competition training data alone.
run(train_df, "Only competition data")

F1 Score (Micro-Averaged): 0.7019962982548916


In [51]:
# Augmented: append the original dataset's rows below the competition rows (axis=0) and cross-validate on the combined frame.
run(pd.concat((train_df, original_df), axis=0), "Competition data + Original data")

F1 Score (Micro-Averaged): 0.726384142173616
