# Code for testing using hospital quality (`hospital_number`) as a new feature (Feature generation)


## Auth

In [1]:
from dotenv import load_dotenv

load_dotenv()

from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()

## Download Data

In [2]:
# !kaggle competitions download -c playground-series-s3e22
# !unzip -o playground-series-s3e22.zip
# !kaggle datasets download yasserh/horse-survival-dataset
# !unzip -o horse-survival-dataset.zip
# !rm -rf playground-series-s3e22.zip horse-survival-dataset.zip

## Globals

In [3]:
_TRAIN_FILE = "train.csv"
_TEST_FILE = "test.csv"
_ORIGINAL_FILE = "horse.csv"
_SAMPLE_SUBMISSION_FILE = "sample_submission.csv"

_SEED = 42

## Hospital Quality for each `hospital_number`

In [4]:
import pandas as pd


train_df = pd.read_csv(_TRAIN_FILE)
test_df = pd.read_csv(_TEST_FILE)
original_df = pd.read_csv(_ORIGINAL_FILE)

In [5]:
# Number of unique hospitals
n_unique_hospitals = len(set(train_df["hospital_number"]))
print("Number of unique hospitals:", n_unique_hospitals)

Number of unique hospitals: 255


In [6]:
"""
  Quality Score
    - Range: [-1, 1]
    - `-1` representing poorer hospital quality (horse higher chance of dying in their care)
    - `+1` representing higher hospital quality (horse higher chance of living in their care)
"""

QUALITY_SCORE = {
  "died": -1,
  "euthanized": 0,
  "lived": 1
}

In [7]:
from collections import defaultdict
hospitals_quality_scores_list = defaultdict(list)
hospitals_quality_scores = defaultdict(float)

n_data = len(train_df)
for i in range(n_data):
  row = train_df.iloc[i]
  hospital_number, outcome = row["hospital_number"], row["outcome"]

  hospitals_quality_scores_list[int(hospital_number)].append(QUALITY_SCORE[outcome])

for x in hospitals_quality_scores_list.keys():
  scores = hospitals_quality_scores_list[x]
  avg_score = sum(scores)/len(scores)
  hospitals_quality_scores[x] = avg_score

In [8]:
hospitals_quality_scores

defaultdict(float,
            {530001: -0.5333333333333333,
             533836: 0.6666666666666666,
             529812: 1.0,
             5262541: 1.0,
             5299629: 0.6666666666666666,
             529642: 0.10526315789473684,
             534787: 0.2777777777777778,
             529461: -0.6086956521739131,
             528742: 0.3333333333333333,
             529640: 0.0,
             528682: 0.8,
             530028: -0.25,
             528548: 0.5625,
             528134: 0.0,
             528305: 0.3333333333333333,
             534885: 0.5,
             5290482: 0.14285714285714285,
             5279822: -0.5,
             533692: 0.5,
             535208: 0.3333333333333333,
             528523: 0.42857142857142855,
             529893: 0.5,
             534145: 0.2857142857142857,
             530233: -0.8333333333333334,
             529399: 0.14285714285714285,
             530354: 0.0,
             528503: 0.0,
             529796: 0.3125,
             527916: 0.

In [9]:
copy_df = train_df.copy()

for i in range(n_data):
  hospital_number = int(copy_df.iloc[i]["hospital_number"])
  copy_df.loc[i, ["hospital_quality_score"]] = hospitals_quality_scores[hospital_number]

copy_df.loc[:, ["id", "hospital_number", "outcome", "hospital_quality_score"]]

Unnamed: 0,id,hospital_number,outcome,hospital_quality_score
0,0,530001,died,-0.533333
1,1,533836,euthanized,0.666667
2,2,529812,lived,1.000000
3,3,5262541,lived,1.000000
4,4,5299629,lived,0.666667
...,...,...,...,...
1230,1230,535246,lived,1.000000
1231,1231,528570,died,-0.187500
1232,1232,529685,lived,0.333333
1233,1233,534784,lived,0.636364


## Data Preprocessing

In [10]:
import pandas as pd
from imblearn.over_sampling import SMOTE


def preprocess_data(df, hospitals_quality_scores: dict = None, train=True):
    # Separate features and target
    if train:
        cols_to_drop = ["outcome", "id"]
    else:
        cols_to_drop = ["id"]

    # Simple handling of NA values: drop rows with missing values
    if train:
        df = df.dropna()

    X = df.drop(columns=cols_to_drop)

    if "outcome" in df.columns:
        y = df["outcome"]
    else:
        y = None
    
    # One hot encoding
    X = pd.get_dummies(X)

    # Only Use Important Features
    selected_features = [
        'rectal_temp', 'pulse', 'respiratory_rate', 'nasogastric_reflux_ph',
        'packed_cell_volume', 'total_protein', 'abdomo_protein', 'lesion_1',
        'surgery_no', 'surgery_yes', 'temp_of_extremities_cool',
        'peripheral_pulse_reduced', 'capillary_refill_time_more_3_sec',
        'pain_depressed', 'pain_mild_pain', 'pain_severe_pain', 'peristalsis_absent',
        'abdominal_distention_moderate', 'nasogastric_reflux_more_1_liter',
        'rectal_exam_feces_absent', 'abdomen_distend_large',
        'abdomo_appearance_serosanguious', 'surgical_lesion_no',
        'surgical_lesion_yes', 'cp_data_no', 'mucous_membrane_normal_pink',
        'abdomo_appearance_cloudy', 'capillary_refill_time_less_3_sec',
        'peripheral_pulse_normal', 'nasogastric_tube_slight',
        'mucous_membrane_pale_pink', 'pain_extreme_pain',
        'mucous_membrane_pale_cyanotic', 'abdomen_distend_small', 'cp_data_yes',
        'abdominal_distention_slight', 'temp_of_extremities_normal',
        'mucous_membrane_bright_red', 'abdominal_distention_severe',
        'abdomo_appearance_clear', 'rectal_exam_feces_decreased',
        'peristalsis_hypomotile', 'age_young', 'nasogastric_reflux_less_1_liter',
        'rectal_exam_feces_normal', 'temp_of_extremities_cold', 'abdomen_firm',
        'pain_alert', 'nasogastric_tube_significant',
        'mucous_membrane_dark_cyanotic', 'peristalsis_normal', 'abdomen_normal',
        'mucous_membrane_bright_pink', 'age_adult', 'peripheral_pulse_absent',
        'rectal_exam_feces_increased',
    ]
    if hospitals_quality_scores:
        selected_features.append("hospital_number") # To be dropped later after creating new feature `hospital_quality`
    X = X[selected_features]

    # SMOTE Oversampling of minority classes (During training stage)
    if train:
        smote = SMOTE(random_state=_SEED)
        X, y = smote.fit_resample(X, y)

    # Generate `hospital_quality` feature
    if hospitals_quality_scores:
        for i in range(len(X)):
            hospital_number = int(X.iloc[i]["hospital_number"])
            X.loc[i, ["hospital_quality_score"]] = hospitals_quality_scores[hospital_number]
        X.drop(columns=["hospital_number"], inplace=True)

    return X, y


## Experiment

In [11]:
import pandas as pd


train_df = pd.read_csv(_TRAIN_FILE)
test_df = pd.read_csv(_TEST_FILE)
original_df = pd.read_csv(_ORIGINAL_FILE)

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score


def run(df: pd.DataFrame, experiment_name: str, hospitals_quality_scores: dict = None):
    print(f"========== {experiment_name} ==========")

    X, y = preprocess_data(df, hospitals_quality_scores)

    forest = RandomForestClassifier(random_state=_SEED)

    k_folds = KFold(n_splits=20)

    scores = cross_val_score(forest, X, y, cv=k_folds, scoring="f1_micro")
    f1_score_micro_avg = scores.mean()

    print("F1 Score (Micro-Averaged):", f1_score_micro_avg)

In [13]:
base_f1_score_micro_avg = run(pd.concat((train_df, original_df), axis=0), "Without Hospital Quality Feature")

F1 Score (Micro-Averaged): 0.8090895341802783


In [14]:
f1_score_micro_avg = run(pd.concat((train_df, original_df), axis=0), "With Hospital Quality Feature", hospitals_quality_scores)

F1 Score (Micro-Averaged): 0.8212341197822143


# Submit

In [15]:
X, y = preprocess_data(pd.concat((train_df, original_df), axis=0), hospitals_quality_scores=hospitals_quality_scores)
forest = RandomForestClassifier(random_state=_SEED)
forest.fit(X, y)

X_submit, _ = preprocess_data(test_df, hospitals_quality_scores=hospitals_quality_scores, train=False)
X_submit = X_submit.reindex(columns=X.columns, fill_value=0)

y_pred_submit = forest.predict(X_submit)

In [16]:
save_df = pd.DataFrame({"id": test_df["id"], "outcome": y_pred_submit})
save_df.to_csv("submission.csv", index=False, header=True)

In [17]:
# !kaggle competitions submit -c playground-series-s3e22 -f submission.csv -m "SMOTE + Top Features + hospital_quality Feature"