In [1]:
# DISABLED: everything between the triple quotes below is a plain string
# literal, so none of it executes. When enabled, it would print the current
# OMP/MKL/OpenBLAS thread-count environment variables and then set five
# thread-count environment variables to "8".
# NOTE(review): the saved output under this cell looks like it came from an
# earlier run, when the print statements were still live — confirm and clear.
# NOTE(review): if re-enabled, presumably this must stay ahead of the imports
# cell so the numeric libraries see the values at load time — TODO confirm.
'''
import os
print("OMP_NUM_THREADS =", os.environ.get("OMP_NUM_THREADS"))
print("MKL_NUM_THREADS =", os.environ.get("MKL_NUM_THREADS"))
print("OPENBLAS_NUM_THREADS =", os.environ.get("OPENBLAS_NUM_THREADS"))

os.environ["OMP_NUM_THREADS"] = "8"
os.environ["MKL_NUM_THREADS"] = "8"
os.environ["OPENBLAS_NUM_THREADS"] = "8"
os.environ["NUMEXPR_NUM_THREADS"] = "8"
os.environ["VECLIB_MAXIMUM_THREADS"] = "8"
'''

OMP_NUM_THREADS = None
MKL_NUM_THREADS = None
OPENBLAS_NUM_THREADS = None


In [2]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import math
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from catboost import CatBoostClassifier, Pool

In [3]:
# --- Load the raw table and discard the two identifier columns ---
df = pd.read_csv("train_data.csv").drop(columns=['case_id', 'patientid'])

# Integer-encode the "Stay" target; the fitted encoder is kept in LE so the
# original labels can be recovered from the codes later.
LE = LabelEncoder()
df["Stay"] = LE.fit_transform(df["Stay"])

# Coerce the deposit column to numeric; values that cannot be parsed become NaN.
df['Admission_Deposit'] = pd.to_numeric(df['Admission_Deposit'], errors='coerce')

# --- Separate target from features ---
y = df["Stay"]
X = df.drop(columns="Stay")

print("COLUMNS PASSED TO CATBOOST:")
print(X.columns)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Columns with object/category dtype are the ones handed to CatBoost as
# categorical features; cat_idx holds their positions within X_train.
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns

print(X_train.shape)
print(cat_cols)
print(X_train.nunique().sort_values(ascending=False).head(10))

cat_idx = [pos for pos, name in enumerate(X_train.columns) if name in cat_cols]

# Quick look at the full (pre-split) frame.
print("TRAINING DF SHAPE:", df.shape)
print(df.head())
print(df.dtypes)

COLUMNS PASSED TO CATBOOST:
Index(['Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'City_Code_Patient', 'Type of Admission', 'Severity of Illness',
       'Visitors with Patient', 'Age', 'Admission_Deposit'],
      dtype='object')
(254750, 15)
Index(['Hospital_type_code', 'Hospital_region_code', 'Department', 'Ward_Type',
       'Ward_Facility_Code', 'Type of Admission', 'Severity of Illness',
       'Age'],
      dtype='object')
Admission_Deposit                    7151
City_Code_Patient                      37
Hospital_code                          32
Visitors with Patient                  28
Available Extra Rooms in Hospital      18
City_Code_Hospital                     11
Age                                    10
Hospital_type_code                      7
Ward_Facility_Code                      6
Ward_Type                      

In [None]:
# Train a multi-class CatBoost model on the prepared split and report test metrics.
#
# Early stopping (od_type/od_wait) and use_best_model pick the final iteration
# from whatever is passed as eval_set. Passing the test set there would let the
# model be selected on the very data it is scored on, so a separate validation
# split is carved out of the training data and the test set is only used once,
# for the final metrics.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

train_pool = Pool(X_fit, y_fit, cat_features=cat_idx)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_idx)
test_pool = Pool(X_test, y_test, cat_features=cat_idx)

# Optional sanity checks (uncomment when debugging the pools):
# print("cat_idx =", cat_idx)
# print("Number of categorical columns:", len(cat_idx))
# print("TRAIN POOL SHAPE:", train_pool.num_row(), "rows,", train_pool.num_col(), "columns")
# print("X_train SHAPE:", X_train.shape)
# print(X_train.columns)
# print(X_train.dtypes)

model = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='Accuracy',
    iterations=2000,      # upper bound; early stopping usually ends sooner
    depth=6,
    learning_rate=0.1,
    l2_leaf_reg=3,
    random_seed=42,
    thread_count=4,
    od_type='Iter',       # stop after od_wait iterations without improvement
    od_wait=50,
    use_best_model=True,  # keep the iteration that scored best on eval_set
    verbose=50
)

# Model selection happens on the validation pool, never on the test pool.
model.fit(train_pool, eval_set=valid_pool)

# MultiClass predict returns one label per row as a column; flatten to 1-D.
y_pred = model.predict(test_pool).reshape(-1)

# Report per-class metrics under the original "Stay" labels instead of the
# integer codes; `labels` pins the row/column order to the encoder's order.
class_names = [str(c) for c in LE.classes_]
class_ids = list(range(len(class_names)))

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))
print(confusion_matrix(y_test, y_pred, labels=class_ids))

In [16]:
# Distinct-value count for every column of df (which still includes the
# encoded "Stay" target), listed from most to fewest unique values.
df.nunique().sort_values(ascending=False)

Admission_Deposit                    7300
City_Code_Patient                      37
Hospital_code                          32
Visitors with Patient                  28
Available Extra Rooms in Hospital      18
Stay                                   11
City_Code_Hospital                     11
Age                                    10
Hospital_type_code                      7
Ward_Facility_Code                      6
Ward_Type                               6
Department                              5
Bed Grade                               4
Hospital_region_code                    3
Severity of Illness                     3
Type of Admission                       3
dtype: int64

## Final Verdict:

To train this model we would have to wait 8+ hours, especially without access to GPUs of any kind. The number of computations is far too high, which we attribute to the Admission_Deposit feature (over 7,000 distinct values), and the approach is simply not working for us.

After doing some research, we don't think CatBoost would reach a desirable score in any case. We have a more fundamental problem with our dataset: the data is too noisy and far too volatile. This issue, coupled with the fact that our target variable has 11 different classes AND is imbalanced, would make it very tough for us to actually make use of this data.