# Installing Python packages

In [None]:
!pip install catboost
!pip install ipywidgets
!jupyter nbextension enable  — py widgetsnbextension

# Importing the necessary libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Loading Dataset

In [None]:
# Read the raw CSV into a DataFrame. The path looks like a mounted Google Drive
# folder (TODO confirm the runtime); low_memory=False makes pandas parse the
# file in one pass rather than in chunks, which avoids mixed-dtype columns.
df = pd.read_csv(
    "/content/drive/MyDrive/CDSAML_P13/NY.csv",
    low_memory=False,
)

# Preview the first rows.
df.head()

# Feature Selection

In [4]:
# Remove every column that is not used as a model input or as the target.
df = df.drop(
    columns=[
        "Facility Id",
        "Total Charges",
        "Total Costs",
        "Health Service Area",
        "Hospital County",
        "Zip Code - 3 digits",
        "Race",
        "Ethnicity",
        "Patient Disposition",
        "Birth Weight",
        "Payment Typology 3",
        "Payment Typology 2",
        "Operating Certificate Number",
        "Facility Name",
        "Gender",
        "CCS Diagnosis Description",
        "CCS Procedure Description",
        "APR DRG Description",
        "APR MDC Description",
        "APR Severity of Illness Description",
        "APR Medical Surgical Description",
        "Abortion Edit Indicator",
        "Discharge Year",
    ]
)

In [5]:
# Discard rows that have no 'APR Risk of Mortality' value; this column is
# later passed to CatBoost as a categorical feature.
df = df.dropna(axis="index", how="any", subset=["APR Risk of Mortality"])

In [6]:
# Keep only the text before the first space of each 'Length of Stay' entry
# (presumably to strip a suffix such as "120 +" — verify against the raw data)
# and convert the result to a numeric column.
df['Length of Stay'] = pd.to_numeric(
    df['Length of Stay'].map(str).str.split(' ').str[0]
)

In [None]:
# Print the remaining columns with their dtypes and non-null counts to check
# the result of the cleaning steps above.
df.info()

# CatBoost Regression Model

In [8]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
# Imported in this cell so it runs on a fresh kernel (Restart & Run All)
# instead of depending on an import made by a later cell.
from sklearn.preprocessing import StandardScaler

# Regressor with 100 trees, optimising mean absolute error, training log silenced.
cb = CatBoostRegressor(n_estimators=100,
                       loss_function='MAE',
                       verbose=False)

# Features: every remaining column except the target.
# Target: 'Length of Stay', kept as a one-column DataFrame.
new_X = df.drop(['Length of Stay'], axis=1)
new_y = df[['Length of Stay']]
X_train, X_test, y_train, y_test = train_test_split(new_X, new_y, test_size=0.30, random_state=1)

# Take explicit copies so the column assignments below write to independent
# frames rather than to slices of `new_X` (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

numeric_columns = ['CCS Diagnosis Code', 'CCS Procedure Code', 'APR DRG Code', 'APR MDC Code', 'APR Severity of Illness Code']
categoric_columns = ['Age Group', 'Type of Admission', 'APR Risk of Mortality', 'Payment Typology 1', 'Emergency Department Indicator']

# Learn mean/std on the training split only and apply the same transform to
# the test split. Re-fitting on the test split would put the two splits on
# different scales and leak test statistics into preprocessing.
scaler = StandardScaler()
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

# The test pool carries no labels; it is only used for prediction.
pool_train = Pool(X_train, y_train, cat_features = categoric_columns)
pool_test = Pool(X_test, cat_features = categoric_columns)

cb.fit(pool_train)
y_pred = cb.predict(pool_test)

In [9]:
from sklearn.metrics import mean_absolute_error

# Score the regressor on the held-out split: predicted vs. true length of stay.
preds = cb.predict(pool_test)
mae = mean_absolute_error(y_true=y_test, y_pred=preds)
print(f"MAE of CB model: {mae:.3f}")

MAE of CB model: 2.973


# CatBoost Classification Model

## Two Bins

In [None]:
# Two classes: stays in (0, 6] and stays in (6, 120].
bins = [0, 6, 120]
# Each class is labelled with the upper edge of its interval.
labels = bins[1:]

# Human-readable interval text, e.g. "(0 - 6]" and "(6 - 120+]": the comma is
# replaced by a dash and the top edge is shown as open-ended.
df['stay_bin'] = pd.cut(df['Length of Stay'], bins).apply(
    lambda interval: str(interval).replace(',', ' -').replace('120', '120+')
)
# Class label used as the classification target.
df['stay_label'] = pd.cut(df['Length of Stay'], bins, labels=labels)
df

In [None]:
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Classifier with 100 trees using the multi-class objective, training log silenced.
cb = CatBoostClassifier(n_estimators=100,
                       loss_function='MultiClass',
                       verbose=False)

# Drop the raw target and both derived bin columns from the features so the
# label cannot leak into the inputs; the target is the bin label.
new_X = df.drop(['Length of Stay','stay_bin','stay_label'], axis=1)
new_y = df[['stay_label']]
X_train, X_test, y_train, y_test = train_test_split(new_X, new_y, test_size=0.30, random_state=1)

# Take explicit copies so the column assignments below write to independent
# frames rather than to slices of `new_X` (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

numeric_columns = ['CCS Diagnosis Code', 'CCS Procedure Code', 'APR DRG Code', 'APR MDC Code', 'APR Severity of Illness Code']
categoric_columns = ['Age Group', 'Type of Admission', 'APR Risk of Mortality', 'Payment Typology 1', 'Emergency Department Indicator']

# Learn mean/std on the training split only and apply the same transform to
# the test split. Re-fitting on the test split would put the two splits on
# different scales and leak test statistics into preprocessing.
scaler = StandardScaler()
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

# The test pool carries no labels; it is only used for prediction.
pool_train = Pool(X_train, y_train, cat_features = categoric_columns)
pool_test = Pool(X_test, cat_features = categoric_columns)

cb.fit(pool_train)
y_pred = cb.predict(pool_test)

cb_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", cb_accuracy)

In [12]:
from sklearn.metrics import mean_absolute_error

# MAE between predicted and true bin labels on the held-out split. The target
# is `stay_label`, so the error is measured between label values, not raw days.
preds = cb.predict(pool_test)
mae = mean_absolute_error(y_true=y_test, y_pred=preds)
print(f"MAE of CB model with 2 bins: {mae:.3f}")

MAE of CB model with 2 bins: 16.691


## Many Bins

In [None]:
# Four classes: (0, 6], (6, 12], (12, 30] and (30, 120].
bins = [0, 6, 12, 30, 120]
# Each class is labelled with the upper edge of its interval.
labels = bins[1:]

# Human-readable interval text, e.g. "(6 - 12]" and "(30 - 120+]": the comma
# is replaced by a dash and the top edge is shown as open-ended.
df['stay_bin'] = pd.cut(df['Length of Stay'], bins).apply(
    lambda interval: str(interval).replace(',', ' -').replace('120', '120+')
)
# Class label used as the classification target.
df['stay_label'] = pd.cut(df['Length of Stay'], bins, labels=labels)
df

In [None]:
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Classifier with 100 trees using the multi-class objective, training log silenced.
cb = CatBoostClassifier(n_estimators=100,
                       loss_function='MultiClass',
                       verbose=False)

# Drop the raw target and both derived bin columns from the features so the
# label cannot leak into the inputs; the target is the bin label.
new_X = df.drop(['Length of Stay','stay_bin','stay_label'], axis=1)
new_y = df[['stay_label']]
X_train, X_test, y_train, y_test = train_test_split(new_X, new_y, test_size=0.30, random_state=1)

# Take explicit copies so the column assignments below write to independent
# frames rather than to slices of `new_X` (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

numeric_columns = ['CCS Diagnosis Code', 'CCS Procedure Code', 'APR DRG Code', 'APR MDC Code', 'APR Severity of Illness Code']
categoric_columns = ['Age Group', 'Type of Admission', 'APR Risk of Mortality', 'Payment Typology 1', 'Emergency Department Indicator']

# Learn mean/std on the training split only and apply the same transform to
# the test split. Re-fitting on the test split would put the two splits on
# different scales and leak test statistics into preprocessing.
scaler = StandardScaler()
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

# The test pool carries no labels; it is only used for prediction.
pool_train = Pool(X_train, y_train, cat_features = categoric_columns)
pool_test = Pool(X_test, cat_features = categoric_columns)

cb.fit(pool_train)
y_pred = cb.predict(pool_test)

cb_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", cb_accuracy)

In [15]:
from sklearn.metrics import mean_absolute_error

# MAE between predicted and true bin labels on the held-out split. The target
# is `stay_label`, so the error is measured between label values, not raw days.
preds = cb.predict(pool_test)
mae = mean_absolute_error(y_true=y_test, y_pred=preds)
print(f"MAE of CB model with many bins: {mae:.3f}")

MAE of CB model with many bins: 3.563
