# Installing the necessary Python libraries

In [None]:
!pip install xgboost

# Importing libraries

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Loading the dataset from Google Drive

In [None]:
# Location of the raw CSV on Google Drive. The drive is expected to be
# available under /content/drive already; no mount call is visible in this
# notebook, so mount it beforehand if running on a fresh runtime.
DATA_PATH = "/content/drive/MyDrive/CDSAML_P13/NY.csv"

# low_memory=False is passed so pandas does not parse the file in internal
# chunks (which can produce mixed-type columns).
df = pd.read_csv(DATA_PATH, low_memory=False)

# Preview the first rows of the loaded frame.
df.head()

# EDA

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

In [None]:
# Convert 'Length of Stay' to a numeric column.
# Each value is rendered as text and only the part before the first space is
# kept, so an entry carrying a trailing suffix after a space is reduced to its
# leading token before the numeric conversion.
stay_leading_token = df['Length of Stay'].map(
    lambda raw_value: str(raw_value).partition(' ')[0]
)
df['Length of Stay'] = pd.to_numeric(stay_leading_token)

# Plotting Graphs

In [None]:
# Bar chart of 'Length of Stay' grouped by 'Payment Typology 1', drawn on an
# explicitly created Axes so every setting below targets the same plot.
f, ax = plt.subplots(figsize=(25, 25))
sns.barplot(data=df, x="Payment Typology 1", y="Length of Stay", ax=ax)

# Restrict the visible y-range to 0-10.
ax.set_ylim(0, 10)
ax.set_title("Length of Stay vs. Payment Typology 1")
plt.show()

# Feature Selection

In [None]:
# Columns removed from the frame before modelling.
columns_to_drop = [
    "Facility Id",
    "Total Charges",
    "Total Costs",
    "Health Service Area",
    "Hospital County",
    "Zip Code - 3 digits",
    "Race",
    "Ethnicity",
    "Patient Disposition",
    "Birth Weight",
    "Payment Typology 3",
    "Payment Typology 2",
    "Operating Certificate Number",
    "Facility Name",
    "Gender",
    "CCS Diagnosis Description",
    "CCS Procedure Description",
    "APR DRG Description",
    "APR MDC Description",
    "APR Severity of Illness Description",
    "APR Medical Surgical Description",
    "Abortion Edit Indicator",
    "Discharge Year",
]

# Note: re-running this cell on the already-reduced frame raises KeyError,
# because the listed columns are no longer present.
df = df.drop(columns=columns_to_drop)

In [None]:
# Discard rows that have no value for 'APR Risk of Mortality'.
required_columns = ['APR Risk of Mortality']
df = df.dropna(subset=required_columns)

In [None]:
# Show the dtype of each remaining column after the column/row filtering above.
df.dtypes

# Separating Features and Target

In [None]:
# 'Length of Stay' is the regression target; every other column is a feature.
target_column = 'Length of Stay'

# Target kept as a single-column DataFrame (double brackets), not a Series.
new_y = df[[target_column]]
new_X = df.drop(columns=[target_column])

In [None]:
# Names of all feature columns whose dtype is not numeric.
non_numeric_features = new_X.select_dtypes(exclude=np.number)
cats = list(non_numeric_features.columns)

# Converting Columns to dtype "category"

In [None]:
# Cast every non-numeric feature column to pandas' 'category' dtype in a
# single astype call; columns not listed in `cats` keep their dtype.
new_X = new_X.astype({col: 'category' for col in cats})

In [None]:
# Confirm the feature dtypes after the 'category' conversion.
new_X.dtypes

# Splitting the dataset into training and testing data

In [None]:
# Hold out 30% of the rows for testing.
# random_state is fixed so the split, and therefore the reported metric, is
# the same on every Restart & Run All instead of changing with each run.
X_train, X_test, y_train, y_test = train_test_split(
    new_X,
    new_y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Wrap the train/test splits in XGBoost DMatrix objects.
# enable_categorical=True is passed so the 'category'-typed feature columns
# are accepted as categorical inputs.
dtrain_reg = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

# Model Training

In [None]:
# Squared-error regression trained on the GPU.
# XGBoost 2.0 deprecated tree_method="gpu_hist" in favour of
# tree_method="hist" combined with device="cuda". The install above is
# unpinned, so choose the spelling that matches the installed major version.
xgb_major_version = int(xgb.__version__.split(".")[0])
if xgb_major_version >= 2:
    params = {
        "objective": "reg:squarederror",
        "tree_method": "hist",
        "device": "cuda",
    }
else:
    params = {"objective": "reg:squarederror", "tree_method": "gpu_hist"}

# Number of boosting rounds.
n = 100
model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
)

# Performance Metrics - MAE

In [None]:
from sklearn.metrics import mean_absolute_error

# Predict on the held-out DMatrix and compare against the true test targets.
preds = model.predict(dtest_reg)

# Mean absolute error, in the same units as 'Length of Stay'.
mae = mean_absolute_error(y_true=y_test, y_pred=preds)
print(f"MAE of XGBoost model: {mae:.3f}")

MAE of XGBoost model: 2.779
