# Installing python packages

In [2]:
!pip install catboost
!pip install ipywidgets
!jupyter nbextension enable  — py widgetsnbextension

Please specify one nbextension/package at a time


# Importing the necessary libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Loading Dataset

In [3]:
df = pd.read_csv("/content/drive/MyDrive/CDSAML_P13/NY.csv", low_memory = False)
df.head()

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
0,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,70 or Older,147,F,White,Not Span/Hispanic,...,Minor,Medical,Medicare,Private Health Insurance,,0,N,Y,5333.9,4818.42
1,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,70 or Older,147,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,,,0,N,Y,4865.99,4588.78
2,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,70 or Older,147,F,White,Not Span/Hispanic,...,Minor,Medical,Medicare,Private Health Insurance,,0,N,Y,5901.54,5559.56
3,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,50 to 69,147,F,White,Not Span/Hispanic,...,Minor,Medical,Medicare,,,0,N,Y,3619.08,3567.25
4,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,70 or Older,147,M,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Blue Cross/Blue Shield,Private Health Insurance,0,N,Y,3185.87,3167.89


# Feature Selection

In [4]:
df = df.drop(["Facility Id", "Total Charges", "Total Costs", "Health Service Area", "Hospital County","Zip Code - 3 digits", "Race", "Ethnicity", "Patient Disposition", "Birth Weight", "Payment Typology 3", "Payment Typology 2", "Operating Certificate Number","Facility Name", "Gender", "CCS Diagnosis Description", "CCS Procedure Description", "APR DRG Description", "APR MDC Description", "APR Severity of Illness Description", "APR Medical Surgical Description", "Abortion Edit Indicator", "Discharge Year"], axis = 1)

In [5]:
df = df.dropna(subset=['APR Risk of Mortality'])

In [6]:
df['Length of Stay'] = df['Length of Stay'].apply(lambda x: str(x).split(' ')[0])
df['Length of Stay'] = pd.to_numeric(df['Length of Stay'])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2346820 entries, 0 to 2346930
Data columns (total 11 columns):
 #   Column                          Dtype 
---  ------                          ----- 
 0   Age Group                       object
 1   Length of Stay                  int64 
 2   Type of Admission               object
 3   CCS Diagnosis Code              int64 
 4   CCS Procedure Code              int64 
 5   APR DRG Code                    int64 
 6   APR MDC Code                    int64 
 7   APR Severity of Illness Code    int64 
 8   APR Risk of Mortality           object
 9   Payment Typology 1              object
 10  Emergency Department Indicator  object
dtypes: int64(6), object(5)
memory usage: 214.9+ MB


# CatBoost Regression Model

In [8]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split

cb = CatBoostRegressor(n_estimators=100,
                       loss_function='MAE',
                       verbose=False)

new_X = df.drop(['Length of Stay'], axis=1)
new_y = df[['Length of Stay']]
X_train, X_test, y_train, y_test = train_test_split(new_X, new_y, test_size=0.30, random_state=1)

numeric_columns = ['CCS Diagnosis Code', 'CCS Procedure Code', 'APR DRG Code', 'APR MDC Code', 'APR Severity of Illness Code']
categoric_columns = ['Age Group', 'Type of Admission', 'APR Risk of Mortality', 'Payment Typology 1', 'Emergency Department Indicator']

X_train[numeric_columns] = StandardScaler().fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = StandardScaler().fit_transform(X_test[numeric_columns])

pool_train = Pool(X_train, y_train, cat_features = categoric_columns)
pool_test = Pool(X_test, cat_features = categoric_columns)

cb.fit(pool_train)
y_pred = cb.predict(pool_test)

In [9]:
from sklearn.metrics import mean_absolute_error
preds = cb.predict(pool_test)
mae = mean_absolute_error(y_test, preds)
print(f"MAE of CB model: {mae:.3f}")

MAE of CB model: 2.973


# CatBoost Classification Model

## Two Bins

In [10]:
bins = [0,6,120]
labels = [6,120]
df['stay_bin'] = pd.cut(x = df['Length of Stay'], bins = bins)
df['stay_label'] = pd.cut(x = df['Length of Stay'], bins = bins, labels = labels)
df['stay_bin'] = df['stay_bin'].apply(lambda x: str(x).replace(',',' -'))
df['stay_bin'] = df['stay_bin'].apply(lambda x: str(x).replace('120','120+'))
df

Unnamed: 0,Age Group,Length of Stay,Type of Admission,CCS Diagnosis Code,CCS Procedure Code,APR DRG Code,APR MDC Code,APR Severity of Illness Code,APR Risk of Mortality,Payment Typology 1,Emergency Department Indicator,stay_bin,stay_label
0,70 or Older,4,Urgent,122,0,139,4,2,Minor,Medicare,Y,(0 - 6],6
1,70 or Older,4,Elective,55,0,422,10,2,Moderate,Medicare,Y,(0 - 6],6
2,70 or Older,4,Urgent,122,202,139,4,1,Minor,Medicare,Y,(0 - 6],6
3,50 to 69,2,Elective,55,0,249,6,2,Minor,Medicare,Y,(0 - 6],6
4,70 or Older,2,Elective,122,0,139,4,1,Moderate,Medicare,Y,(0 - 6],6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2346926,70 or Older,2,Emergency,112,0,47,1,2,Moderate,Medicare,Y,(0 - 6],6
2346927,50 to 69,1,Emergency,54,159,351,8,1,Minor,Private Health Insurance,Y,(0 - 6],6
2346928,50 to 69,2,Urgent,657,0,751,19,1,Minor,Medicaid,N,(0 - 6],6
2346929,70 or Older,1,Emergency,106,0,201,5,1,Moderate,Medicare,Y,(0 - 6],6


In [11]:
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

cb = CatBoostClassifier(n_estimators=100,
                       loss_function='MultiClass',
                       verbose=False)

new_X = df.drop(['Length of Stay','stay_bin','stay_label'], axis=1)
new_y = df[['stay_label']]
X_train, X_test, y_train, y_test = train_test_split(new_X, new_y, test_size=0.30, random_state=1)

numeric_columns = ['CCS Diagnosis Code', 'CCS Procedure Code', 'APR DRG Code', 'APR MDC Code', 'APR Severity of Illness Code']
categoric_columns = ['Age Group', 'Type of Admission', 'APR Risk of Mortality', 'Payment Typology 1', 'Emergency Department Indicator']

X_train[numeric_columns] = StandardScaler().fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = StandardScaler().fit_transform(X_test[numeric_columns])

pool_train = Pool(X_train, y_train, cat_features = categoric_columns)
pool_test = Pool(X_test, cat_features = categoric_columns)

cb.fit(pool_train)
y_pred = cb.predict(pool_test)

cb_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", cb_accuracy)

          Age Group Type of Admission  CCS Diagnosis Code  CCS Procedure Code  \
633925     50 to 69         Emergency           -0.429326           -1.268264   
911834      0 to 17         Emergency           -0.849001           -1.268264   
896463     30 to 49         Emergency           -0.510755           -0.604873   
600131     30 to 49          Elective           -0.905376            0.137226   
624405  70 or Older         Emergency           -0.429326            1.171666   
...             ...               ...                 ...                 ...   
73349      30 to 49         Emergency            0.128154            1.329081   
836556      0 to 17          Elective            0.009142            0.227177   
491293     50 to 69            Urgent            0.078044           -1.268264   
491785     18 to 29            Urgent            2.946872            1.194154   
128038     18 to 29          Elective            0.027933            0.238421   

        APR DRG Code  APR M

In [12]:
from sklearn.metrics import mean_absolute_error
preds = cb.predict(pool_test)
mae = mean_absolute_error(y_test, preds)
print(f"MAE of CB model with 2 bins: {mae:.3f}")

MAE of CB model with many bins: 16.691


## Many Bins

In [13]:
bins = [0,6,12,30,120]
labels = [6,12,30,120]
df['stay_bin'] = pd.cut(x = df['Length of Stay'], bins = bins)
df['stay_label'] = pd.cut(x = df['Length of Stay'], bins = bins, labels = labels)
df['stay_bin'] = df['stay_bin'].apply(lambda x: str(x).replace(',',' -'))
df['stay_bin'] = df['stay_bin'].apply(lambda x: str(x).replace('120','120+'))
df

Unnamed: 0,Age Group,Length of Stay,Type of Admission,CCS Diagnosis Code,CCS Procedure Code,APR DRG Code,APR MDC Code,APR Severity of Illness Code,APR Risk of Mortality,Payment Typology 1,Emergency Department Indicator,stay_bin,stay_label
0,70 or Older,4,Urgent,122,0,139,4,2,Minor,Medicare,Y,(0 - 6],6
1,70 or Older,4,Elective,55,0,422,10,2,Moderate,Medicare,Y,(0 - 6],6
2,70 or Older,4,Urgent,122,202,139,4,1,Minor,Medicare,Y,(0 - 6],6
3,50 to 69,2,Elective,55,0,249,6,2,Minor,Medicare,Y,(0 - 6],6
4,70 or Older,2,Elective,122,0,139,4,1,Moderate,Medicare,Y,(0 - 6],6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2346926,70 or Older,2,Emergency,112,0,47,1,2,Moderate,Medicare,Y,(0 - 6],6
2346927,50 to 69,1,Emergency,54,159,351,8,1,Minor,Private Health Insurance,Y,(0 - 6],6
2346928,50 to 69,2,Urgent,657,0,751,19,1,Minor,Medicaid,N,(0 - 6],6
2346929,70 or Older,1,Emergency,106,0,201,5,1,Moderate,Medicare,Y,(0 - 6],6


In [None]:
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

cb = CatBoostClassifier(n_estimators=100,
                       loss_function='MultiClass',
                       verbose=False)

new_X = df.drop(['Length of Stay','stay_bin','stay_label'], axis=1)
new_y = df[['stay_label']]
X_train, X_test, y_train, y_test = train_test_split(new_X, new_y, test_size=0.30, random_state=1)

numeric_columns = ['CCS Diagnosis Code', 'CCS Procedure Code', 'APR DRG Code', 'APR MDC Code', 'APR Severity of Illness Code']
categoric_columns = ['Age Group', 'Type of Admission', 'APR Risk of Mortality', 'Payment Typology 1', 'Emergency Department Indicator']

X_train[numeric_columns] = StandardScaler().fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = StandardScaler().fit_transform(X_test[numeric_columns])

pool_train = Pool(X_train, y_train, cat_features = categoric_columns)
pool_test = Pool(X_test, cat_features = categoric_columns)

cb.fit(pool_train)
y_pred = cb.predict(pool_test)

cb_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", cb_accuracy)

          Age Group Type of Admission  CCS Diagnosis Code  CCS Procedure Code  \
633925     50 to 69         Emergency           -0.429326           -1.268264   
911834      0 to 17         Emergency           -0.849001           -1.268264   
896463     30 to 49         Emergency           -0.510755           -0.604873   
600131     30 to 49          Elective           -0.905376            0.137226   
624405  70 or Older         Emergency           -0.429326            1.171666   
...             ...               ...                 ...                 ...   
73349      30 to 49         Emergency            0.128154            1.329081   
836556      0 to 17          Elective            0.009142            0.227177   
491293     50 to 69            Urgent            0.078044           -1.268264   
491785     18 to 29            Urgent            2.946872            1.194154   
128038     18 to 29          Elective            0.027933            0.238421   

        APR DRG Code  APR M

In [None]:
from sklearn.metrics import mean_absolute_error
preds = cb.predict(pool_test)
mae = mean_absolute_error(y_test, preds)
print(f"MAE of CB model with many bins: {mae:.3f}")