In [None]:
# Import all relevant libraries
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the dataset from a hard-coded absolute path and keep it as `df`.
# NOTE(review): the path looks environment-specific — confirm it exists before running elsewhere.
df = pd.read_csv("/content/NY.csv")
# Preview the first rows as a quick check of the load.
df.head()

Unnamed: 0,Health Service Area,Hospital County,Operating Certificate Number,Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Abortion Edit Indicator,Emergency Department Indicator,Total Charges,Total Costs
0,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,70 or Older,147,F,White,Not Span/Hispanic,...,Minor,Medical,Medicare,Private Health Insurance,,0.0,N,Y,5333.9,4818.42
1,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,70 or Older,147,F,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,,,0.0,N,Y,4865.99,4588.78
2,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,70 or Older,147,F,White,Not Span/Hispanic,...,Minor,Medical,Medicare,Private Health Insurance,,0.0,N,Y,5901.54,5559.56
3,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,50 to 69,147,F,White,Not Span/Hispanic,...,Minor,Medical,Medicare,,,0.0,N,Y,3619.08,3567.25
4,Western NY,Allegany,226700.0,37.0,Cuba Memorial Hospital Inc,70 or Older,147,M,White,Not Span/Hispanic,...,Moderate,Medical,Medicare,Blue Cross/Blue Shield,Private Health Insurance,0.0,N,Y,3185.87,3167.89


In [None]:
# Count missing values in each column.
df.isnull().sum()

Health Service Area                     20
Hospital County                         20
Operating Certificate Number            20
Facility Id                             20
Facility Name                            0
Age Group                                0
Zip Code - 3 digits                     48
Gender                                   0
Race                                     0
Ethnicity                                0
Length of Stay                           1
Type of Admission                        1
Patient Disposition                      1
Discharge Year                           1
CCS Diagnosis Code                       1
CCS Diagnosis Description                1
CCS Procedure Code                       1
CCS Procedure Description                1
APR DRG Code                             1
APR DRG Description                      1
APR MDC Code                             1
APR MDC Description                      1
APR Severity of Illness Code             1
APR Severit

In [None]:
# Display the 'Length of Stay' column (later binned into the prediction label).
df['Length of Stay']

0       4.0
1       4.0
2       4.0
3       2.0
4       2.0
       ... 
2629    1.0
2630    3.0
2631    3.0
2632    4.0
2633    NaN
Name: Length of Stay, Length: 2634, dtype: float64

In [None]:
# Discard rows where 'Length of Stay' is missing; other columns are not considered here.
df = df.dropna(subset=['Length of Stay'])

In [None]:
# Confirm that no missing 'Length of Stay' values remain.
df['Length of Stay'].isnull().sum()

0

In [None]:
def _first_token(value):
  """Return the text of ``value`` that precedes its first space."""
  return str(value).split(' ')[0]

# Keep only the leading token of each entry, then convert the column to a numeric dtype.
stay_tokens = df['Length of Stay'].map(_first_token)
df['Length of Stay'] = pd.to_numeric(stay_tokens)

In [None]:
# Average length of stay over the remaining rows.
df['Length of Stay'].mean()

3.831371059627801

In [None]:
# List the dtype of every column.
df.dtypes

Health Service Area                     object
Hospital County                         object
Operating Certificate Number           float64
Facility Id                            float64
Facility Name                           object
Age Group                               object
Zip Code - 3 digits                     object
Gender                                  object
Race                                    object
Ethnicity                               object
Length of Stay                         float64
Type of Admission                       object
Patient Disposition                     object
Discharge Year                         float64
CCS Diagnosis Code                     float64
CCS Diagnosis Description               object
CCS Procedure Code                     float64
CCS Procedure Description               object
APR DRG Code                           float64
APR DRG Description                     object
APR MDC Code                           float64
APR MDC Descr

In [None]:
# Columns removed before modelling; every column not listed here is kept.
columns_to_drop = [
    "Facility Id",
    "Total Charges",
    "Total Costs",
    "Health Service Area",
    "Hospital County",
    "Zip Code - 3 digits",
    "Race",
    "Ethnicity",
    "Patient Disposition",
    "Birth Weight",
    "Payment Typology 3",
    "Payment Typology 2",
    "Operating Certificate Number",
    "Facility Name",
    "Gender",
    "CCS Diagnosis Description",
    "CCS Procedure Description",
    "APR DRG Description",
    "APR MDC Description",
    "APR Severity of Illness Description",
    "APR Medical Surgical Description",
    "Abortion Edit Indicator",
    "Discharge Year",
]
df = df.drop(columns=columns_to_drop)

In [None]:
# Count missing values per column after the column drop.
df.isna().sum()

Age Group                         0
Length of Stay                    0
Type of Admission                 0
CCS Diagnosis Code                0
CCS Procedure Code                0
APR DRG Code                      0
APR MDC Code                      0
APR Severity of Illness Code      0
APR Risk of Mortality             0
Payment Typology 1                0
Emergency Department Indicator    0
dtype: int64

In [None]:
# List the dtypes of the columns that remain after the drop.
df.dtypes

Age Group                          object
Length of Stay                    float64
Type of Admission                  object
CCS Diagnosis Code                float64
CCS Procedure Code                float64
APR DRG Code                      float64
APR MDC Code                      float64
APR Severity of Illness Code      float64
APR Risk of Mortality              object
Payment Typology 1                 object
Emergency Department Indicator     object
dtype: object

In [None]:
# Rank tables for the two text columns that are encoded as integers.
mort_string_index = {
    'Minor': 1,
    'Moderate': 2,
    'Major': 3,
    'Extreme': 4,
}
age_string_index = {
    '0 to 17': 1,
    '18 to 29': 2,
    '30 to 49': 3,
    '50 to 69': 4,
    '70 or Older': 5,
}

# Swap each label for its rank; a label absent from the table raises KeyError.
for column, ranks in (('Age Group', age_string_index), ('APR Risk of Mortality', mort_string_index)):
  df[column] = df[column].map(ranks.__getitem__)
df.head()

Unnamed: 0,Age Group,Length of Stay,Type of Admission,CCS Diagnosis Code,CCS Procedure Code,APR DRG Code,APR MDC Code,APR Severity of Illness Code,APR Risk of Mortality,Payment Typology 1,Emergency Department Indicator
0,5,4.0,Urgent,122.0,0.0,139.0,4.0,2.0,1,Medicare,Y
1,5,4.0,Elective,55.0,0.0,422.0,10.0,2.0,2,Medicare,Y
2,5,4.0,Urgent,122.0,202.0,139.0,4.0,1.0,1,Medicare,Y
3,4,2.0,Elective,55.0,0.0,249.0,6.0,2.0,1,Medicare,Y
4,5,2.0,Elective,122.0,0.0,139.0,4.0,1.0,2,Medicare,Y


In [None]:
# Expand the remaining text columns into indicator columns with pandas' get_dummies defaults.
one_hot_encoded_df = pd.get_dummies(df)

In [None]:
# Preview the encoded frame.
one_hot_encoded_df.head()

Unnamed: 0,Age Group,Length of Stay,CCS Diagnosis Code,CCS Procedure Code,APR DRG Code,APR MDC Code,APR Severity of Illness Code,APR Risk of Mortality,Type of Admission_Elective,Type of Admission_Emergency,...,Type of Admission_Urgent,Payment Typology 1_Blue Cross/Blue Shield,Payment Typology 1_Federal/State/Local/VA,Payment Typology 1_Medicaid,Payment Typology 1_Medicare,Payment Typology 1_Miscellaneous/Other,Payment Typology 1_Private Health Insurance,Payment Typology 1_Self-Pay,Emergency Department Indicator_N,Emergency Department Indicator_Y
0,5,4.0,122.0,0.0,139.0,4.0,2.0,1,0,0,...,1,0,0,0,1,0,0,0,0,1
1,5,4.0,55.0,0.0,422.0,10.0,2.0,2,1,0,...,0,0,0,0,1,0,0,0,0,1
2,5,4.0,122.0,202.0,139.0,4.0,1.0,1,0,0,...,1,0,0,0,1,0,0,0,0,1
3,4,2.0,55.0,0.0,249.0,6.0,2.0,1,1,0,...,0,0,0,0,1,0,0,0,0,1
4,5,2.0,122.0,0.0,139.0,4.0,1.0,2,1,0,...,0,0,0,0,1,0,0,0,0,1


In [None]:
# List the dtypes of the encoded frame.
one_hot_encoded_df.dtypes

Age Group                                        int64
Length of Stay                                 float64
CCS Diagnosis Code                             float64
CCS Procedure Code                             float64
APR DRG Code                                   float64
APR MDC Code                                   float64
APR Severity of Illness Code                   float64
APR Risk of Mortality                            int64
Type of Admission_Elective                       uint8
Type of Admission_Emergency                      uint8
Type of Admission_Newborn                        uint8
Type of Admission_Not Available                  uint8
Type of Admission_Trauma                         uint8
Type of Admission_Urgent                         uint8
Payment Typology 1_Blue Cross/Blue Shield        uint8
Payment Typology 1_Federal/State/Local/VA        uint8
Payment Typology 1_Medicaid                      uint8
Payment Typology 1_Medicare                      uint8
Payment Ty

In [None]:
def normalize_df(df):
  """Min-max scale every column of ``df`` to the range [0, 1].

  Each column is transformed as ``(x - min) / (max - min)`` using that
  column's own minimum and maximum.

  Args:
    df: DataFrame whose columns are all numeric or boolean.

  Returns:
    A new float DataFrame with the same index and columns; ``df`` itself
    is left untouched. A column holding a single distinct value is mapped
    to 0.0 instead of the NaN that a 0/0 division would produce.
  """
  # astype(float) copies the frame (so the caller's data is not overwritten)
  # and lets bool/uint8 indicator columns take part in the subtraction below.
  scaled = df.astype(float)
  for col in scaled.columns:
    lowest = scaled[col].min()
    span = scaled[col].max() - lowest
    if span == 0:
      # Constant column: only shift it, giving 0.0 while keeping NaN as NaN.
      scaled[col] = scaled[col] - lowest
    else:
      scaled[col] = (scaled[col] - lowest) / span
  return scaled

# Scale every column of the encoded frame with normalize_df. This includes
# 'Length of Stay', so that column no longer holds day counts in normalized_df.
normalized_df = normalize_df(one_hot_encoded_df)
normalized_df.head()

Unnamed: 0,Age Group,Length of Stay,CCS Diagnosis Code,CCS Procedure Code,APR DRG Code,APR MDC Code,APR Severity of Illness Code,APR Risk of Mortality,Type of Admission_Elective,Type of Admission_Emergency,...,Type of Admission_Urgent,Payment Typology 1_Blue Cross/Blue Shield,Payment Typology 1_Federal/State/Local/VA,Payment Typology 1_Medicaid,Payment Typology 1_Medicare,Payment Typology 1_Miscellaneous/Other,Payment Typology 1_Private Health Insurance,Payment Typology 1_Self-Pay,Emergency Department Indicator_N,Emergency Department Indicator_Y
0,1.0,0.025862,0.179641,0.0,0.141499,0.125,0.333333,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.025862,0.079341,0.0,0.440338,0.375,0.333333,0.333333,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.025862,0.179641,0.874459,0.141499,0.125,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.75,0.008621,0.079341,0.0,0.257656,0.208333,0.333333,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.008621,0.179641,0.0,0.141499,0.125,0.0,0.333333,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Length-of-stay classes in days: (0, 4], (4, 8], (8, 30], (30, 120].
# Each class is labelled with its upper edge.
bins = [0,4,8,30,120]
labels = [4,8,30,120]
# Bin the raw day counts held in `df`. normalized_df['Length of Stay'] has been
# min-max scaled into [0, 1], so cutting it with day-based edges puts every row
# in the first class and turns the minimum-stay rows (scaled to exactly 0) into NaN.
# The assignment aligns on the index, which normalized_df inherits from df.
stay_days = df['Length of Stay']
normalized_df['stay_bin'] = pd.cut(x = stay_days, bins = bins)
normalized_df['stay_label'] = pd.cut(x = stay_days, bins = bins, labels = labels)
# Turn the interval text into a display form, e.g. "(0, 4]" -> "(0 - 4]".
normalized_df['stay_bin'] = normalized_df['stay_bin'].apply(lambda x: str(x).replace(',',' -'))
normalized_df['stay_bin'] = normalized_df['stay_bin'].apply(lambda x: str(x).replace('120','120+'))
# Fail here with a clear message rather than later inside model fitting.
assert normalized_df['stay_label'].notna().all(), "some stays fall outside the (0, 120] day bins"
normalized_df.head()

Unnamed: 0,Age Group,Length of Stay,CCS Diagnosis Code,CCS Procedure Code,APR DRG Code,APR MDC Code,APR Severity of Illness Code,APR Risk of Mortality,Type of Admission_Elective,Type of Admission_Emergency,...,Payment Typology 1_Federal/State/Local/VA,Payment Typology 1_Medicaid,Payment Typology 1_Medicare,Payment Typology 1_Miscellaneous/Other,Payment Typology 1_Private Health Insurance,Payment Typology 1_Self-Pay,Emergency Department Indicator_N,Emergency Department Indicator_Y,stay_bin,stay_label
0,1.0,0.025862,0.179641,0.0,0.141499,0.125,0.333333,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,(0 - 4],4
1,1.0,0.025862,0.079341,0.0,0.440338,0.375,0.333333,0.333333,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,(0 - 4],4
2,1.0,0.025862,0.179641,0.874459,0.141499,0.125,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,(0 - 4],4
3,0.75,0.008621,0.079341,0.0,0.257656,0.208333,0.333333,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,(0 - 4],4
4,1.0,0.008621,0.179641,0.0,0.141499,0.125,0.0,0.333333,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,(0 - 4],4


In [None]:
# Count missing values per column, now including stay_bin and stay_label.
normalized_df.isna().sum()

Age Group                                        0
Length of Stay                                   0
CCS Diagnosis Code                               0
CCS Procedure Code                               0
APR DRG Code                                     0
APR MDC Code                                     0
APR Severity of Illness Code                     0
APR Risk of Mortality                            0
Type of Admission_Elective                       0
Type of Admission_Emergency                      0
Type of Admission_Newborn                        0
Type of Admission_Not Available                  0
Type of Admission_Trauma                         0
Type of Admission_Urgent                         0
Payment Typology 1_Blue Cross/Blue Shield        0
Payment Typology 1_Federal/State/Local/VA        0
Payment Typology 1_Medicaid                      0
Payment Typology 1_Medicare                      0
Payment Typology 1_Miscellaneous/Other           0
Payment Typology 1_Private Heal

In [None]:
# Features: every column except the length-of-stay column and the two columns derived from it.
new_X = normalized_df.drop(columns=['Length of Stay','stay_bin','stay_label'])
# Target: the binned length-of-stay label, kept as a single-column frame.
new_y = normalized_df[['stay_label']]
# Fixed seed so the 70/30 split (and every score computed from it) is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(new_X, new_y, test_size=0.3, random_state=100)

In [None]:
# Count rows whose stay_label is missing; the classifier cannot be fitted on missing labels.
new_y.isna().sum()

stay_label    429
dtype: int64

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 3 learning rates x 4 ensemble sizes (100, 200, 300, 400).
grid = {
    'learning_rate':[0.01,0.05,0.1],
    'n_estimators':np.arange(100,500,100),
}
# Seed the estimator so repeated searches give the same scores.
gb = GradientBoostingClassifier(random_state=100)
gb_cv = GridSearchCV(gb, grid, cv = 4)
# scikit-learn expects a 1-D target; np.ravel flattens a single-column frame
# and leaves an already 1-D target unchanged.
gb_cv.fit(X_train, np.ravel(y_train))
print("Best Parameters:",gb_cv.best_params_)
# best_score_ is the mean cross-validated score of the best candidate,
# not a score measured on the training set.
print("Best CV Score:",gb_cv.best_score_)
print("Test Score:",gb_cv.score(X_test, np.ravel(y_test)))

ValueError: ignored

In [None]:
# NOTE(review): these hyper-parameters are hard-coded rather than read from
# gb_cv.best_params_, and n_estimators=500 lies outside the searched grid
# (100-400) — confirm this is intentional.
gbc = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.05,
    random_state=100,
    max_features=5,
)
# Flatten the target to 1-D, the shape scikit-learn expects for y.
gbc.fit(X_train, np.ravel(y_train))