In [1]:
import numpy as np
import pandas as pd

# Show every column when a DataFrame is displayed (None disables the column limit).
pd.set_option("display.max_columns", None)


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [3]:
# Print the installed scikit-learn version so the environment used for this run is recorded.
import sklearn
print(sklearn.__version__)

1.3.0


In [4]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('iryss-data.csv')

In [5]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,AGE,SEX,RACE,HOSPID,DRG,NPR,NCHRONIC,ZIPINC_QRTL,PAY1,PAY2,CM_AIDS,CM_ALCOHOL,CM_ANEMDEF,CM_ARTH,CM_BLDLOSS,CM_CHF,CM_DRUG,DXn,TRAN_IN,TRAN_OUT,TOTCHG
0,56,Other,Black,2,ICD-10-CM,1365516690,1,3,Medicare,,no,no,no,no,no,yes,current,3,Transferred from acute care hospital,Not a transfer,43000.0
1,19,Male,Other,9,ICD-10-CM,6951096330,1,2,Medicaid,COBRA Coverage,no,yes,no,no,no,yes,current,4,Not a transfer,Not a transfer,47500.0
2,76,Male,White,3,ICD-10-CM/PCS,5600059825,1,2,Medicare,,no,no,no,no,no,no,current,4,Transferred from another health facility,Not a transfer,3000.0
3,65,Male,Hispanic,2,ICD-9-CM,6280085793,2,3,Medicare,Secondary Health Insurance,no,no,no,no,no,yes,current,4,Not a transfer,Transferred out to acute are hospital,31000.0
4,25,Female,Native American,9,ICD-10-CM,6448432997,2,3,Medicare,Secondary Health Insurance,no,no,no,no,yes,yes,current,4,Not a transfer,Not a transfer,46000.0


In [6]:
# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 898 entries, 0 to 897
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   AGE          898 non-null    int64  
 1   SEX          898 non-null    object 
 2   RACE         898 non-null    object 
 3   HOSPID       898 non-null    int64  
 4   DRG          898 non-null    object 
 5   NPR          898 non-null    int64  
 6   NCHRONIC     898 non-null    int64  
 7   ZIPINC_QRTL  898 non-null    int64  
 8   PAY1         898 non-null    object 
 9   PAY2         898 non-null    object 
 10  CM_AIDS      898 non-null    object 
 11  CM_ALCOHOL   898 non-null    object 
 12  CM_ANEMDEF   898 non-null    object 
 13  CM_ARTH      898 non-null    object 
 14  CM_BLDLOSS   898 non-null    object 
 15  CM_CHF       898 non-null    object 
 16  CM_DRUG      898 non-null    object 
 17  DXn          898 non-null    int64  
 18  TRAN_IN      898 non-null    object 
 19  TRAN_OUT

In [7]:
# Number of missing values per column before cleaning.
df.isnull().sum()

AGE            0
SEX            0
RACE           0
HOSPID         0
DRG            0
NPR            0
NCHRONIC       0
ZIPINC_QRTL    0
PAY1           0
PAY2           0
CM_AIDS        0
CM_ALCOHOL     0
CM_ANEMDEF     0
CM_ARTH        0
CM_BLDLOSS     0
CM_CHF         0
CM_DRUG        0
DXn            0
TRAN_IN        0
TRAN_OUT       1
TOTCHG         1
dtype: int64

In [8]:
# Drop the rows where TRAN_OUT or TOTCHG is missing.
# `df` is rebound to the filtered frame instead of being mutated with
# inplace=True; the surviving rows keep their original index labels.
df = df.dropna(subset=['TRAN_OUT', 'TOTCHG'])

In [9]:
# Re-check the per-column missing-value counts after dropping incomplete rows.
df.isnull().sum()

AGE            0
SEX            0
RACE           0
HOSPID         0
DRG            0
NPR            0
NCHRONIC       0
ZIPINC_QRTL    0
PAY1           0
PAY2           0
CM_AIDS        0
CM_ALCOHOL     0
CM_ANEMDEF     0
CM_ARTH        0
CM_BLDLOSS     0
CM_CHF         0
CM_DRUG        0
DXn            0
TRAN_IN        0
TRAN_OUT       0
TOTCHG         0
dtype: int64

In [10]:
# Step 1: hold out 20% of the rows as a test set.
# TOTCHG is the regression target; every other column is a candidate feature.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['TOTCHG']),
    df['TOTCHG'],
    test_size=0.2,
    random_state=42,  # fixed seed so the split is the same on every run
)

In [11]:
# Preview the training features (the target column has been removed).
X_train.head()

Unnamed: 0,AGE,SEX,RACE,HOSPID,DRG,NPR,NCHRONIC,ZIPINC_QRTL,PAY1,PAY2,CM_AIDS,CM_ALCOHOL,CM_ANEMDEF,CM_ARTH,CM_BLDLOSS,CM_CHF,CM_DRUG,DXn,TRAN_IN,TRAN_OUT
870,27,Male,Other,7,ICD-10-CM,8738844579,2,3,Medicare,Employer-Sponsored Plans,no,no,no,no,yes,no,current,4,Transferred from another health facility,Transferred out to acute are hospital
740,43,Male,Black,4,ICD-10-CM/PCS,1884677531,1,2,Medicaid,,no,no,no,no,no,yes,current,3,Transferred from another health facility,Transferred out to another health facility
231,28,Male,Other,7,ICD-10-PCS,3456703593,1,2,Medicaid,Secondary Health Insurance,no,no,no,no,yes,no,former,4,Transferred from another health facility,Transferred out to acute are hospital
388,66,Female,Black,1,ICD-10-CM,5389740698,1,1,Medicare,Employer-Sponsored Plans,yes,no,no,no,no,yes,current,4,Not a transfer,Transferred out to acute are hospital
118,56,Other,White,7,ICD-9-CM,4526086619,1,2,Medicaid,Secondary Health Insurance,no,yes,no,no,no,no,never,4,Transferred from acute care hospital,Not a transfer


In [12]:
# Peek at five training targets; the fixed seed keeps the displayed sample
# identical across re-runs instead of changing on every execution.
y_train.sample(5, random_state=42)

571    10000.0
161     2000.0
451    29000.0
126    52000.0
142    43000.0
Name: TOTCHG, dtype: float64

In [13]:
# Columns sent to the numeric (scaling) branch of the preprocessor.
# NOTE(review): HOSPID and NPR look like identifiers rather than quantities in
# the previewed rows -- confirm they should be treated as numeric features.
numeric_features = [
    'AGE',
    'HOSPID',
    'NPR',
    'NCHRONIC',
    'ZIPINC_QRTL',
    'DXn',
]

# Columns sent to the one-hot-encoding branch of the preprocessor.
# NOTE(review): SEX is an object column of the frame but appears in neither
# list -- confirm that leaving it out of the model is intentional.
categorical_features = [
    'RACE',
    'DRG',
    'PAY1',
    'PAY2',
    'CM_AIDS',
    'CM_ALCOHOL',
    'CM_ANEMDEF',
    'CM_ARTH',
    'CM_BLDLOSS',
    'CM_CHF',
    'CM_DRUG',
    'TRAN_IN',
    'TRAN_OUT',
]

In [14]:
# Numeric branch: rescale each numeric column to the range [-1, 1].
numeric_transformer = Pipeline(
    steps=[
        ('scaler', MinMaxScaler(feature_range=(-1, 1))),
    ]
)

In [15]:
# Categorical branch: one-hot encode into a sparse matrix.
# handle_unknown='ignore' encodes categories not seen during fit as all zeros
# instead of raising at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(sparse_output=True, handle_unknown='ignore')),
    ]
)

In [16]:
# Route each group of columns to its own transformer. No `remainder` is given,
# so ColumnTransformer's default applies and any column that is in neither
# feature list is dropped from the model input.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
)

In [17]:
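#Feature Selection (currently disabled)
# NOTE: chi2 is a classification score function and requires non-negative
# features, so it does not fit this regression pipeline (the numeric features
# are scaled to [-1, 1]); a regression scorer such as f_regression would be needed.
#feature_selection = SelectKBest(score_func=chi2,k=8)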

In [18]:
# Instantiate the XGBoost regressor; it is not fitted here -- fitting happens
# when the full pipeline is fitted.
xgb_regressor=xgb.XGBRegressor(eval_metric='rmsle')

# Create Pipeline

In [19]:
# Render estimators as an HTML diagram when they are displayed in the notebook.
from sklearn import set_config
set_config(display='diagram')

In [20]:
# Full model: preprocessing followed by the XGBoost regressor, so the same
# transformations are applied at fit time and at predict time.
# NOTE(review): the first step name is spelled 'precprocessor'. It is kept
# as-is because the name is the key used in `pipeline.named_steps` and the
# prefix for nested parameter names; rename only together with every user.
pipeline = Pipeline(
    steps=[
        ('precprocessor', preprocessor),
        ('xgb_regressor', xgb_regressor),
    ]
)

In [21]:
# Fit the preprocessing steps and the regressor on the training split only.
pipeline.fit(X_train,y_train)

In [22]:
# Inspect the fitted pipeline's steps by name.
pipeline.named_steps

{'precprocessor': ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('scaler',
                                                   MinMaxScaler(feature_range=(-1,
                                                                               1)))]),
                                  ['AGE', 'HOSPID', 'NPR', 'NCHRONIC',
                                   'ZIPINC_QRTL', 'DXn']),
                                 ('cat',
                                  Pipeline(steps=[('onehot',
                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                  ['RACE', 'DRG', 'PAY1', 'PAY2', 'CM_AIDS',
                                   'CM_ALCOHOL', 'CM_ANEMDEF', 'CM_ARTH',
                                   'CM_BLDLOSS', 'CM_CHF', 'CM_DRUG', 'TRAN_IN',
                                   'TRAN_OUT'])]),
 'xgb_regressor': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_

In [23]:
# Predict TOTCHG for the held-out test rows (preprocessing is applied by the pipeline).
final_predictions = pipeline.predict(X_test)

In [24]:
# Display the predicted charges for the test set.
final_predictions

array([49151.04   , 32851.95   , 72262.875  , 41210.047  , 29454.957  ,
         618.0651 , 31176.633  ,   439.86176, 38300.977  ,  6258.466  ,
       42524.242  ,  6791.759  , 32468.629  , 47579.14   , 31544.283  ,
       37700.75   , 48643.38   ,   715.22815, 48460.133  ,  2709.0989 ,
       44366.965  , 64437.223  , 30616.898  , 40548.91   , 10771.756  ,
       41819.875  , 67439.04   ,  2017.084  , 68111.17   ,  2186.3008 ,
       60907.086  ,  4617.1895 , 49445.223  , 39908.492  ,  2412.029  ,
        3921.2463 , 42987.04   , 41694.09   , 59219.582  , 47584.47   ,
        4877.9487 , 30025.846  , 38239.14   , 70641.53   , 39141.29   ,
       70317.57   , 65602.25   ,  5778.4062 , 69063.81   , 11678.594  ,
       41639.36   , 35784.04   , 68667.25   , 29686.951  , 41539.324  ,
       65563.42   , 36145.086  ,  7241.5684 ,  3920.29   ,  2187.7007 ,
       32308.848  , 43578.566  ,  3603.439  , 33620.19   , 63076.9    ,
       37629.332  , 75115.71   , 29529.879  , 41527.812  , 12154

In [25]:
# Root-mean-squared error on the held-out test set, in the units of TOTCHG.
# The square root is taken explicitly instead of passing `squared=False`:
# that argument is deprecated since scikit-learn 1.4 and removed in 1.6,
# whereas this form gives the same value on every version.
xgb_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
xgb_rmse

1606.4094090173187

In [26]:
# Score on the test set; Pipeline.score delegates to the final estimator's
# score method (presumably R^2 for this regressor -- confirm against the xgboost docs).
pipeline.score(X_test, y_test)

0.994597060987101

# Cross Validation using Pipeline

In [27]:
# Cross-validation produced no change in the error.

# Fine-Tune Your Model
Grid Search

# Specify the hyperparameter space

# Instantiate the GridSearchCV model

# Print the tuned parameters


# Exporting the Pipeline