In [2]:
import numpy as np
import pandas as pd

# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree  import DecisionTreeRegressor

In [4]:
# Record the scikit-learn version this notebook was executed with.
import sklearn
print(sklearn.__version__)

1.3.0


In [5]:
# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv('iryss-data.csv')

In [6]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,AGE,SEX,RACE,HOSPID,DRG,NPR,NCHRONIC,ZIPINC_QRTL,PAY1,PAY2,CM_AIDS,CM_ALCOHOL,CM_ANEMDEF,CM_ARTH,CM_BLDLOSS,CM_CHF,CM_DRUG,DXn,TRAN_IN,TRAN_OUT,TOTCHG
0,56,Other,Black,2,ICD-10-CM,1365516690,1,3,Medicare,,no,no,no,no,no,yes,current,3,Transferred from acute care hospital,Not a transfer,43000.0
1,19,Male,Other,9,ICD-10-CM,6951096330,1,2,Medicaid,COBRA Coverage,no,yes,no,no,no,yes,current,4,Not a transfer,Not a transfer,47500.0
2,76,Male,White,3,ICD-10-CM/PCS,5600059825,1,2,Medicare,,no,no,no,no,no,no,current,4,Transferred from another health facility,Not a transfer,3000.0
3,65,Male,Hispanic,2,ICD-9-CM,6280085793,2,3,Medicare,Secondary Health Insurance,no,no,no,no,no,yes,current,4,Not a transfer,Transferred out to acute are hospital,31000.0
4,25,Female,Native American,9,ICD-10-CM,6448432997,2,3,Medicare,Secondary Health Insurance,no,no,no,no,yes,yes,current,4,Not a transfer,Not a transfer,46000.0


In [7]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 898 entries, 0 to 897
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   AGE          898 non-null    int64  
 1   SEX          898 non-null    object 
 2   RACE         898 non-null    object 
 3   HOSPID       898 non-null    int64  
 4   DRG          898 non-null    object 
 5   NPR          898 non-null    int64  
 6   NCHRONIC     898 non-null    int64  
 7   ZIPINC_QRTL  898 non-null    int64  
 8   PAY1         898 non-null    object 
 9   PAY2         898 non-null    object 
 10  CM_AIDS      898 non-null    object 
 11  CM_ALCOHOL   898 non-null    object 
 12  CM_ANEMDEF   898 non-null    object 
 13  CM_ARTH      898 non-null    object 
 14  CM_BLDLOSS   898 non-null    object 
 15  CM_CHF       898 non-null    object 
 16  CM_DRUG      898 non-null    object 
 17  DXn          898 non-null    int64  
 18  TRAN_IN      898 non-null    object 
 19  TRAN_OUT

In [8]:
# Count missing values per column.
df.isnull().sum()

AGE            0
SEX            0
RACE           0
HOSPID         0
DRG            0
NPR            0
NCHRONIC       0
ZIPINC_QRTL    0
PAY1           0
PAY2           0
CM_AIDS        0
CM_ALCOHOL     0
CM_ANEMDEF     0
CM_ARTH        0
CM_BLDLOSS     0
CM_CHF         0
CM_DRUG        0
DXn            0
TRAN_IN        0
TRAN_OUT       1
TOTCHG         1
dtype: int64

In [9]:
# Discard rows that have no TRAN_OUT value or no TOTCHG value.
# Rebinding `df` (rather than mutating in place) yields the same frame.
df = df.dropna(subset=['TRAN_OUT', 'TOTCHG'])

In [10]:
# Re-count missing values per column to confirm none remain.
df.isnull().sum()

AGE            0
SEX            0
RACE           0
HOSPID         0
DRG            0
NPR            0
NCHRONIC       0
ZIPINC_QRTL    0
PAY1           0
PAY2           0
CM_AIDS        0
CM_ALCOHOL     0
CM_ANEMDEF     0
CM_ARTH        0
CM_BLDLOSS     0
CM_CHF         0
CM_DRUG        0
DXn            0
TRAN_IN        0
TRAN_OUT       0
TOTCHG         0
dtype: int64

In [11]:
# Step 1: separate the target (TOTCHG) from the features, then hold out 20% of
# the rows for testing. The seed is fixed so the split is reproducible.
features = df.drop(columns=['TOTCHG'])
target = df['TOTCHG']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,AGE,SEX,RACE,HOSPID,DRG,NPR,NCHRONIC,ZIPINC_QRTL,PAY1,PAY2,CM_AIDS,CM_ALCOHOL,CM_ANEMDEF,CM_ARTH,CM_BLDLOSS,CM_CHF,CM_DRUG,DXn,TRAN_IN,TRAN_OUT
870,27,Male,Other,7,ICD-10-CM,8738844579,2,3,Medicare,Employer-Sponsored Plans,no,no,no,no,yes,no,current,4,Transferred from another health facility,Transferred out to acute are hospital
740,43,Male,Black,4,ICD-10-CM/PCS,1884677531,1,2,Medicaid,,no,no,no,no,no,yes,current,3,Transferred from another health facility,Transferred out to another health facility
231,28,Male,Other,7,ICD-10-PCS,3456703593,1,2,Medicaid,Secondary Health Insurance,no,no,no,no,yes,no,former,4,Transferred from another health facility,Transferred out to acute are hospital
388,66,Female,Black,1,ICD-10-CM,5389740698,1,1,Medicare,Employer-Sponsored Plans,yes,no,no,no,no,yes,current,4,Not a transfer,Transferred out to acute are hospital
118,56,Other,White,7,ICD-9-CM,4526086619,1,2,Medicaid,Secondary Health Insurance,no,yes,no,no,no,no,never,4,Transferred from acute care hospital,Not a transfer


In [13]:
# Show five randomly chosen training targets; no random_state is passed, so the
# selected rows differ from run to run.
y_train.sample(5)

468    41500.0
301     4000.0
453    71000.0
410    33000.0
631    42000.0
Name: TOTCHG, dtype: float64

In [14]:
# Column names treated as numeric inputs.
numeric_features = [
    'AGE',
    'HOSPID',
    'NPR',
    'NCHRONIC',
    'ZIPINC_QRTL',
    'DXn',
]

# Column names treated as categorical inputs.
# NOTE(review): SEX is present in the data but appears in neither list —
# confirm that leaving it out of the model is intentional.
categorical_features = [
    'RACE',
    'DRG',
    'PAY1',
    'PAY2',
    'CM_AIDS',
    'CM_ALCOHOL',
    'CM_ANEMDEF',
    'CM_ARTH',
    'CM_BLDLOSS',
    'CM_CHF',
    'CM_DRUG',
    'TRAN_IN',
    'TRAN_OUT',
]

In [15]:
# Sub-pipeline for numeric columns: rescale each one into the range [-1, 1].
numeric_transformer = Pipeline(
    steps=[
        ('scaler', MinMaxScaler(feature_range=(-1, 1))),
    ]
)

In [16]:
# Sub-pipeline for categorical columns: one-hot encode into a sparse matrix.
# handle_unknown='ignore' keeps transform() from raising on categories that
# were not seen during fit.
onehot_encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[('onehot', onehot_encoder)])

In [17]:
# Route each column group to its own sub-pipeline.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [18]:
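# Feature selection (disabled).
# NOTE: chi2 requires non-negative feature values, but MinMaxScaler(feature_range=(-1, 1))
# produces negative values; chi2 is also a classification score, so a regression
# score function would be needed for the continuous TOTCHG target.
#feature_selection = SelectKBest(score_func=chi2,k=8)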

In [19]:
# Estimator to train; the seed is fixed so repeated runs build the same forest.
random_forest = RandomForestRegressor(
    random_state=42,
)
# Alternative kept for reference: DecisionTreeRegressor(random_state=42)

# Create Pipeline

In [20]:
# Ask scikit-learn to render estimators as diagrams when they are displayed.
from sklearn import set_config
set_config(display='diagram')

In [21]:
# Chain preprocessing and the regressor so both are fitted and applied together.
# NOTE(review): the step key 'precprocessor' looks like a typo for 'preprocessor';
# it is left unchanged because the key is part of the fitted object's interface.
pipeline_steps = [
    ('precprocessor', preprocessor),
    # ('feature_selection', feature_selection),  # disabled
    ('random_forest', random_forest),
]
model = Pipeline(steps=pipeline_steps)

In [22]:
# Fit the preprocessing steps and the forest on the training split.
model.fit(X=X_train, y=y_train)

In [23]:
# Inspect the pipeline's steps by name.
model.named_steps

{'precprocessor': ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('scaler',
                                                   MinMaxScaler(feature_range=(-1,
                                                                               1)))]),
                                  ['AGE', 'HOSPID', 'NPR', 'NCHRONIC',
                                   'ZIPINC_QRTL', 'DXn']),
                                 ('cat',
                                  Pipeline(steps=[('onehot',
                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                  ['RACE', 'DRG', 'PAY1', 'PAY2', 'CM_AIDS',
                                   'CM_ALCOHOL', 'CM_ANEMDEF', 'CM_ARTH',
                                   'CM_BLDLOSS', 'CM_CHF', 'CM_DRUG', 'TRAN_IN',
                                   'TRAN_OUT'])]),
 'random_forest': RandomForestRegressor(random_state=42)}

In [24]:
# Predict the target for the test features with the fitted pipeline.
final_predictions = model.predict(X_test)

In [25]:
# Display the full array of test-set predictions.
final_predictions

array([46565., 33310., 68960., 39595., 30675.,  1370., 32090.,  1520.,
       38840.,  6310., 41595.,  6505., 33445., 44605., 31560., 34800.,
       45110.,  2645., 46435.,  4010., 42505., 65140., 33090., 44835.,
        8520., 41665., 68115.,  3505., 68800.,  2220., 62390.,  3860.,
       51045., 40355.,  3275.,  3385., 42630., 46265., 59490., 46110.,
        4980., 30955., 38770., 68295., 40330., 68605., 65715.,  6910.,
       68495.,  8805., 40850., 35880., 67705., 34695., 43860., 65515.,
       34560.,  5285.,  4185.,  3175., 30250., 43805.,  4840., 34910.,
       61375., 35270., 69910., 31480., 43275., 12420.,  7110., 37185.,
       63405., 40280.,  5615.,  5685., 65420., 36860.,  7330.,  5585.,
       44140., 70610., 67625.,  6135., 43885., 47185., 41785., 47390.,
       45660., 71565., 39805., 40765., 56375.,  6705.,  5050., 49010.,
       41990., 69005.,  4560., 70040.,  3205., 41195., 44900., 44655.,
       33975.,  3260., 34000., 40865., 63700., 44180., 35005., 63905.,
      

In [27]:
# Test-set RMSE of the fitted pipeline.
# np.sqrt(MSE) is used instead of `squared=False`, which newer scikit-learn
# releases deprecate, so this cell runs unchanged on old and new versions.
# NOTE: despite its name, `tree_rmse` measures the random-forest pipeline.
tree_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
tree_rmse

2798.5014094848852

In [26]:
# Score the fitted pipeline on the test split (R^2 for scikit-learn regressors).
model.score(X_test,y_test)

0.9836028395300327

# Cross Validation using Pipeline

In [27]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split. The scorer returns negated
# RMSE, so the sign is flipped to get positive error values per fold.
neg_rmse_per_fold = cross_val_score(
    model,
    X_train,
    y_train,
    scoring="neg_root_mean_squared_error",
    cv=10,
)
forest_rmses = -neg_rmse_per_fold

In [28]:
# Summary statistics of the per-fold RMSE values.
pd.Series(forest_rmses).describe()

count      10.000000
mean     2891.738398
std       263.713624
min      2448.334372
25%      2797.503593
50%      2915.652231
75%      3072.166396
max      3241.605933
dtype: float64

In [29]:
# Re-fit the pipeline on the whole training split and measure RMSE on the
# test split.
model.fit(X_train, y_train)
final_predictions = model.predict(X_test)
# np.sqrt(MSE) replaces `squared=False`, which newer scikit-learn releases
# deprecate; the computed value is the same.
forest_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
forest_rmse

2798.5014094848852

In [30]:
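# The test RMSE is unchanged, as expected: cross_val_score only evaluates clones of
# the pipeline on folds of the training data; it does not tune or modify `model`.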

# Fine-Tune Your Model
Grid Search

In [31]:
from sklearn.model_selection import GridSearchCV

# Search over the forest's max_depth (None leaves the depth unlimited),
# scoring each candidate by negated RMSE under 5-fold cross-validation.
depth_candidates = [1, 2, 3, 4, 5, None]
grid_params = [{"random_forest__max_depth": depth_candidates}]
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid_params,
    scoring='neg_root_mean_squared_error',
    cv=5,
)

In [32]:
# Run the grid search on the training split.
grid_search.fit(X_train,y_train)

In [33]:

# Best mean cross-validated score found by the search; it is negative because
# the scorer is 'neg_root_mean_squared_error'.
grid_search.best_score_

-2958.5210197577285

In [34]:
# Evaluate the tuned estimator on the test split.
# Predictions must come from the fitted search object, which delegates to the
# estimator refit with the best parameters. Predicting with `model` would only
# re-score the untuned pipeline and repeat the earlier RMSE.
final_predictions_cv = grid_search.predict(X_test)
# np.sqrt(MSE) replaces `squared=False`, which newer scikit-learn releases deprecate.
forest_rmse_cv = np.sqrt(mean_squared_error(y_test, final_predictions_cv))
forest_rmse_cv

2798.5014094848852

# Exporting the Pipeline

In [73]:
# export 
import pickle

In [74]:
# Serialise the fitted pipeline with pickle. The context manager closes the
# file handle even if pickling raises, so the file is always flushed and released.
with open('iryss-model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [75]:
import joblib

In [29]:
# Serialise the fitted pipeline with joblib. The context manager closes the
# file handle even if dumping raises, so the file is always flushed and released.
with open('iryss-model-joblib.pkl', 'wb') as model_file:
    joblib.dump(model, model_file)