In [1]:
#import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import machine learning library
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
    

In [2]:
# Load the training data from CSV into a DataFrame.
# NOTE(review): the relative '../input/...' path presumably targets a Kaggle
# dataset mount — confirm before running elsewhere.
df=pd.read_csv('../input/av-healthcare-analytics-ii/healthcare/train_data.csv')
# Echo the frame as this cell's output.
df

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318433,318434,6,a,6,X,3,radiotherapy,Q,F,4.0,86499,23.0,Emergency,Moderate,3,41-50,4144.0,11-20
318434,318435,24,a,1,X,2,anesthesia,Q,E,4.0,325,8.0,Urgent,Moderate,4,81-90,6699.0,31-40
318435,318436,7,a,4,X,3,gynecology,R,F,4.0,125235,10.0,Emergency,Minor,3,71-80,4235.0,11-20
318436,318437,11,b,2,Y,3,anesthesia,Q,D,3.0,91081,8.0,Trauma,Minor,5,11-20,3761.0,11-20


In [3]:
# Count the missing (NaN) entries in every column.
df.isna().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                             113
patientid                               0
City_Code_Patient                    4532
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
Stay                                    0
dtype: int64

In [4]:
# Mean-imputation helper for columns that contain missing values.
def input_missing_values(df,columns):
    """Replace the NaNs in each listed column with that column's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. The listed columns are overwritten on this object.
    columns : iterable of str
        Names of the columns to impute.

    Returns
    -------
    None
        The frame is changed in place; nothing is returned.
    """
    for name in columns:
        series = df[name]
        # Series.mean() skips NaNs, so the fill value is the mean of the
        # observed entries only.
        column_mean = series.mean()
        df[name] = series.fillna(column_mean)
        

In [5]:
# Columns to impute: the two columns with non-zero counts in the missing-value check.
# NOTE(review): both look like coded/graded values, so mean imputation yields
# fractional codes — confirm this is intended.
columns=['Bed Grade','City_Code_Patient']
# Fill their NaNs with the column mean; df is modified in place.
input_missing_values(df,columns)


In [6]:
# Re-run the missing-value count to confirm the imputation left no NaNs.
df.isna().sum()

case_id                              0
Hospital_code                        0
Hospital_type_code                   0
City_Code_Hospital                   0
Hospital_region_code                 0
Available Extra Rooms in Hospital    0
Department                           0
Ward_Type                            0
Ward_Facility_Code                   0
Bed Grade                            0
patientid                            0
City_Code_Patient                    0
Type of Admission                    0
Severity of Illness                  0
Visitors with Patient                0
Age                                  0
Admission_Deposit                    0
Stay                                 0
dtype: int64

In [7]:
# List the column labels: the index of df.dtypes holds the column names,
# not the dtypes themselves.
df.dtypes.index

Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'patientid', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit', 'Stay'],
      dtype='object')

In [8]:
# Collect the distinct values of every object-dtype column.
def get_categorical_uniques(df):
    """Map each object-dtype column of ``df`` to its unique values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    dict
        ``{column name: array returned by Series.unique()}`` for every column
        whose dtype compares equal to ``'object'``, in column order.
    """
    uniques_by_column = {}
    for name, dtype in df.dtypes.items():
        # Only object-dtype columns are treated as categorical.
        if dtype == 'object':
            uniques_by_column[name] = df[name].unique()
    return uniques_by_column

In [9]:
# Snapshot the unique values of the object-dtype columns. This is taken before
# any of those columns are one-hot encoded in a later cell.
categorical_uniques=get_categorical_uniques(df)

In [10]:
# Columns to one-hot encode (this list is passed to onehot_encode).
onehot_column=['Hospital_type_code','Hospital_region_code',
              'Department','Ward_Type','Ward_Facility_Code']

            

In [11]:
# One-hot encoding helper.
def onehot_encode(df,columns,prefix=False):
    """Return a new frame with each listed column replaced by indicator columns.

    The remaining columns keep their original order and come first; the
    indicator columns follow, grouped in the order the source columns appear
    in ``columns``. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    columns : iterable of str
        Names of the columns to expand with ``pandas.get_dummies``.
    prefix : bool, default False
        If False (the original behaviour), each indicator column is named after
        the bare category value, so two source columns that share a value
        produce duplicate column names. If True, indicator columns are named
        ``"<source column>_<value>"``, which avoids such collisions.

    Returns
    -------
    pandas.DataFrame
        The encoded frame.
    """
    columns = list(columns)
    # Build every indicator block first and concatenate once, rather than
    # copying the whole frame again for each encoded column.
    dummy_frames = [
        pd.get_dummies(df[column], prefix=str(column) if prefix else None)
        for column in columns
    ]
    return pd.concat([df.drop(columns=columns)] + dummy_frames, axis=1)
        
    

In [12]:
# Replace the listed columns with their indicator columns; df is rebound to
# the frame returned by onehot_encode.
df=onehot_encode(df,onehot_column)

In [13]:
categorical_uniques

{'Hospital_type_code': array(['c', 'e', 'b', 'a', 'f', 'd', 'g'], dtype=object),
 'Hospital_region_code': array(['Z', 'X', 'Y'], dtype=object),
 'Department': array(['radiotherapy', 'anesthesia', 'gynecology', 'TB & Chest disease',
        'surgery'], dtype=object),
 'Ward_Type': array(['R', 'S', 'Q', 'P', 'T', 'U'], dtype=object),
 'Ward_Facility_Code': array(['F', 'E', 'D', 'B', 'A', 'C'], dtype=object),
 'Type of Admission': array(['Emergency', 'Trauma', 'Urgent'], dtype=object),
 'Severity of Illness': array(['Extreme', 'Moderate', 'Minor'], dtype=object),
 'Age': array(['51-60', '71-80', '31-40', '41-50', '81-90', '61-70', '21-30',
        '11-20', '0-10', '91-100'], dtype=object),
 'Stay': array(['0-10', '41-50', '31-40', '11-20', '51-60', '21-30', '71-80',
        'More than 100 Days', '81-90', '61-70', '91-100'], dtype=object)}

In [14]:
# Replace each collection of unique values with a sorted Python list.
# This is a plain string sort; the keys of the dict are unchanged.
for column, values in categorical_uniques.items():
    categorical_uniques[column] = sorted(values)

In [15]:
categorical_uniques

{'Hospital_type_code': ['a', 'b', 'c', 'd', 'e', 'f', 'g'],
 'Hospital_region_code': ['X', 'Y', 'Z'],
 'Department': ['TB & Chest disease',
  'anesthesia',
  'gynecology',
  'radiotherapy',
  'surgery'],
 'Ward_Type': ['P', 'Q', 'R', 'S', 'T', 'U'],
 'Ward_Facility_Code': ['A', 'B', 'C', 'D', 'E', 'F'],
 'Type of Admission': ['Emergency', 'Trauma', 'Urgent'],
 'Severity of Illness': ['Extreme', 'Minor', 'Moderate'],
 'Age': ['0-10',
  '11-20',
  '21-30',
  '31-40',
  '41-50',
  '51-60',
  '61-70',
  '71-80',
  '81-90',
  '91-100'],
 'Stay': ['0-10',
  '11-20',
  '21-30',
  '31-40',
  '41-50',
  '51-60',
  '61-70',
  '71-80',
  '81-90',
  '91-100',
  'More than 100 Days']}

In [16]:
# Remove the one-hot encoded columns from the lookup so that only the columns
# still to be encoded remain in categorical_uniques.
one_hot=['Hospital_type_code','Hospital_region_code',
              'Department','Ward_Type','Ward_Facility_Code']
for key in one_hot:
    # pop() with a default keeps the cell re-runnable: on a second run the keys
    # are already gone and nothing happens, where `del` would raise KeyError.
    categorical_uniques.pop(key, None)


In [17]:
# Keep only the columns that are ordinally encoded. Iterating a fixed tuple
# (instead of a set intersection) gives the resulting dict the same key order
# on every run.
ordinal_columns=('Type of Admission','Severity of Illness','Age','Stay')
categorical_uniques={key:categorical_uniques[key] for key in ordinal_columns
                     if key in categorical_uniques}

In [18]:
# Reorder the admission types so that their list positions give the ordinal
# codes Trauma -> 0, Urgent -> 1, Emergency -> 2. The list is edited in place,
# so the entry stored in categorical_uniques changes too.
unique_list=categorical_uniques['Type of Admission']
for level in ('Urgent','Trauma'):
    # Move `level` to the front; 'Trauma' is handled last so it ends up first.
    unique_list.remove(level)
    unique_list.insert(0,level)
unique_list


['Trauma', 'Urgent', 'Emergency']

In [19]:
# Reorder the severity levels so that their list positions give the ordinal
# codes Minor -> 0, Moderate -> 1, Extreme -> 2. The list is edited in place,
# so the entry stored in categorical_uniques changes too.
unique_list=categorical_uniques['Severity of Illness']
for level in ('Moderate','Minor'):
    # Move `level` to the front; 'Minor' is handled last so it ends up first.
    unique_list.remove(level)
    unique_list.insert(0,level)
unique_list

['Minor', 'Moderate', 'Extreme']

In [20]:
categorical_uniques

{'Age': ['0-10',
  '11-20',
  '21-30',
  '31-40',
  '41-50',
  '51-60',
  '61-70',
  '71-80',
  '81-90',
  '91-100'],
 'Type of Admission': ['Trauma', 'Urgent', 'Emergency'],
 'Stay': ['0-10',
  '11-20',
  '21-30',
  '31-40',
  '41-50',
  '51-60',
  '61-70',
  '71-80',
  '81-90',
  '91-100',
  'More than 100 Days'],
 'Severity of Illness': ['Minor', 'Moderate', 'Extreme']}

In [21]:
# Lookup from each 'Stay' label to its position in the ordered label list.
# NOTE(review): stay_mapping is not referenced by any later cell in view.
stay_labels=categorical_uniques['Stay']
stay_mapping=dict(zip(stay_labels,range(len(stay_labels))))
stay_mapping

{'0-10': 0,
 '11-20': 1,
 '21-30': 2,
 '31-40': 3,
 '41-50': 4,
 '51-60': 5,
 '61-70': 6,
 '71-80': 7,
 '81-90': 8,
 '91-100': 9,
 'More than 100 Days': 10}

In [22]:
# Ordinal-encoding helper.
def ordinal_encode(df,unique):
    """Replace category labels with their integer position, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Each encoded column is overwritten on this object.
    unique : dict
        ``{column name: ordered list of labels}``. A label is encoded as its
        first position in the list.

    Returns
    -------
    None
        The frame is changed in place; nothing is returned.

    Raises
    ------
    ValueError
        If a column holds a value that is missing from its label list. This
        also happens when the function is run a second time on the same frame,
        because the column then holds integer codes rather than labels. The
        column is left untouched when the error is raised.
    """
    for column in unique:
        # Build the label -> code lookup once per column. setdefault keeps the
        # first position if a label is repeated, matching list.index().
        codes = {}
        for position, label in enumerate(unique[column]):
            codes.setdefault(label, position)
        # A dict-based map is vectorised; unmapped values come back as NaN.
        encoded = df[column].map(codes)
        unknown = encoded.isna()
        if unknown.any():
            unseen = df.loc[unknown, column].unique().tolist()
            raise ValueError(
                "column {!r} has values not in its label list: {}".format(column, unseen)
            )
        df[column] = encoded.astype('int64')
        

In [23]:
# Encode the columns listed in categorical_uniques as integer codes; df is
# modified in place.
# NOTE(review): this cell is not re-runnable. A second run looks up the integer
# codes in the label lists and raises ValueError.
ordinal_encode(df,categorical_uniques)


In [71]:
df

Unnamed: 0,case_id,Hospital_code,City_Code_Hospital,Available Extra Rooms in Hospital,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,...,R,S,T,U,A,B,C,D,E,F
0,1,8,3,3,2.0,31397,7.0,2,2,2,...,1,0,0,0,0,0,0,0,0,1
1,2,2,5,2,2.0,31397,7.0,0,2,2,...,0,1,0,0,0,0,0,0,0,1
2,3,10,1,2,2.0,31397,7.0,0,2,2,...,0,1,0,0,0,0,0,0,1,0
3,4,26,2,2,2.0,31397,7.0,0,2,2,...,1,0,0,0,0,0,0,1,0,0
4,5,26,2,2,2.0,31397,7.0,0,2,2,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318433,318434,6,6,3,4.0,86499,23.0,2,1,3,...,0,0,0,0,0,0,0,0,0,1
318434,318435,24,1,2,4.0,325,8.0,1,1,4,...,0,0,0,0,0,0,0,0,1,0
318435,318436,7,4,3,4.0,125235,10.0,2,0,3,...,1,0,0,0,0,0,0,0,0,1
318436,318437,11,2,3,3.0,91081,8.0,0,0,5,...,0,0,0,0,0,0,0,1,0,0


In [72]:
# Separate the target ('Stay') from the features. case_id becomes the index of
# the feature frame instead of a feature column.
# NOTE(review): patientid stays in x as a feature — confirm this is intended.
x=df.drop(columns='Stay').set_index('case_id')
y=df['Stay']

In [73]:
# Standardise every feature column with StandardScaler, keeping the original
# index and column labels on the result.
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split in a later cell, so the held-out rows contribute to the
# fitted statistics — consider fitting on the training rows only.
scaler=StandardScaler()
x=pd.DataFrame(scaler.fit_transform(x),index=x.index,columns=x.columns)
x

Unnamed: 0_level_0,Hospital_code,City_Code_Hospital,Available Extra Rooms in Hospital,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,...,R,S,T,U,A,B,C,D,E,F
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.195176,-0.571055,-0.169177,-0.716855,-0.904442,-0.053458,1.212557,1.646648,-0.727923,0.461600,...,1.220175,-0.568572,-0.068263,-0.005316,-0.309922,-0.352282,-0.354009,-0.440807,-0.458683,1.350633
2,-1.890124,0.073580,-1.025217,-0.716855,-0.904442,-0.053458,-0.974973,1.646648,-0.727923,0.461600,...,-0.819554,1.758792,-0.068263,-0.005316,-0.309922,-0.352282,-0.354009,-0.440807,-0.458683,1.350633
3,-0.963527,-1.215691,-1.025217,-0.716855,-0.904442,-0.053458,-0.974973,1.646648,-0.727923,0.461600,...,-0.819554,1.758792,-0.068263,-0.005316,-0.309922,-0.352282,-0.354009,-0.440807,2.180153,-0.740394
4,0.889668,-0.893373,-1.025217,-0.716855,-0.904442,-0.053458,-0.974973,1.646648,-0.727923,0.461600,...,1.220175,-0.568572,-0.068263,-0.005316,-0.309922,-0.352282,-0.354009,2.268564,-0.458683,-0.740394
5,0.889668,-0.893373,-1.025217,-0.716855,-0.904442,-0.053458,-0.974973,1.646648,-0.727923,0.461600,...,-0.819554,1.758792,-0.068263,-0.005316,-0.309922,-0.352282,-0.354009,2.268564,-0.458683,-0.740394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318434,-1.426825,0.395897,-0.169177,1.574123,0.546379,3.342582,1.212557,0.138090,-0.161049,-0.067622,...,-0.819554,-0.568572,-0.068263,-0.005316,-0.309922,-0.352282,-0.354009,-0.440807,-0.458683,1.350633
318435,0.658018,-1.215691,-1.025217,1.574123,-1.722559,0.158795,0.118792,0.138090,0.405826,2.049268,...,-0.819554,-0.568572,-0.068263,-0.005316,-0.309922,-0.352282,-0.354009,-0.440807,2.180153,-0.740394
318436,-1.311001,-0.248738,-0.169177,1.574123,1.566288,0.583300,1.212557,-1.370469,-0.161049,1.520045,...,1.220175,-0.568572,-0.068263,-0.005316,-0.309922,-0.352282,-0.354009,-0.440807,-0.458683,1.350633
318437,-0.847702,-0.893373,-0.169177,0.428634,0.667022,0.158795,-0.974973,-1.370469,0.972701,-1.655290,...,-0.819554,-0.568572,-0.068263,-0.005316,-0.309922,-0.352282,-0.354009,2.268564,-0.458683,-0.740394


In [74]:
# Split into 70% training rows and 30% test rows. A fixed random_state makes
# the split, and every result derived from it, reproducible across kernel
# restarts.
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=42)

In [78]:
# Logistic-regression classifier constructed with no arguments (library defaults).
# NOTE(review): no fit/predict call on `model` appears in the cells in view.
model=LogisticRegression()


In [76]:
y_train

173793    1
267388    2
95871     1
92683     0
100171    3
         ..
70535     5
108523    5
293549    3
86655     2
211811    8
Name: Stay, Length: 222906, dtype: int64

In [57]:
x_train

Unnamed: 0,case_id,Hospital_code,City_Code_Hospital,Available Extra Rooms in Hospital,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,...,R,S,T,U,A,B,C,D,E,F
217598,0.635077,0.889668,-0.893373,0.686863,-0.716855,-1.025558,1.587948e-01,1.212557,-1.370469,0.405826,...,-0.819554,-0.568572,-0.068263,-0.005316,-0.309922,-0.352282,-0.354009,2.268564,-0.458683,-0.740394
283853,1.355826,0.542194,0.395897,-0.169177,1.574123,1.534561,-9.024676e-01,-0.974973,0.138090,-0.161049,...,1.220175,-0.568572,-0.068263,-0.005316,-0.309922,-0.352282,-0.354009,-0.440807,-0.458683,1.350633
283851,1.355804,0.889668,-0.893373,0.686863,0.428634,1.534561,-9.024676e-01,-0.974973,0.138090,-0.727923,...,1.220175,-0.568572,-0.068263,-0.005316,-0.309922,-0.352282,-0.354009,2.268564,-0.458683,-0.740394
291246,1.436250,-1.774299,-0.571055,-1.025217,0.428634,0.500487,1.587948e-01,-0.974973,-1.370469,-0.727923,...,-0.819554,-0.568572,-0.068263,-0.005316,3.226623,-0.352282,-0.354009,-0.440807,-0.458683,-0.740394
280566,1.320069,0.426369,1.362850,-1.025217,0.428634,1.463655,-5.345767e-02,0.118792,0.138090,-0.727923,...,1.220175,-0.568572,-0.068263,-0.005316,-0.309922,2.838636,-0.354009,-0.440807,-0.458683,-0.740394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210562,0.558536,-0.616053,0.073580,-1.025217,0.428634,0.906359,5.655543e-16,1.212557,0.138090,-0.727923,...,1.220175,-0.568572,-0.068263,-0.005316,-0.309922,-0.352282,-0.354009,-0.440807,-0.458683,1.350633
84743,-0.810176,-1.426825,0.395897,0.686863,0.428634,-0.210363,1.220057e+00,1.212557,0.138090,0.405826,...,-0.819554,-0.568572,-0.068263,-0.005316,-0.309922,-0.352282,-0.354009,-0.440807,-0.458683,1.350633
198792,0.430497,-0.036930,2.652120,-0.169177,0.428634,0.241560,-5.345767e-02,0.118792,-1.370469,-0.161049,...,1.220175,-0.568572,-0.068263,-0.005316,-0.309922,2.838636,-0.354009,-0.440807,-0.458683,-0.740394
162034,0.030628,0.542194,0.395897,-0.169177,0.428634,-0.749991,-1.326973e+00,-0.974973,-1.370469,-0.161049,...,-0.819554,-0.568572,-0.068263,-0.005316,-0.309922,-0.352282,-0.354009,-0.440807,-0.458683,1.350633


In [82]:
x_train.dtypes

Hospital_code                        float64
City_Code_Hospital                   float64
Available Extra Rooms in Hospital    float64
Bed Grade                            float64
patientid                            float64
City_Code_Patient                    float64
Type of Admission                    float64
Severity of Illness                  float64
Visitors with Patient                float64
Age                                  float64
Admission_Deposit                    float64
a                                    float64
b                                    float64
c                                    float64
d                                    float64
e                                    float64
f                                    float64
g                                    float64
X                                    float64
Y                                    float64
Z                                    float64
TB & Chest disease                   float64
anesthesia

In [None]:
logistic.s