In [15]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [16]:
df = pd.read_csv('df_EDA.csv')
df.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,HOSPITALIZED,DIED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,ANTIGEN_TEST,Age_Group
0,2,1,Female,0,1,1.0,65,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,60-69
1,2,1,Male,0,1,1.0,72,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0,70-79
2,2,1,Male,1,1,0.0,55,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,50-59
3,2,1,Female,0,1,0.0,53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,50-59
4,2,1,Male,0,1,0.0,68,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,60-69


In [17]:
#Not able to train Grid Search CV with parameters hypertuning on whole dataset so take a sample
df_sampled = df.sample(n=50000, random_state=42)

In [18]:
X = df_sampled.drop(columns=['DIED','Age_Group','MEDICAL_UNIT'],axis=1)
y = df_sampled["DIED"]

In [19]:
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

In [20]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

num_feat_transformer = StandardScaler()
cat_feat_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", cat_feat_transformer, cat_features),
         ("StandardScaler", num_feat_transformer, num_features),        
    ]
)



In [21]:
X = preprocessor.fit_transform(X)

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((40000, 18), (10000, 18))

In [26]:
from joblib import parallel_backend

In [47]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from joblib import parallel_backend
import numpy as np

# Define pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier())
])

# Define parameter grid
param_grid = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [5, 10, 20]
}

# Define batch size
batch_size = 1000

# Split data into batches
X_batches = np.array_split(X_train, np.ceil(len(X_train)/batch_size))
y_batches = np.array_split(y_train, np.ceil(len(y_train)/batch_size))

# Perform grid search with parallel processing
with parallel_backend('multiprocessing'):
    for i, (X_batch, y_batch) in enumerate(zip(X_batches, y_batches)):
        print(f"Processing batch {i+1}/{len(X_batches)}")
        if len(X_batch) < 2: # check if batch size is greater than 1
            continue
        grid = GridSearchCV(pipe, param_grid, cv=5)
        grid.fit(X_batch, y_batch)

# Print overall best parameters and score
print(f"Best parameters: {grid.best_params_}")
print(f"Best score: {grid.best_score_}")


Processing batch 1/40
Processing batch 2/40
Processing batch 3/40
Processing batch 4/40
Processing batch 5/40
Processing batch 6/40
Processing batch 7/40
Processing batch 8/40
Processing batch 9/40
Processing batch 10/40
Processing batch 11/40
Processing batch 12/40
Processing batch 13/40
Processing batch 14/40
Processing batch 15/40
Processing batch 16/40
Processing batch 17/40
Processing batch 18/40
Processing batch 19/40
Processing batch 20/40
Processing batch 21/40
Processing batch 22/40
Processing batch 23/40
Processing batch 24/40
Processing batch 25/40
Processing batch 26/40
Processing batch 27/40
Processing batch 28/40
Processing batch 29/40
Processing batch 30/40
Processing batch 31/40
Processing batch 32/40
Processing batch 33/40
Processing batch 34/40
Processing batch 35/40
Processing batch 36/40
Processing batch 37/40
Processing batch 38/40
Processing batch 39/40
Processing batch 40/40
Best parameters: {'rf__max_depth': 5, 'rf__n_estimators': 200}
Best score: 0.921999999999

In [76]:
# Define pipeline with best hyperparameters
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=5))
])

# Fit pipeline on entire training set
pipe.fit(X_train, y_train)

# Use pipeline to predict on new data
y_pred = pipe.predict(X_test)

from sklearn.metrics import accuracy_score

accuracy_rf = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_rf}")

Accuracy: 0.9395


* With using Random Forest Classifier we get accuracy 94%

In [50]:
#Using grid search for XGBC algoritham.

# Define pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier())
])

# Define parameter grid
param_grid = {
    'xgb__n_estimators': [50, 100, 200],
    'xgb__max_depth': [5, 10, 20],
    'xgb__learning_rate': [0.1, 0.01, 0.001]
}

# Define batch size
batch_size = 1000

# Split data into batches
X_batches = np.array_split(X_train, np.ceil(len(X_train)/batch_size))
y_batches = np.array_split(y_train, np.ceil(len(y_train)/batch_size))

# Perform grid search with parallel processing
with parallel_backend('multiprocessing'):
    for i, (X_batch, y_batch) in enumerate(zip(X_batches, y_batches)):
        print(f"Processing batch {i+1}/{len(X_batches)}")
        if len(X_batch) < 2: # check if batch size is greater than 1
            continue
        grid = GridSearchCV(pipe, param_grid, cv=5,n_jobs=-1)
        grid.fit(X_batch, y_batch)

# Print overall best parameters and score
print(f"Best parameters: {grid.best_params_}")
print(f"Best score: {grid.best_score_}")

#Best parameters: {'xgb__learning_rate': 0.01, 'xgb__max_depth': 5, 'xgb__n_estimators': 50}
#Best score: 0.9260000000000002

Processing batch 1/40
Processing batch 2/40
Processing batch 3/40
Processing batch 4/40
Processing batch 5/40
Processing batch 6/40
Processing batch 7/40
Processing batch 8/40
Processing batch 9/40
Processing batch 10/40
Processing batch 11/40
Processing batch 12/40
Processing batch 13/40
Processing batch 14/40
Processing batch 15/40
Processing batch 16/40
Processing batch 17/40
Processing batch 18/40
Processing batch 19/40
Processing batch 20/40
Processing batch 21/40
Processing batch 22/40
Processing batch 23/40
Processing batch 24/40
Processing batch 25/40
Processing batch 26/40
Processing batch 27/40
Processing batch 28/40
Processing batch 29/40
Processing batch 30/40
Processing batch 31/40
Processing batch 32/40
Processing batch 33/40
Processing batch 34/40
Processing batch 35/40
Processing batch 36/40
Processing batch 37/40
Processing batch 38/40
Processing batch 39/40
Processing batch 40/40
Best parameters: {'xgb__learning_rate': 0.01, 'xgb__max_depth': 5, 'xgb__n_estimators': 5

In [74]:
# Define pipeline with best hyperparameters
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier(learning_rate=0.01, max_depth=5, n_estimators=50))
])

# Fit pipeline on entire training set
pipe.fit(X_train, y_train)

# Use pipeline to predict on new data
y_pred = pipe.predict(X_test)

from sklearn.metrics import accuracy_score

accuracy_xgb = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_xgb}")

Accuracy: 0.9404


With XGBOOST we get Accuracy 94%

In [54]:
#Using grid search for LGBMC algoritham.

# Define pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('lgbm', LGBMClassifier())
])

# Define parameter grid
param_grid = {
    'lgbm__n_estimators': [50, 100, 200],
    'lgbm__max_depth': [5, 10, 20],
    'lgbm__learning_rate': [0.1, 0.01, 0.001]
}

# Define batch size
batch_size = 1000

# Split data into batches
X_batches = np.array_split(X_train, np.ceil(len(X_train)/batch_size))
y_batches = np.array_split(y_train, np.ceil(len(y_train)/batch_size))

# Perform grid search with parallel processing
with parallel_backend('multiprocessing'):
    for i, (X_batch, y_batch) in enumerate(zip(X_batches, y_batches)):
        print(f"Processing batch {i+1}/{len(X_batches)}")
        if len(X_batch) < 2: # check if batch size is greater than 1
            continue
        grid = GridSearchCV(pipe, param_grid, cv=5,n_jobs=-1)
        grid.fit(X_batch, y_batch)

# Print overall best parameters and score
print(f"Best parameters: {grid.best_params_}")
print(f"Best score: {grid.best_score_}")

Processing batch 1/40
Processing batch 2/40
Processing batch 3/40
Processing batch 4/40
Processing batch 5/40
Processing batch 6/40
Processing batch 7/40
Processing batch 8/40
Processing batch 9/40
Processing batch 10/40
Processing batch 11/40
Processing batch 12/40
Processing batch 13/40
Processing batch 14/40
Processing batch 15/40
Processing batch 16/40
Processing batch 17/40
Processing batch 18/40
Processing batch 19/40
Processing batch 20/40
Processing batch 21/40
Processing batch 22/40
Processing batch 23/40
Processing batch 24/40
Processing batch 25/40
Processing batch 26/40
Processing batch 27/40
Processing batch 28/40
Processing batch 29/40
Processing batch 30/40
Processing batch 31/40
Processing batch 32/40
Processing batch 33/40
Processing batch 34/40
Processing batch 35/40
Processing batch 36/40
Processing batch 37/40
Processing batch 38/40
Processing batch 39/40
Processing batch 40/40
Best parameters: {'lgbm__learning_rate': 0.01, 'lgbm__max_depth': 5, 'lgbm__n_estimators'

In [73]:
# Define pipeline with best hyperparameters
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', LGBMClassifier(learning_rate=0.01, max_depth=5, n_estimators=200))
])

# Fit pipeline on entire training set
pipe.fit(X_train, y_train)

# Use pipeline to predict on new data
y_pred = pipe.predict(X_test)

from sklearn.metrics import accuracy_score

accuracy_lgbm = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_lgbm}")

Accuracy: 0.9417


*Again with LGBM we have accuracy 94%

In [65]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from joblib import parallel_backend
import numpy as np

# Define pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('catboost', CatBoostClassifier(verbose=False))
])

# Define parameter grid
param_grid = {
    'catboost__n_estimators': [50, 100, 200],
    'catboost__depth': [5, 10],
    'catboost__learning_rate': [0.1, 0.01, 0.001]
}

# Define batch size
batch_size = 2000

# Split data into batches
X_batches = np.array_split(X_train, np.ceil(len(X_train)/batch_size))
y_batches = np.array_split(y_train, np.ceil(len(y_train)/batch_size))

# Perform grid search with parallel processing
with parallel_backend('multiprocessing'):
    for i, (X_batch, y_batch) in enumerate(zip(X_batches, y_batches)):
        print(f"Processing batch {i+1}/{len(X_batches)}")
        if len(X_batch) < 2: # check if batch size is greater than 1
            continue
        grid = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1)
        grid.fit(X_batch, y_batch)

# Print overall best parameters and score
print(f"Best parameters: {grid.best_params_}")
print(f"Best score: {grid.best_score_}")
#We get 
#Best parameters: {'catboost__depth': 5, 'catboost__learning_rate': 0.1, 'catboost__n_estimators': 50}
#Best score: 0.9380000000000001


Processing batch 1/20
Processing batch 2/20
Processing batch 3/20
Processing batch 4/20
Processing batch 5/20
Processing batch 6/20
Processing batch 7/20
Processing batch 8/20
Processing batch 9/20
Processing batch 10/20
Processing batch 11/20
Processing batch 12/20
Processing batch 13/20
Processing batch 14/20
Processing batch 15/20
Processing batch 16/20
Processing batch 17/20
Processing batch 18/20
Processing batch 19/20
Processing batch 20/20
Best parameters: {'catboost__depth': 5, 'catboost__learning_rate': 0.1, 'catboost__n_estimators': 50}
Best score: 0.9380000000000001


In [70]:
# Define pipeline with best hyperparameters
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', CatBoostClassifier(learning_rate=0.1, depth=5, n_estimators=50))
])

# Fit pipeline on entire training set
pipe.fit(X_train, y_train)

# Use pipeline to predict on new data
y_pred = pipe.predict(X_test)

from sklearn.metrics import accuracy_score

accuracy_cb = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_cb}")

0:	learn: 0.6158865	total: 4.94ms	remaining: 242ms
1:	learn: 0.5522485	total: 12.7ms	remaining: 304ms
2:	learn: 0.4982570	total: 18.4ms	remaining: 288ms
3:	learn: 0.4534917	total: 22.5ms	remaining: 259ms
4:	learn: 0.4145703	total: 30.1ms	remaining: 271ms
5:	learn: 0.3814742	total: 34.4ms	remaining: 252ms
6:	learn: 0.3522454	total: 38.6ms	remaining: 237ms
7:	learn: 0.3268993	total: 45.6ms	remaining: 239ms
8:	learn: 0.3048862	total: 50ms	remaining: 228ms
9:	learn: 0.2852965	total: 54.1ms	remaining: 217ms
10:	learn: 0.2681207	total: 61ms	remaining: 216ms
11:	learn: 0.2532563	total: 65.8ms	remaining: 208ms
12:	learn: 0.2397529	total: 69.9ms	remaining: 199ms
13:	learn: 0.2280140	total: 76.4ms	remaining: 196ms
14:	learn: 0.2174669	total: 80.8ms	remaining: 189ms
15:	learn: 0.2080338	total: 84.9ms	remaining: 180ms
16:	learn: 0.2002321	total: 88.5ms	remaining: 172ms
17:	learn: 0.1929531	total: 94.7ms	remaining: 168ms
18:	learn: 0.1866900	total: 98.9ms	remaining: 161ms
19:	learn: 0.1806787	total

* Also with catboost Accuracy is 94%

In [67]:
# Define pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('dt', DecisionTreeClassifier())
])

# Define parameter grid
param_grid = {
    'dt__max_depth': [5, 10, 15],
    'dt__min_samples_split': [2, 5, 10],
    'dt__min_samples_leaf': [1, 2, 4],
    'dt__criterion': ['gini', 'entropy']
}

# Define batch size
batch_size = 2000

# Split data into batches
X_batches = np.array_split(X_train, np.ceil(len(X_train)/batch_size))
y_batches = np.array_split(y_train, np.ceil(len(y_train)/batch_size))

# Perform grid search with parallel processing
with parallel_backend('multiprocessing'):
    for i, (X_batch, y_batch) in enumerate(zip(X_batches, y_batches)):
        print(f"Processing batch {i+1}/{len(X_batches)}")
        if len(X_batch) < 2: # check if batch size is greater than 1
            continue
        grid = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1)
        grid.fit(X_batch, y_batch)

# Print overall best parameters and score
print(f"Best parameters: {grid.best_params_}")
print(f"Best score: {grid.best_score_}")
#We get 
#Best parameters: {'dt__criterion': 'gini', 'dt__max_depth': 5, 'dt__min_samples_leaf': 2, 'dt__min_samples_split': 5}
#Best score: 0.932

Processing batch 1/20
Processing batch 2/20
Processing batch 3/20
Processing batch 4/20
Processing batch 5/20
Processing batch 6/20
Processing batch 7/20
Processing batch 8/20
Processing batch 9/20
Processing batch 10/20
Processing batch 11/20
Processing batch 12/20
Processing batch 13/20
Processing batch 14/20
Processing batch 15/20
Processing batch 16/20
Processing batch 17/20
Processing batch 18/20
Processing batch 19/20
Processing batch 20/20
Best parameters: {'dt__criterion': 'gini', 'dt__max_depth': 5, 'dt__min_samples_leaf': 2, 'dt__min_samples_split': 5}
Best score: 0.932


In [75]:
# Define pipeline with best hyperparameters
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', RandomForestClassifier(criterion='gini', max_depth=5, min_samples_leaf=2,min_samples_split=5))
])

# Fit pipeline on entire training set
pipe.fit(X_train, y_train)

# Use pipeline to predict on new data
y_pred = pipe.predict(X_test)

from sklearn.metrics import accuracy_score

accuracy_dt = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_dt}")

Accuracy: 0.9394


* Regarding that we get best accuracy with LGBM we will train that model with best parameters and save it as a pickle file

In [28]:
import joblib
# Define pipeline with best parameters
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('lgbm', LGBMClassifier(n_estimators=200, max_depth=5, learning_rate=0.01))
])

# Train model on entire training set
pipe.fit(X_train, y_train)

# Save trained model to disk
joblib.dump(pipe, 'lgbm_model.pkl')

['lgbm_model.pkl']