
# **ML-Olympiad---GOOD-HEALTH-AND-WELL-BEING-HACKATHON**

### ***Setting Up***

In [135]:
!pip install catboost



### Importing Libraries

In [98]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score
from sklearn.cluster import KMeans

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import PolynomialFeatures

import warnings
warnings.filterwarnings('ignore')

### **Dataset Preprocessing**

In [99]:
def seed_everything(seed):
    """Seed Python's `random` module and NumPy's global RNG with the same value."""
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


# Single seed used for the whole notebook.
SEED = 2022
seed_everything(SEED)

In [100]:
# Load the training set, the test set and the sample-submission template
# from the local Ml-Olympiad directory.
train_df = pd.read_csv('Ml-Olympiad/train.csv')
test_df = pd.read_csv('Ml-Olympiad/test.csv')
sample_submission = pd.read_csv('Ml-Olympiad/sample_submission.csv')

In [101]:
# Preview the first three training rows.
train_df.head(3)

Unnamed: 0,PatientID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,target
0,42351,1,1,1,29,0,0,0,1,1,1,0,1,0,3,0,0,0,0,13,5,8,0
1,135091,1,0,1,30,0,1,2,0,0,0,0,0,0,2,0,0,0,0,9,5,6,0
2,201403,0,0,1,31,0,0,0,1,1,1,0,1,0,2,0,7,0,0,10,6,8,0


In [102]:
# Features are every column except the row identifier and the label.
X_df = train_df.drop(columns = ['PatientID', 'target'])
y = train_df['target']

# test_df is overwritten in place, so on a second execution of this cell PatientID
# is already gone; errors='ignore' keeps the cell re-runnable instead of raising KeyError.
test_df = test_df.drop(columns = 'PatientID', errors = 'ignore')

In [103]:
# Total number of missing values in the training set and in the test set.
# (Previously the second value re-read train.csv, so the test set was never checked.)
train_df.isnull().sum().sum(), test_df.isnull().sum().sum()

(0, 0)

 ### **Feature Engineering and Selection**

**1. Feature Interaction**

In [104]:
# Degree-2 interaction terms without a bias column: the output holds the original
# columns followed by every pairwise product.
feat_interact = PolynomialFeatures(interaction_only = True, include_bias = False)
train_fi = feat_interact.fit_transform(X_df)
# Reuse the transformer fitted on the training features; never fit on the test set.
test_fi = feat_interact.transform(test_df)

# Name the columns col_1..col_N from the actual output width instead of hard-coding it.
col_names = [f'col_{n}' for n in range(1, train_fi.shape[1] + 1)]

train_fi = pd.DataFrame(train_fi, columns = col_names)
test_fi = pd.DataFrame(test_fi, columns = col_names)

In [105]:
# Sanity check: both frames must have the same number of interaction columns.
train_fi.shape, test_fi.shape

((177576, 231), (76104, 231))

In [106]:
# Interaction columns kept for modelling ('col_18' is commented out of the selection).
# NOTE(review): PolynomialFeatures emits the degree-1 terms first, so the lowest-numbered
# columns (e.g. col_6, col_17) presumably duplicate raw features already in X_df — confirm.
imp_featues = [
    'col_208', 'col_34', 'col_22', 'col_57', 'col_53', 'col_26', 'col_119',
    'col_63', 'col_38', 'col_39', 'col_23', 'col_226', 'col_44', 'col_17',
    'col_74', 'col_42', 'col_37', 'col_6', 'col_75',
]  ## 'col_18',

# Put the raw features and the selected interaction columns side by side.
X = pd.concat([X_df, train_fi.loc[:, imp_featues]], axis = 1)
test = pd.concat([test_df, test_fi.loc[:, imp_featues]], axis = 1)

**2. Clustering**

In [107]:
# Groups of raw columns; each group yields one KMeans cluster-label feature.
cluster_1 = ['HighBP', 'HighChol', 'CholCheck', 'BMI']
cluster_2 = ['Smoker', 'Stroke','Diabetes', 'PhysActivity']
cluster_3 = ['Fruits', 'Veggies', 'HvyAlcoholConsump','AnyHealthcare']
cluster_4 = ['NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth']

cluster_zip = [('cluster_1',cluster_1),('cluster_2',cluster_2),('cluster_3',cluster_3),('cluster_4',cluster_4)]

for cluster_name, cluster_col in cluster_zip:
    # Seed every model explicitly so the labels do not depend on the state of the
    # global NumPy RNG, i.e. on which cells happened to run before this one.
    cluster = KMeans(n_clusters = 4, random_state = SEED)
    # Fit on the training rows only, then label train and test with the same centroids.
    cluster.fit(X[cluster_col])
    X[cluster_name] = cluster.predict(X[cluster_col])
    test[cluster_name] = cluster.predict(test[cluster_col])

**3. One-Hot Encoding of `Diabetes`**

In [108]:
# Turn the Diabetes codes into strings so that get_dummies treats the column as categorical.
X['Diabetes'] = X['Diabetes'].map(str)
test['Diabetes'] = test['Diabetes'].map(str)

In [109]:
# One-hot encode the string-typed columns of the training features.
X = pd.get_dummies(X, drop_first = True)
# Encode the test set the same way, then force it onto the training column layout:
# a dummy column missing from test is added as all zeros, an extra one is dropped,
# and the column order matches X so the fitted models see the features they were trained on.
test = pd.get_dummies(test, drop_first = True).reindex(columns = X.columns, fill_value = 0)

**CV fold**

In [110]:
# Shuffled, stratified 10-fold split shared by every model below; seeded with the
# notebook-wide SEED instead of a second hard-coded copy of the same number.
skfold = StratifiedKFold(n_splits = 10, random_state = SEED, shuffle = True)

# **XGBoost CV**

In [111]:
model_xgb = XGBClassifier(objective='binary:logistic', max_depth =  10,
          learning_rate = 0.06808766268462589, colsample_bytree = 0.5124699707194202, 
             subsample =  0.4553243247037658, reg_alpha =  1.5936907054484504, random_state = 117)

# Per-fold AUC on the fitting rows and on the held-out rows.
train_roc_xgb, val_roc_xgb = [], []

# NOTE: every row belongs to several training folds, so train_pred_xgb only keeps
# the prediction written by the last fold that trained on that row.
train_pred_xgb = np.zeros(len(X))
val_pred_xgb = np.zeros(len(X))      # out-of-fold probabilities (each row written once)
test_pred_xgb = np.zeros(len(test))  # running sum of the per-fold test probabilities

for fold, (train_index, val_index) in enumerate(skfold.split(X, y), start = 1):
    X_train = X.iloc[train_index] ; y_train = y.iloc[train_index].values
    X_val = X.iloc[val_index] ; y_val = y.iloc[val_index].values
    print(f'***************************Fold :{fold}***********************************************')

    # The validation fold doubles as the early-stopping set.
    model_xgb.fit(X_train, y_train, early_stopping_rounds = 200, eval_metric="auc",
                       eval_set=[(X_val, y_val)],verbose=250)

    # Use the same tree limit for train, validation and test predictions.
    best_ntree_limit = model_xgb.get_booster().best_ntree_limit
    train_pred = model_xgb.predict_proba(X_train, ntree_limit = best_ntree_limit)[:,1]
    val_pred = model_xgb.predict_proba(X_val, ntree_limit = best_ntree_limit)[:,1]

    train_pred_xgb[train_index] = train_pred
    val_pred_xgb[val_index] = val_pred

    test_pred_xgb += model_xgb.predict_proba(test, ntree_limit = best_ntree_limit)[:,1]

    # Score once and reuse the value for both the log line and the summary lists.
    train_score = roc_auc_score(y_train, train_pred)
    val_score = roc_auc_score(y_val, val_pred)
    print(f'Train score : {train_score}')
    print(f'Validation score : {val_score}\n')

    train_roc_xgb.append(train_score); val_roc_xgb.append(val_score)

# Average over the actual number of folds rather than a hard-coded 10.
test_pred_xgb = test_pred_xgb / skfold.get_n_splits()
print(f'Training ROC score : {np.mean(train_roc_xgb)}')
print(f'Testing ROC score : {np.mean(val_roc_xgb)} +/- {np.std(val_roc_xgb)}')

***************************Fold :1***********************************************
[0]	validation_0-auc:0.83297
Will train until validation_0-auc hasn't improved in 200 rounds.
[99]	validation_0-auc:0.850219
Train score : 0.8805244515923771
Validation score : 0.8510782095578371

***************************Fold :2***********************************************
[0]	validation_0-auc:0.836256
Will train until validation_0-auc hasn't improved in 200 rounds.
[99]	validation_0-auc:0.851378
Train score : 0.8684256033660581
Validation score : 0.8525730108707831

***************************Fold :3***********************************************
[0]	validation_0-auc:0.829304
Will train until validation_0-auc hasn't improved in 200 rounds.
[99]	validation_0-auc:0.847772
Train score : 0.8670529103774676
Validation score : 0.8490144166497431

***************************Fold :4***********************************************
[0]	validation_0-auc:0.822183
Will train until validation_0-auc hasn't improved

### **XGBoost Model Submission**

In [112]:
# Start the table of per-model training predictions with XGBoost's out-of-fold probabilities.
train_prediction = pd.DataFrame({'XGBoost_Prob': val_pred_xgb})

In [113]:
# Start the table of per-model test predictions with XGBoost's fold-averaged probabilities.
test_prediction = pd.DataFrame({'XGBoost_Prob': test_pred_xgb})
# A row is labelled positive when its probability reaches the 0.2 cut-off.
xgb_test_pred = (test_prediction['XGBoost_Prob'] >= 0.2).astype(int)

# Write the labels into a copy of the submission template.
sub_1 = sample_submission.copy()
sub_1['target'] = xgb_test_pred
sub_1.to_csv('Ml-Olympiad/Xgboost_unbal.csv', index = False)

# **LightGBM CV**

In [114]:
# LightGBM binary classifier evaluated on AUC.
# NOTE(review): with max_depth=5 a tree can hold at most 2**5 = 32 leaves, so
# num_leaves=120 is presumably never reached — confirm this is intended.
model_lgb = LGBMClassifier(
    boosting_type = 'gbdt',
    objective = 'binary',
    random_state = 34,
    n_estimators = 500,
    colsample_bytree = 0.9,
    min_child_samples = 10,
    subsample = 0.7,
    subsample_freq = 2,
    num_leaves = 120,
    reg_lambda = 2,
    reg_alpha = 5,
    metric = 'auc',
    learning_rate = 0.008,
    max_depth = 5,
)

In [115]:
# Per-fold AUC on the fitting rows and on the held-out rows.
train_roc_lgb, val_roc_lgb = [], []

# NOTE: every row belongs to several training folds, so train_pred_lgb only keeps
# the prediction written by the last fold that trained on that row.
train_pred_lgb = np.zeros(len(X))
val_pred_lgb = np.zeros(len(X))      # out-of-fold probabilities (each row written once)
test_pred_lgb = np.zeros(len(test))  # running sum of the per-fold test probabilities

for fold, (train_index, val_index) in enumerate(skfold.split(X, y), start = 1):
    X_train = X.iloc[train_index] ; y_train = y.iloc[train_index].values
    X_val = X.iloc[val_index] ; y_val = y.iloc[val_index].values
    print(f'***************************Fold :{fold}***********************************************')

    # The validation fold doubles as the early-stopping set.
    model_lgb.fit(X_train,y_train,eval_set=[(X_val,y_val)],early_stopping_rounds=200,
                               verbose=250)

    # Use the same iteration count for train, validation and test predictions.
    best_iteration = model_lgb.best_iteration_
    train_pred = model_lgb.predict_proba(X_train,  num_iteration = best_iteration)[:,1]
    val_pred = model_lgb.predict_proba(X_val, num_iteration = best_iteration)[:,1]

    train_pred_lgb[train_index] = train_pred
    val_pred_lgb[val_index] = val_pred

    test_pred_lgb += model_lgb.predict_proba(test, num_iteration = best_iteration)[:,1]

    # Score once and reuse the value for both the log line and the summary lists.
    train_score = roc_auc_score(y_train, train_pred)
    val_score = roc_auc_score(y_val, val_pred)
    print(f'Train score : {train_score}')
    print(f'Validation score : {val_score}\n')

    train_roc_lgb.append(train_score); val_roc_lgb.append(val_score)

# Average over the actual number of folds rather than a hard-coded 10.
test_pred_lgb = test_pred_lgb / skfold.get_n_splits()
print(f'Training ROC score : {np.mean(train_roc_lgb)}')
print(f'Testing ROC score : {np.mean(val_roc_lgb)} +/- {np.std(val_roc_lgb)}')

***************************Fold :1***********************************************
Training until validation scores don't improve for 200 rounds.
[250]	valid_0's auc: 0.848947
[500]	valid_0's auc: 0.851159
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.851159
Train score : 0.8532795123333422
Validation score : 0.8511585938796671

***************************Fold :2***********************************************
Training until validation scores don't improve for 200 rounds.
[250]	valid_0's auc: 0.851858
[500]	valid_0's auc: 0.8533
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.8533
Train score : 0.8530632875977309
Validation score : 0.8533004114981658

***************************Fold :3***********************************************
Training until validation scores don't improve for 200 rounds.
[250]	valid_0's auc: 0.847278
[500]	valid_0's auc: 0.849568
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.849568
Train score

### **LGBM Model Submission**

In [116]:
# Add LightGBM's out-of-fold probabilities as a new column.
train_prediction['LightGBM_Prob'] = val_pred_lgb

In [117]:
# Add LightGBM's fold-averaged test probabilities as a new column.
test_prediction['LightGBM_Prob'] = test_pred_lgb
# A row is labelled positive when its probability reaches the 0.2 cut-off.
lgb_test_pred = (test_prediction['LightGBM_Prob'] >= 0.2).astype(int)

# Write the labels into a copy of the submission template.
sub_2 = sample_submission.copy()
sub_2['target'] = lgb_test_pred
sub_2.to_csv('Ml-Olympiad/LGBM_unbal.csv', index = False)

In [133]:
# Share of predicted 0/1 labels in the stacking submission.
# NOTE(review): sub_5 is only created in the stacking cell further down (In [126]); this
# cell was executed out of order (In [133]) and raises NameError on a fresh
# Restart & Run All — consider moving it below the stacking submission.
sub_5['target'].value_counts(normalize = True)

0    0.838221
1    0.161779
Name: target, dtype: float64

# **CATBOOST**

In [118]:
model_cat =  CatBoostClassifier(random_seed=34,use_best_model=True,
                          n_estimators=5000,silent=True,eval_metric='AUC')

# Per-fold AUC on the fitting rows and on the held-out rows.
train_roc_cat, val_roc_cat = [], []

# NOTE: every row belongs to several training folds, so train_pred_cat only keeps
# the prediction written by the last fold that trained on that row.
train_pred_cat = np.zeros(len(X))
val_pred_cat = np.zeros(len(X))      # out-of-fold probabilities (each row written once)
test_pred_cat = np.zeros(len(test))  # running sum of the per-fold test probabilities

for fold, (train_index, val_index) in enumerate(skfold.split(X, y), start = 1):
    X_train = X.iloc[train_index] ; y_train = y.iloc[train_index].values
    X_val = X.iloc[val_index] ; y_val = y.iloc[val_index].values
    print(f'***************************Fold :{fold}***********************************************')

    # The validation fold doubles as the early-stopping set; use_best_model=True makes
    # CatBoost shrink the fitted model to its best iteration, so the predict_proba
    # calls below need no explicit iteration argument.
    model_cat.fit(X_train,y_train,eval_set=[(X_val,y_val)],early_stopping_rounds=200,
                               verbose=250,use_best_model=True)
    train_pred = model_cat.predict_proba(X_train)[:,1]
    val_pred = model_cat.predict_proba(X_val)[:,1]

    train_pred_cat[train_index] = train_pred
    val_pred_cat[val_index] = val_pred

    test_pred_cat += model_cat.predict_proba(test)[:,1]

    # Score once and reuse the value for both the log line and the summary lists.
    train_score = roc_auc_score(y_train, train_pred)
    val_score = roc_auc_score(y_val, val_pred)
    print(f'Train score : {train_score}')
    print(f'Validation score : {val_score}\n')

    train_roc_cat.append(train_score); val_roc_cat.append(val_score)

# Average over the actual number of folds rather than a hard-coded 10.
test_pred_cat = test_pred_cat / skfold.get_n_splits()
print(f'Training ROC score : {np.mean(train_roc_cat)}')
print(f'Testing ROC score : {np.mean(val_roc_cat)} +/- {np.std(val_roc_cat)}')

***************************Fold :1***********************************************
Learning rate set to 0.0551
0:	test: 0.8049175	best: 0.8049175 (0)	total: 59ms	remaining: 4m 54s
250:	test: 0.8532935	best: 0.8533851 (197)	total: 11.5s	remaining: 3m 38s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8533850946
bestIteration = 197

Shrink model to first 198 iterations.
Train score : 0.8557690762054389
Validation score : 0.8533850945902615

***************************Fold :2***********************************************
Learning rate set to 0.0551
0:	test: 0.8139646	best: 0.8139646 (0)	total: 48.3ms	remaining: 4m 1s
250:	test: 0.8544415	best: 0.8544615 (243)	total: 11.2s	remaining: 3m 32s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.854461458
bestIteration = 243

Shrink model to first 244 iterations.
Train score : 0.856985088224831
Validation score : 0.8544614580230809

***************************Fold :3***************************************

### **CatBoost Model Submission**

In [119]:
# Add CatBoost's out-of-fold probabilities as a new column.
train_prediction['CatBoost_Prob'] = val_pred_cat

In [120]:
# Add CatBoost's fold-averaged test probabilities as a new column.
test_prediction['CatBoost_Prob'] = test_pred_cat
# A row is labelled positive when its probability reaches the 0.2 cut-off.
cat_test_pred = (test_prediction['CatBoost_Prob'] >= 0.2).astype(int)

# Write the labels into a copy of the submission template.
sub_3 = sample_submission.copy()
sub_3['target'] = cat_test_pred
sub_3.to_csv('Ml-Olympiad/CatBoost_unbal.csv', index = False)

# **Model Blending**

In [121]:
# Fixed blending weights for the three base models.
blend_weights = {'XGBoost_Prob': 0.5, 'LightGBM_Prob': 0.3, 'CatBoost_Prob': 0.2}

# Apply the same weighted sum to the out-of-fold and to the test probabilities.
for prediction_frame in (train_prediction, test_prediction):
    prediction_frame['Blend'] = sum(
        weight * prediction_frame[model_col] for model_col, weight in blend_weights.items()
    )

In [122]:
# Threshold the blended probability at 0.2 and write it into a copy of the template.
blend_test_labels = (test_prediction['Blend'] >= 0.2).astype(int)

sub_4 = sample_submission.copy()
sub_4['target'] = blend_test_labels
sub_4.to_csv('Ml-Olympiad/Blending_unbal.csv', index = False)

# **Model Stacking**

In [123]:
# Select the base-model probability columns by name. Dropping 'Blend' instead would
# also pick up the 'Stack' column that the stacking cell adds later, so re-running this
# cell would feed the meta-model its own previous output.
base_model_cols = ['XGBoost_Prob', 'LightGBM_Prob', 'CatBoost_Prob']

X_stack_T = train_prediction[base_model_cols]
y_stack_T = y

X_stack_t = test_prediction[base_model_cols]

In [124]:
from sklearn.linear_model import LinearRegression

In [125]:
# Meta-model: ordinary least squares fitted on the stacking features against the label.
stack_model = LinearRegression()
stack_model.fit(X_stack_T, y_stack_T)

# The training-side 'Stack' values are in-sample predictions of the meta-model
# (it is evaluated on the same rows it was fitted on).
train_prediction['Stack'] = stack_model.predict(X_stack_T)
test_prediction['Stack'] = stack_model.predict(X_stack_t)

In [126]:
# Threshold the stacked score at 0.2 and write it into a copy of the template.
stack_test_labels = (test_prediction['Stack'] >= 0.2).astype(int)

sub_5 = sample_submission.copy()
sub_5['target'] = stack_test_labels
sub_5.to_csv('Ml-Olympiad/Stacking_unbal.csv', index = False)

In [134]:
pip freeze Requirements.txt

absl-py==1.0.0
alabaster==0.7.12
albumentations==0.1.12
altair==4.2.0
appdirs==1.4.4
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
arviz==0.11.4
astor==0.8.1
astropy==4.3.1
astunparse==1.6.3
atari-py==0.2.9
atomicwrites==1.4.0
attrs==21.4.0
audioread==2.1.9
autograd==1.3
Babel==2.9.1
backcall==0.2.0
beautifulsoup4==4.6.3
bleach==4.1.0
blis==0.4.1
bokeh==2.3.3
Bottleneck==1.3.4
branca==0.4.2
bs4==0.0.1
CacheControl==0.12.10
cached-property==1.5.2
cachetools==4.2.4
catalogue==1.0.0
catboost==1.0.4
certifi==2021.10.8
cffi==1.15.0
cftime==1.5.2
chardet==3.0.4
charset-normalizer==2.0.12
click==7.1.2
cloudpickle==1.3.0
cmake==3.12.0
cmdstanpy==0.9.5
colorcet==3.0.0
colorlover==0.3.0
community==1.0.0b1
contextlib2==0.5.5
convertdate==2.4.0
coverage==3.7.1
coveralls==0.5
crcmod==1.7
cufflinks==0.17.3
cvxopt==1.2.7
cvxpy==1.0.31
cycler==0.11.0
cymem==2.0.6
Cython==0.29.28
daft==0.0.4
dask==2.12.0
datascience==0.10.6
debugpy==1.0.0
decorator==4.4.2
defusedxml==0.7.1
descartes==1.1.0
dill==0.3