# <span class="title-section w3-xxlarge" id="imports"> Importing Libraries 📚</span>
<hr>

In [1]:
import numpy as np 
import pandas as pd 
import os, gc
import lightgbm as lgb
from lightgbm import log_evaluation
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier,AdaBoostRegressor
import joblib
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler,RobustScaler
from sklearn.metrics import log_loss

# <span class="title-section w3-xxlarge" id="loading"> Loading Data 🗂️ </span>
<hr>

In [2]:
# Read the competition's training and test tables, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/diabetes-prediction-competitiontfug-chd-nov-2022/train_data.csv',
        '/kaggle/input/diabetes-prediction-competitiontfug-chd-nov-2022/test_data.csv',
    )
)

In [3]:
# Bind a second name to the loaded training frame (same object, no copy is made).
# NOTE(review): `df` is not referenced again in the visible cells.
df=train_df

In [4]:
# Drop rows that are exact repeats of an earlier row (all columns compared),
# keeping the first occurrence of each.
train_df = train_df.drop_duplicates(keep='first')

In [5]:
# Preview the first rows of the de-duplicated training data.
train_df.head()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Diabetes,Hypertension,Stroke
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,1.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Preview the first rows of the test data.
test_df.head()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Hypertension,Stroke
0,11.0,1.0,1.0,1.0,35.0,1.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,3.0,0.0,1.0,0.0
1,13.0,0.0,1.0,1.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,5.0,0.0,2.0,0.0,1.0,0.0
2,11.0,0.0,1.0,1.0,30.0,1.0,0.0,0.0,1.0,0.0,0.0,5.0,0.0,15.0,0.0,1.0,0.0
3,4.0,0.0,0.0,1.0,25.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
4,8.0,1.0,0.0,1.0,20.0,1.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0


# <span class="title-section w3-xxlarge" id="data_pipeline"> Data Pipeline 🔧 </span>
<hr>

In [7]:
from imblearn.over_sampling import SMOTE
from collections import Counter  # NOTE(review): not referenced in the visible cells
# Module-level SMOTE over-sampler with a fixed seed; run_model() reads it
# through the global name `sm`.
sm = SMOTE(random_state=42)

In [8]:
def replace_median(train_df,features):
    """Return a copy of ``train_df`` with zeros in ``features`` replaced by the column mean.

    NOTE: despite the function's name, the fill value is the column *mean*
    (computed over all rows of that column, zeros included), not the median.
    The name is kept so existing callers continue to work.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame holding the columns to clean. It is left unmodified.
    features : iterable of str
        Names of the columns in which every value equal to 0 is replaced.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns as ``train_df``; only the
        columns listed in ``features`` can differ from the input.
    """
    # Work on a copy: assigning columns into the caller's frame mutates the
    # caller's data and, when that frame was derived from another one, raises
    # pandas' SettingWithCopyWarning.
    cleaned = train_df.copy()
    for f in features:
        cleaned[f] = cleaned[f].replace(0, cleaned[f].mean())
    return cleaned

# <span class="title-section w3-xxlarge" id="training"> Training 🏋️</span>
<hr>

In [9]:
def run_model(train_df, test_df):
    """Train one LightGBM binary classifier per CV fold and average their test predictions.

    Pipeline:
      1. Every column of ``train_df`` except ``'Diabetes'`` is used as a feature.
      2. Zeros in the training features are replaced via ``replace_median``;
         the test features get the same training-set fill values so that both
         sets are encoded identically.
      3. The training set is over-sampled with the module-level SMOTE
         instance ``sm``.
      4. A 10-fold stratified CV trains one booster per fold with early
         stopping on the held-out fold.
      5. Each booster scores the test rows at its best iteration; the scores
         are averaged over the folds.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training rows; must contain the target column ``'Diabetes'``.
    test_df : pandas.DataFrame
        Rows to score; must contain every feature column of ``train_df``.

    Returns
    -------
    numpy.ndarray
        One averaged model output per row of ``test_df``.
    """
    # NOTE(review): this keeps every non-target column, including the
    # 'Unnamed: 0' column visible in the data preview, which looks like a row
    # index -- confirm it is really meant to be a predictor.
    features = [col for col in train_df.columns if col != 'Diabetes']

    # Capture the fill values from the training data *before* the replacement
    # (these are the same per-column means replace_median uses), then apply
    # them to the test rows. Previously only the training rows were
    # transformed, so the model predicted on differently encoded inputs.
    zero_fill = train_df[features].mean()
    train_df = replace_median(train_df, features)
    X = train_df[features]
    Y = train_df['Diabetes']

    X_test = test_df[features].copy()
    for f in features:
        X_test[f] = X_test[f].replace(0, zero_fill[f])

    # NOTE(review): over-sampling happens before the fold split, so each
    # validation fold contains synthetic rows interpolated from rows that sit
    # in the training folds. Validation loss (and therefore early stopping)
    # is likely optimistic; consider resampling inside each fold instead.
    X_resamp_tr, y_resamp_tr = sm.fit_resample(X, Y)

    lgbm_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'learning_rate': 0.11437976620936573,

        # -- acceleration of model learning --
        'max_depth': 24,
        'num_leaves': 25,

        'importance_type': 'gain',      # for variables' importance

        # -- for model overfitting --
        'min_data_in_leaf': 61,
        'min_sum_hessian_in_leaf': 32.93481038105413,
        'lambda_l1': 1.0,               # L1 regularization
        'lambda_l2': 1.0,               # L2 regularization

        'bagging_fraction': 0.9280786928822388,
        'bagging_freq': 1,
        'feature_fraction': 0.5869250335390612,
        'verbosity': -1,
        'random_seed': 42
    }

    # Upper bound on boosting rounds. The original passed 'n_estimators': 10000
    # in the params *and* num_boost_round=20000; LightGBM lets the params alias
    # win, so 10000 was the effective limit. It is now stated once.
    num_round = 10000
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=2022)
    predictions = np.zeros(len(test_df))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_resamp_tr.values, y_resamp_tr.values)):
        print("Fold {}".format(fold_))
        trn_data = lgb.Dataset(X_resamp_tr.iloc[trn_idx], label=y_resamp_tr.iloc[trn_idx])
        val_data = lgb.Dataset(X_resamp_tr.iloc[val_idx], label=y_resamp_tr.iloc[val_idx])
        # Early stopping / logging go through callbacks: the
        # `early_stopping_rounds` and `verbose_eval` keyword arguments of
        # lgb.train were removed in LightGBM 4.0.
        clf = lgb.train(
            lgbm_params,
            trn_data,
            num_boost_round=num_round,
            valid_sets=[trn_data, val_data],
            callbacks=[
                lgb.early_stopping(stopping_rounds=3500),
                lgb.log_evaluation(period=10000),
            ],
        )
        # Each fold contributes an equal share of the final averaged prediction.
        predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
    return predictions

In [10]:
# Train the cross-validated LightGBM models; `predictions` holds the
# fold-averaged model output for each test row.
predictions = run_model(train_df, test_df)


A value is trying to be set on a copy of a slice from a DataFrame.

Try using .loc[row_indexer,col_indexer] = value instead



See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  This is separate from the ipykernel package so we can avoid doing imports until


Fold 0


Training until validation scores don't improve for 3500 rounds









Early stopping, best iteration is:

[108]	training's binary_logloss: 0.48022	valid_1's binary_logloss: 0.496034

Fold 1


Training until validation scores don't improve for 3500 rounds

Early stopping, best iteration is:

[112]	training's binary_logloss: 0.480476	valid_1's binary_logloss: 0.490336

Fold 2


Training until validation scores don't improve for 3500 rounds

Early stopping, best iteration is:

[126]	training's binary_logloss: 0.47932	valid_1's binary_logloss: 0.486734

Fold 3


Training until validation scores don't improve for 3500 rounds

Early stopping, best iteration is:

[119]	training's binary_logloss: 0.478912	valid_1's binary_logloss: 0.499409

Fold 4


Training until validation scores don't improve for 3500 rounds

Early stopping, best iteration is:

[181]	training's binary_logloss: 0.475665	valid_1's binary_logloss: 0.490766

Fold 5


Training until validation scores don't improve for 3500 rounds

Early stopping, best iteration is:

[138]	training's binary_logloss

# <span class="title-section w3-xxlarge" id="submit"> Submitting to Kaggle 🇰</span>
<hr>

In [11]:
# Put the model output into the sample submission's 'Diabetes' column and
# write the result to the working directory without the row index.
sub = (
    pd.read_csv('/kaggle/input/diabetes-prediction-competitiontfug-chd-nov-2022/sample_submission.csv')
    .assign(Diabetes=predictions)
)
sub.to_csv('submission.csv', index=False)