# <span class="title-section w3-xxlarge" id="imports"> Importing Libraries 📚</span>
<hr>

In [1]:
import numpy as np 
import pandas as pd 
import os, gc
import glob
import lightgbm as lgb
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler,RobustScaler
import joblib
from sklearn.metrics import log_loss

# <span class="title-section w3-xxlarge" id="loading"> Loading Meta Data 🗂️ </span>
<hr>

In [2]:
train_df = pd.read_csv('../input/tfug-mysuru-water-quality-prediction/train.csv')
test_df = pd.read_csv('../input/tfug-mysuru-water-quality-prediction/test.csv')
submission = pd.read_csv('../input/tfug-mysuru-water-quality-prediction/sample_submission.csv')

In [3]:
train_df.head()

Unnamed: 0,id,categoryA,categoryB,categoryC,categoryD,categoryE,categoryF,featureA,featureB,featureC,...,compositionC,compositionD,compositionE,compositionF,compositionG,compositionH,compositionI,compositionJ,unit,result
0,a563699ca2a601c6ac64aa29986a00a90fb42b48741695...,catA_1,catB_0,catC_718,catD_0,catE_0,catF_0,75808.375,4.45784,0.005718,...,26.0,0.0,32.0,3.0,0.0,9.0,22.26,20.0,unit_6,0.000458
1,91ab3eb3bcf6c8c1c5fe2da9ba671aa5a48c7369d9a50f...,catA_1,catB_0,catC_1309,catD_0,catE_0,catF_0,75808.375,4.45784,0.005718,...,26.0,0.0,32.0,3.0,0.0,9.0,22.26,20.0,unit_6,0.000335
2,7128c51c554735d6c81862684ad6005ae12d2edbcd4644...,catA_15,catB_0,catC_1309,catD_0,catE_0,catF_0,75808.375,4.45784,0.005718,...,26.0,0.0,32.0,3.0,0.0,9.0,22.26,20.0,unit_4,0.054072
3,c8144b52e4f63014de0a0d8e1c629bf0b05cb2696cfc23...,catA_0,catB_0,catC_935,catD_0,catE_0,catF_0,75808.375,4.45784,0.005718,...,26.0,0.0,32.0,3.0,0.0,9.0,22.26,20.0,unit_5,0.061143
4,88d15a5b2df6692f23d105ff1ae82ae026be00c9271eef...,catA_22,catB_0,catC_1325,catD_0,catE_2,catF_0,-40055.25,4.363288,0.729194,...,14.0,0.0,49.0,3.0,2.0,9.0,16.84,15.0,unit_15,0.015439


In [4]:
test_df.head()

Unnamed: 0,id,categoryA,categoryB,categoryC,categoryD,categoryE,categoryF,featureA,featureB,featureC,...,compositionB,compositionC,compositionD,compositionE,compositionF,compositionG,compositionH,compositionI,compositionJ,unit
0,939edadbc8e0cddf21c8c5710c4f2f909abd36c196aee8...,catA_6,catB_0,catC_55,catD_0,catE_1,catF_0,325435.125,1.728675,0.032407,...,2.2,28.2,2.7,16.7,3.5,7.6,20.2,27.25,18.9,unit_7
1,fcfed000585350fdc6a982bd1146cd8cdd71d989a2fb01...,catA_6,catB_0,catC_1638,catD_0,catE_0,catF_0,75808.375,4.45784,0.005718,...,10.0,26.0,0.0,32.0,3.0,0.0,9.0,22.26,20.0,unit_7
2,917b412b73b7c1c83f779d2f350acda4d2b14804579e79...,catA_17,catB_0,catC_1400,catD_0,catE_16,catF_0,-19688.875,25.98219,0.0,...,21.4,9.7,0.0,46.7,2.6,2.4,7.7,21.23,6.1,unit_12
3,012d646dbd712fd4b752aa8761dc18aaf28643c262f859...,catA_5,catB_0,catC_204,catD_0,catE_0,catF_0,75808.375,4.45784,0.005718,...,10.0,26.0,0.0,32.0,3.0,0.0,9.0,22.26,20.0,unit_0
4,77c69c2a873444dfef22131bada7d9504ad731ca8df55d...,catA_5,catB_0,catC_1282,catD_0,catE_2,catF_0,-40055.25,4.363288,0.729194,...,8.0,14.0,0.0,49.0,3.0,2.0,9.0,16.84,15.0,unit_0


In [5]:
submission.head()

Unnamed: 0,id,result
0,939edadbc8e0cddf21c8c5710c4f2f909abd36c196aee8...,0.0
1,fcfed000585350fdc6a982bd1146cd8cdd71d989a2fb01...,0.0
2,917b412b73b7c1c83f779d2f350acda4d2b14804579e79...,0.0
3,012d646dbd712fd4b752aa8761dc18aaf28643c262f859...,0.0
4,77c69c2a873444dfef22131bada7d9504ad731ca8df55d...,0.0


# <span class="title-section w3-xxlarge" id="config"> Configuration ⚙️ </span>
<hr>

In [6]:
class CFG:
    N_fold=5
    Batch_Size=16
    LR=0.0001
    EPOCHS = 35

# <span class="title-section w3-xxlarge" id="data_pipeline"> Data Pipeline 🔧 </span>
<hr>

In [7]:
categorys=['categoryA','categoryB','categoryC','categoryD','categoryE','categoryF']
itr=[1,2,3,4,5,6]
for i,c in enumerate(categorys):
    train_df[c] = pd.factorize(train_df.iloc[: , itr[i]])[0]
train_df['unit'] = pd.factorize(train_df.iloc[: , -2])[0]


for i,c in enumerate(categorys):
    test_df[c] = pd.factorize(test_df.iloc[: , itr[i]])[0]
test_df['unit'] = pd.factorize(test_df.iloc[: , -2])[0]

In [8]:
train_df

Unnamed: 0,id,categoryA,categoryB,categoryC,categoryD,categoryE,categoryF,featureA,featureB,featureC,...,compositionC,compositionD,compositionE,compositionF,compositionG,compositionH,compositionI,compositionJ,unit,result
0,a563699ca2a601c6ac64aa29986a00a90fb42b48741695...,0,0,0,0,0,0,75808.375,4.457840,0.005718,...,26.00,0.0,32.00,3.0,0.00,9.00,22.26,20.00,0,0.000458
1,91ab3eb3bcf6c8c1c5fe2da9ba671aa5a48c7369d9a50f...,0,0,1,0,0,0,75808.375,4.457840,0.005718,...,26.00,0.0,32.00,3.0,0.00,9.00,22.26,20.00,0,0.000335
2,7128c51c554735d6c81862684ad6005ae12d2edbcd4644...,1,0,1,0,0,0,75808.375,4.457840,0.005718,...,26.00,0.0,32.00,3.0,0.00,9.00,22.26,20.00,1,0.054072
3,c8144b52e4f63014de0a0d8e1c629bf0b05cb2696cfc23...,2,0,2,0,0,0,75808.375,4.457840,0.005718,...,26.00,0.0,32.00,3.0,0.00,9.00,22.26,20.00,2,0.061143
4,88d15a5b2df6692f23d105ff1ae82ae026be00c9271eef...,3,0,3,0,1,0,-40055.250,4.363288,0.729194,...,14.00,0.0,49.00,3.0,2.00,9.00,16.84,15.00,3,0.015439
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12995,6dcf9c3ccaf0a92b107d6a78a2cd3ea363350b537444ec...,4,0,949,0,0,0,75808.375,4.457840,0.005718,...,26.00,0.0,32.00,3.0,0.00,9.00,22.26,20.00,4,0.008260
12996,e6241d096ea1b88d6e57a92fb6d984825c70de3a4b3ec5...,14,0,930,0,0,0,75808.375,4.457840,0.005718,...,26.00,0.0,32.00,3.0,0.00,9.00,22.26,20.00,10,0.002395
12997,081aa272b71b94d3cc665e1484a232b821e9997ed9cc80...,80,0,930,0,0,1,75808.375,4.457840,0.005718,...,26.00,0.0,32.00,3.0,0.00,9.00,22.26,20.00,15,0.150982
12998,c50903ae1d6ff475fa20e32fe39ce4429cc6285999a658...,6,0,2211,0,8,0,18927.000,21.007457,0.000387,...,13.13,0.0,35.88,1.7,0.73,1.45,28.12,37.83,5,0.488908


In [9]:
test_df

Unnamed: 0,id,categoryA,categoryB,categoryC,categoryD,categoryE,categoryF,featureA,featureB,featureC,...,compositionB,compositionC,compositionD,compositionE,compositionF,compositionG,compositionH,compositionI,compositionJ,unit
0,939edadbc8e0cddf21c8c5710c4f2f909abd36c196aee8...,0,0,0,0,0,0,325435.125,1.728675,0.032407,...,2.2,28.2,2.7,16.7,3.5,7.6,20.2,27.25,18.9,0
1,fcfed000585350fdc6a982bd1146cd8cdd71d989a2fb01...,0,0,1,0,1,0,75808.375,4.457840,0.005718,...,10.0,26.0,0.0,32.0,3.0,0.0,9.0,22.26,20.0,1
2,917b412b73b7c1c83f779d2f350acda4d2b14804579e79...,1,0,2,0,2,0,-19688.875,25.982190,0.000000,...,21.4,9.7,0.0,46.7,2.6,2.4,7.7,21.23,6.1,2
3,012d646dbd712fd4b752aa8761dc18aaf28643c262f859...,2,0,3,0,1,0,75808.375,4.457840,0.005718,...,10.0,26.0,0.0,32.0,3.0,0.0,9.0,22.26,20.0,1
4,77c69c2a873444dfef22131bada7d9504ad731ca8df55d...,2,0,4,0,3,0,-40055.250,4.363288,0.729194,...,8.0,14.0,0.0,49.0,3.0,2.0,9.0,16.84,15.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,f7a904e2348659e727d933ca229531103b280171fe2519...,2,0,1753,0,3,0,-40055.250,4.363288,0.729194,...,8.0,14.0,0.0,49.0,3.0,2.0,9.0,16.84,15.0,3
6996,9006763de2b19c431ad08584e65d5726c58ffc7037048b...,2,0,509,0,1,0,75808.375,4.457840,0.005718,...,10.0,26.0,0.0,32.0,3.0,0.0,9.0,22.26,20.0,1
6997,6e71675534f7806f50c0d35736f0e68c6b1415af31dd82...,7,0,1754,0,3,0,-40055.250,4.363288,0.729194,...,8.0,14.0,0.0,49.0,3.0,2.0,9.0,16.84,15.0,3
6998,0a0569a1eda01c4e3bd9327e13b50294a4b3ef7e223fc8...,2,0,1632,0,0,0,325435.125,1.728675,0.032407,...,2.2,28.2,2.7,16.7,3.5,7.6,20.2,27.25,18.9,0


In [10]:
ycol = 'result'
feature_names = list(filter(lambda x: x not in [ycol, 'id'], train_df.columns))


In [11]:
scaler = MinMaxScaler()
train_df[feature_names] = scaler.fit_transform(train_df[feature_names])
test_df[feature_names] = scaler.transform(test_df[feature_names])

<h1> <span class="title-section w3-xxlarge" id="the_models"> Models Configuration ⚙️ </span> </h1>
<hr>

In [12]:
# LightGBM params
lgb_params= {
      'boosting_type':'dart',
      'objective' :'regression',
      'metric':'rmse',
      'tree_learner':'serial',
      'n_estimators':1000,
      'num_leaves':64,
      'max_depth':8,
      'learning_rate':0.1,
      'subsample':0.8,
      'feature_fraction':0.6,
      'reg_alpha':0.1,
      'reg_lambda':0.1,
      'random_state':2023
}

# <span class="title-section w3-xxlarge" id="training"> Training 🏋️</span>
<hr>

In [13]:
model = lgb.LGBMRegressor(**lgb_params)

df_importance_list = []

kfold =KFold(n_splits=5, shuffle=True, random_state=2022)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train_df[feature_names], train_df[ycol])):
    X_train = train_df.iloc[trn_idx][feature_names]
    Y_train = train_df.iloc[trn_idx][ycol]
    X_val = train_df.iloc[val_idx][feature_names]
    Y_val = train_df.iloc[val_idx][ycol]
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=25,
                          eval_metric='rmse',
                          early_stopping_rounds=50)
    joblib.dump(lgb_model, f'lgb_{fold_id}.pkl')
    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)
    del lgb_model, X_train, Y_train, X_val, Y_val
    gc.collect()



[25]	train's rmse: 0.106145	valid's rmse: 0.109331
[50]	train's rmse: 0.098508	valid's rmse: 0.102922
[75]	train's rmse: 0.0820114	valid's rmse: 0.0876806
[100]	train's rmse: 0.0800626	valid's rmse: 0.0858922
[125]	train's rmse: 0.0775077	valid's rmse: 0.0836861
[150]	train's rmse: 0.0775025	valid's rmse: 0.0838539
[175]	train's rmse: 0.0765077	valid's rmse: 0.0831729
[200]	train's rmse: 0.076898	valid's rmse: 0.0836047
[225]	train's rmse: 0.0775941	valid's rmse: 0.0842985
[250]	train's rmse: 0.0770864	valid's rmse: 0.0838552
[275]	train's rmse: 0.0762218	valid's rmse: 0.0832675
[300]	train's rmse: 0.0746972	valid's rmse: 0.0822194
[325]	train's rmse: 0.0742877	valid's rmse: 0.0821488
[350]	train's rmse: 0.0736072	valid's rmse: 0.0818463
[375]	train's rmse: 0.073654	valid's rmse: 0.0819849
[400]	train's rmse: 0.0733408	valid's rmse: 0.0818431
[425]	train's rmse: 0.0737102	valid's rmse: 0.082265
[450]	train's rmse: 0.0730541	valid's rmse: 0.0818389
[475]	train's rmse: 0.0720367	valid's 

# <span class="title-section w3-xxlarge" id="first_infer">Testing 🔥</span>
<hr>

In [14]:
models = []
lgb_model_0 = joblib.load('./lgb_0.pkl')
lgb_model_1 = joblib.load('./lgb_1.pkl')
lgb_model_2 = joblib.load('./lgb_2.pkl')
lgb_model_3 = joblib.load('./lgb_3.pkl')
lgb_model_4 = joblib.load('./lgb_4.pkl')
models.append(lgb_model_0)
models.append(lgb_model_1)
models.append(lgb_model_2)
models.append(lgb_model_3)
models.append(lgb_model_4)

Average_preds= sum([model.predict(test_df[feature_names], verbose=1) for model in models]) / len(models)

# <span class="title-section w3-xxlarge" id="submit"> Submitting to Kaggle 🇰</span>
<hr>

In [15]:
preds = pd.DataFrame(test_df, columns=['id'])
preds['result'] = pd.DataFrame(Average_preds, columns=['result'])
preds.to_csv('submission.csv', index = False)