<center><img src="https://pbs.twimg.com/media/EGo-3oHXUAA9E9D.jpg"></center>

<center><h1>How Electricity Reaches Our Homes </h1></center>

# <span class="title-section w3-xxlarge" id="imports"> Importing Libraries 📚</span>
<hr>

In [1]:
import pandas as pd 
import os, gc
import glob
import lightgbm as lgb
import xgboost
from sklearn.linear_model import LinearRegression
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler,RobustScaler
import joblib
from sklearn.metrics import log_loss


# <span class="title-section w3-xxlarge" id="loading"> Loading Meta Data 🗂️ </span>
<hr>

In [2]:
train_df = pd.read_csv('/kaggle/input/tfug-mysuruelectricity-generation-prediction/train.csv')
test_df = pd.read_csv('/kaggle/input/tfug-mysuruelectricity-generation-prediction/test.csv')
submission = pd.read_csv('/kaggle/input/tfug-mysuruelectricity-generation-prediction/sample_submission.csv')

# <span class="title-section w3-xxlarge" id="data_pipeline"> Data Pipeline 🔧 </span>
<hr>

In [3]:
## to get the numeric representation of days
train_df['obs_day'] = pd.factorize(train_df.iloc[: , -2])[0]
test_df['obs_day'] = pd.factorize(test_df.iloc[: , -2])[0]

In [4]:
## fix the problem of NaN by replacing NaN with 0, then replacing 0 with the median of that column 
def replace_median(train_df,features):
    train_df=train_df.fillna(0) 
    for f in features:
        train_df[f]=train_df[f].replace(0,train_df[f].mean())
    return train_df

In [5]:
ycol = 'output_gen'
feature_names = list(filter(lambda x: x not in [ycol, 'id'], train_df.columns))
train_df=replace_median(train_df,feature_names)

In [6]:
scaler = MinMaxScaler()
train_df[feature_names] = scaler.fit_transform(train_df[feature_names])
test_df[feature_names] = scaler.transform(test_df[feature_names])

<h1> <span class="title-section w3-xxlarge" id="the_models"> Models Configuration ⚙️ </span> </h1>
<hr>

In [7]:
lgb_params_1 = {
      'boosting_type':'gbdt',
      'objective' :'regression',
      'metric':'rmse',
      'tree_learner':'serial',
      'n_estimators':1000,
      'num_leaves':64,
      'max_depth':8,
      'learning_rate':0.1,
      'subsample':0.8,
      'feature_fraction':0.6,
      'reg_alpha':0.1,
      'reg_lambda':0.1,
      'random_state':2023
}

lgb_params_2 = {
    'early_stopping_rounds': 50,
    'objective': 'regression',
    'metric': 'rmse',
    'n_estimators':1000,
    'boosting_type': 'gbdt',
    'max_depth': 5,
    'verbose': -1,
    'max_bin':600,
    'min_data_in_leaf':50,
    'learning_rate': 0.03,
    'subsample': 0.7,
    'subsample_freq': 1,
    'feature_fraction': 1,
    'lambda_l1': 0.5,
    'lambda_l2': 2,
    'seed':2023,
    'feature_fraction_seed': 2023,
    'bagging_fraction_seed': 2023,
    'drop_seed': 2023,
    'data_random_seed': 2023,
    'extra_trees': True,
    'extra_seed': 2023,
    'zero_as_missing': True,
    "first_metric_only": True
         }

<h1> <span class="title-section w3-xxlarge" id="training"> Training 🏋️</span> </h1>
<h3> 🚨 Double LGBMRegressor 🚨 </h3>

In [8]:
models = []
saved_models=[]
model1 = lgb.LGBMRegressor(**lgb_params_1)
model2 = lgb.LGBMRegressor(**lgb_params_2)
models.append(model1)
models.append(model2)
for model in models:
    print('')
    print(f'Model num :{models.index(model)}')
    kfold =KFold(n_splits=5, shuffle=True, random_state=2022)
    for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train_df[feature_names], train_df[ycol])):
        print(f'fold num :{fold_id}')
        X_train = train_df.iloc[trn_idx][feature_names]
        Y_train = train_df.iloc[trn_idx][ycol]
        X_val = train_df.iloc[val_idx][feature_names]
        Y_val = train_df.iloc[val_idx][ycol]
        lgb_model = model.fit(X_train,
                              Y_train,
                              eval_names=['train', 'valid'],
                              eval_set=[(X_train, Y_train), (X_val, Y_val)],
                              verbose=1000,
                              eval_metric='rmse',
                              early_stopping_rounds=50)
        saved_models.append(lgb_model)
        del lgb_model


Model num :0
fold num :0




[1000]	train's rmse: 1.10188	valid's rmse: 2.6333
fold num :1
[1000]	train's rmse: 1.07847	valid's rmse: 4.74224
fold num :2
[1000]	train's rmse: 1.11537	valid's rmse: 2.87208
fold num :3
[1000]	train's rmse: 1.13248	valid's rmse: 3.8859
fold num :4
[1000]	train's rmse: 1.12563	valid's rmse: 3.39725

Model num :1
fold num :0
[1000]	train's rmse: 6.97929	valid's rmse: 6.81601
fold num :1
[1000]	train's rmse: 6.92709	valid's rmse: 7.3565
fold num :2
[1000]	train's rmse: 6.77941	valid's rmse: 7.51745
fold num :3
[1000]	train's rmse: 6.67321	valid's rmse: 7.50376
fold num :4
[1000]	train's rmse: 6.843	valid's rmse: 7.1942


# <span class="title-section w3-xxlarge" id="first_infer">Testing 🔥</span>
<hr>

In [9]:
avg_pred=sum([model.predict(test_df[feature_names]) for model in saved_models])/len(saved_models)

# <span class="title-section w3-xxlarge" id="submit"> Submitting to Kaggle 🇰</span>
<hr>

In [10]:
sub = pd.read_csv('/kaggle/input/tfug-mysuruelectricity-generation-prediction/sample_submission.csv')
sub['output_gen'] = avg_pred
sub.to_csv('submission.csv', index=False)