In [None]:
import ast
import gc
from datetime import datetime, timedelta
from itertools import combinations

import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
import pytz
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, GroupKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from termcolor import colored

from utils_testing import optuna_logging
# UTC timezone object (not referenced anywhere else in the visible part of the notebook).
UTC = pytz.utc  

# Timezone used to timestamp the rows written to the feature-combination log.
timeZ_Kl = pytz.timezone('Asia/Kolkata')

In [None]:
# Load the interim train/test frames.
# NOTE(review): read_pickle can execute arbitrary code - only load files you produced yourself.
train_df = pd.read_pickle("../data/train_df_interim.pickle")
test_df = pd.read_pickle("../data/test_df_interim.pickle")

# Show (rows, columns) of both frames as the cell output.
(train_df.shape, test_df.shape)

In [None]:
# Columns excluded from the model inputs.
drop = ['SURV_DTE', 'sand_target_avg', 'fold']

# Column the model is trained to predict.
target = 'PCT_DESAT_TO_ORIG'

# Every remaining column is an independent feature.
indep = train_df.columns.difference([*drop, target])

# Untouched copy of the full feature set; `indep` itself is rebound later in the notebook.
indep_master = indep.copy()
indep

# LGBM

### 5-fold Grouped CV

In [None]:
# def lgb_eval_rmse(preds, y_true):
#     actual=y_true.get_label()
    
#     preds = np.where(preds>=1,1, preds)
#     preds = np.where(preds<=0,0, preds)
    
#     fold_rmse = np.sqrt(mean_squared_error(actual, preds))
    
#     return "lgb_rmse", fold_rmse, False

In [None]:
def train_lgb_model(train_df, lgb_params):
    """
    Cross-validate a LightGBM model over the folds stored in ``train_df.fold``.

    For every fold id from 0 to ``train_df.fold.max()`` a model is trained on the
    other folds with early stopping on the held-out fold. Held-out predictions are
    clipped to [0, 1] before the RMSE is computed.

    The feature columns and the target column come from the notebook-level
    ``indep`` and ``target`` variables, not from the arguments.

    Parameters
    ----------
    train_df : DataFrame with a ``fold`` column plus the feature and target columns.
    lgb_params : dict of parameters passed straight to ``lgb.train``.

    Returns
    -------
    (fold_iterations, fold_results, lgb_models_fold)
        best iteration of each fold, RMSE of each fold rounded to 5 decimals,
        and a dict mapping fold id -> trained booster.
    """
    max_boost_rounds = 100000  # upper bound only; early stopping picks the real count

    best_iterations = []
    rmse_per_fold = []
    models_by_fold = {}

    print("")
    fold_count = train_df.fold.max() + 1
    for fold_i in range(fold_count):

        held_out = train_df[train_df.fold == fold_i].copy()
        fit_part = train_df[train_df.fold != fold_i].copy()

        fit_set = lgb.Dataset(fit_part[indep], fit_part[target], free_raw_data=False)
        held_out_set = lgb.Dataset(held_out[indep], held_out[target],
                                   reference=fit_set, free_raw_data=False)

        np.random.seed(100)
        # NOTE(review): `early_stopping_rounds` and `verbose_eval` were removed from
        # lgb.train in LightGBM 4.0 - confirm the installed version still accepts them.
        booster = lgb.train(
            lgb_params,
            fit_set,
            num_boost_round=max_boost_rounds,
            valid_sets=held_out_set,
            early_stopping_rounds=50,
            verbose_eval=False,
        )

        # Clip the held-out predictions to the [0, 1] range before scoring.
        held_out_pred = np.clip(booster.predict(held_out[indep]), 0, 1)

        fold_rmse = np.sqrt(mean_squared_error(held_out[target], held_out_pred))
        fold_iteration = booster.best_iteration

        best_iterations.append(fold_iteration)
        rmse_per_fold.append(np.round(fold_rmse, 5))
        models_by_fold[fold_i] = booster

        print(f"Current fold: {fold_i}, iteration {fold_iteration}, RMSE {fold_rmse}")

    return best_iterations, rmse_per_fold, models_by_fold

### LGB optuna

In [None]:
def train_lgb_model_optuna(trial):
    """
    Optuna objective: sample one LightGBM parameter set and score it with fold CV.

    The sampled parameters are passed to ``train_lgb_model`` together with the
    notebook-level ``train_df``.

    Parameters
    ----------
    trial : optuna trial used to draw the hyper-parameter values.

    Returns
    -------
    Mean of the per-fold RMSE values returned by ``train_lgb_model``
    (the value Optuna minimises).
    """
    lgbm_param = {
        # Fixed settings
        'objective': 'regression',
        'metric': 'rmse',
        'verbose': -1,
        "boosting_type": "gbdt",
        # Tuned settings
        'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2**2, 2**6),
        "max_depth": trial.suggest_int("max_depth", 2, 8),
        'feature_fraction': trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        # LightGBM only applies bagging_fraction when bagging_freq is non-zero
        # (its default is 0), so without this entry the bagging_fraction search
        # above has no effect. Suggesting it also puts it into study.best_params.
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 3),
        "min_child_samples": trial.suggest_int("min_child_samples", 2, 25),
    }

    # Only the fold RMSEs are needed here; iterations and models are discarded.
    _, lgb_fold_results, _ = train_lgb_model(train_df=train_df,
                                             lgb_params=lgbm_param)

    avg_error = np.mean(lgb_fold_results)
    print("Avg.Fold results:", avg_error)

    return avg_error

In [None]:
# Optuna hyper-parameter tuning: minimise the value returned by train_lgb_model_optuna.
lgb_study = optuna.create_study(direction="minimize")
lgb_study.optimize(
    train_lgb_model_optuna,
    n_trials=1000,
    n_jobs=1,
    show_progress_bar=True,
    gc_after_trial=True,
)

# Write the best hyper-parameters and the best RMSE to the logging file
optuna_logging(model='lgb', study=lgb_study, indep=np.array(indep))

print("Number of finished trials: ", len(lgb_study.trials))
print("Best trial:", lgb_study.best_trial.number)
print(f"Best Value: {lgb_study.best_trial.value}")
print("Params: ")
lgb_study.best_params

In [None]:
# Load every logged run: the hyper-parameters and the best RMSE they reached.
filename = "../Optuna_logging/lgb_optuna_logging.csv"
temp = pd.read_csv(filename)
temp

In [None]:
# Lowest RMSE across all logged runs.
best_RMSE = temp.best_RMSE.min()

# Rebuild the parameter dict stored as text on the first row that reached it.
# NOTE(review): eval() executes whatever text is in the CSV - only use it on a log
# file you trust, or switch to ast.literal_eval if the column is a plain dict literal.
lgb_params = eval(temp.loc[temp.best_RMSE == best_RMSE, 'best_param'].iloc[0])

print(f"The parameter corresponding to the best RMSE {best_RMSE}")
lgb_params

# Indep combination

In [None]:
def get_indep_combination(indep_all_combo, total_combinations_to_try):
    """
    Cross-validate the LGB model on randomly chosen feature combinations and log
    each new best combination to ../indep_combo/lgb_best_indep_combo.csv.

    Parameters
    ----------
    indep_all_combo : sequence of feature-name lists to sample from.
    total_combinations_to_try : number of combinations drawn without replacement;
        must not exceed ``len(indep_all_combo)``.

    Side effects
    ------------
    * Rebinds the notebook-level ``indep`` for every combination so that
      ``train_lgb_model`` trains on it; after the call ``indep`` holds the LAST
      combination tried, not the best one.
    * Rebinds the notebook-level ``overall_best`` to the contents of the log file.
    * Appends a row (Date, indep, params, rmse) to the log file whenever a
      combination beats the best mean fold RMSE seen so far in this call.

    Returns
    -------
    None
    """
    # `indep` is global so the change is visible inside train_lgb_model.
    global indep
    global overall_best

    combo_log_path = "../indep_combo/lgb_best_indep_combo.csv"

    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 2**3,
        'learning_rate': 0.04,
        'feature_fraction': 1,
        'bagging_fraction': 0.9,
        'bagging_freq': 1,
        'min_child_samples': 20,
        'verbose': -1
    }

    # reading the iterations ran so far
    overall_best = pd.read_csv(combo_log_path)

    random_index = np.random.choice(len(indep_all_combo),
                                    total_combinations_to_try,
                                    replace=False)

    best = 10000  # sentinel larger than any RMSE expected here
    for i, indep_ind in enumerate(random_index):
        indep = indep_all_combo[indep_ind]
        print(f"{i}/{total_combinations_to_try}")

        fold_iterations, fold_results, _ = train_lgb_model(train_df=train_df,
                                                           lgb_params=lgb_params)
        mean_fold_rmse = np.mean(fold_results)
        avg_iteration = int(np.mean(fold_iterations))

        print("Fold iterations:", fold_iterations)
        print("Average iteration:", avg_iteration)
        print("Fold results:", fold_results)
        print("Avg.Fold results:", mean_fold_rmse)

        # Printing the current best
        if mean_fold_rmse < best:
            best = mean_fold_rmse
            print(colored(f"New best {best}", 'green'))

            # Re-read the log right before writing so the new row is added to its
            # current contents.
            overall_best = pd.read_csv(combo_log_path)
            # 'params' is stored because the cell that reads this log back expects
            # a `params` column next to `indep`.
            best_indep = pd.DataFrame({'Date': datetime.now(timeZ_Kl).strftime('%d-%m-%Y %H:%M:%S'),
                                       'indep': str(indep),
                                       'params': str(lgb_params),
                                       'rmse': [best]})

            print(colored("writing the indep combos to disk", 'blue'))
            # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
            overall_best = (pd.concat([overall_best, best_indep])
                            .drop_duplicates()
                            .reset_index(drop=True))
            overall_best.to_csv(combo_log_path, index=False)

        else:
            print(colored(f"Best so far {best}", 'yellow'))

In [None]:
print(f"Total actual features: {len(indep_master)}")

# Size of each feature subset to evaluate.
features_2_use = 31
comb_features = combinations(indep_master, features_2_use)

# One list of feature names per possible subset of that size.
# NOTE(review): every combination is materialised in memory; the count grows
# combinatorially with the number of features.
indep_all_combo = [list(indep_combo) for indep_combo in comb_features]

print(f"Total features to use: {features_2_use}")
print(f"Total combo possible : {len(indep_all_combo)}")

In [None]:
# Try 20 randomly drawn feature combinations; new bests are logged to disk by the function.
get_indep_combination(
    indep_all_combo=indep_all_combo,
    total_combinations_to_try=20,
)

In [None]:
# Extract the parameters and the independent features with the best metric

# How many days back to look in the log (0 = combinations logged today).
days_before = 0

best_indep = pd.read_csv("../indep_combo/lgb_best_indep_combo.csv")
# The log stores 'Date' day-first ('%d-%m-%Y %H:%M:%S'). Parse it with that exact
# format so that e.g. 05-03-2022 is not read month-first. Rows that do not match
# become NaT and are excluded by the date filter below.
best_indep['Date'] = pd.to_datetime(best_indep.Date,
                                    format='%d-%m-%Y %H:%M:%S',
                                    errors='coerce').dt.date.astype('str')

# Use the same timezone the log timestamps are written in (timeZ_Kl); a naive
# local "now" could land on a different calendar day.
today_date = (datetime.now(timeZ_Kl)-timedelta(days=days_before)).strftime('%Y-%m-%d')
print(today_date)

condition1 = (best_indep.Date==today_date)
best_indep = best_indep[condition1].reset_index(drop=True)
if best_indep.empty:
    raise ValueError(f"No feature combinations logged on {today_date}; "
                     "increase days_before or run get_indep_combination first")

condition2 = (best_indep.rmse == best_indep.rmse.min())
best_row = best_indep[condition2].iloc[0]

# ast.literal_eval only accepts Python literals, so unlike eval() it cannot run
# arbitrary code stored in the CSV.
indep = ast.literal_eval(best_row['indep'])

# The log may have no 'params' column (or an empty value) for this row; in that
# case keep whatever lgb_params is already defined instead of failing.
if 'params' in best_row.index and pd.notna(best_row['params']):
    lgb_params = ast.literal_eval(best_row['params'])
else:
    print("No params logged for the best combination; lgb_params left unchanged")

print(f"Best RMSE : {best_indep.rmse.min()}")
print("Best indep size", len(indep))


### local

In [None]:
# Baseline parameter set for the local cross-validation run.
lgb_params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    num_leaves=8,
    learning_rate=0.04,
    feature_fraction=1,
    bagging_fraction=0.9,
    bagging_freq=1,
    min_child_samples=20,
    verbose=-1,
)

# Re-assert the fixed settings; this matters when lgb_params was loaded from a log
# that only holds the tuned values.
lgb_params.update(
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    verbose=-1,
)

fold_iterations, fold_results, lgb_models_fold = train_lgb_model(
    train_df=train_df,
    lgb_params=lgb_params,
)

# Mean best iteration across folds; reused later as the round count of the prod model.
avg_iteration = int(np.mean(fold_iterations))
print("Fold iterations:", fold_iterations)
print("Average iteration:", avg_iteration)
print("Fold results:", fold_results)
print("Avg.Fold results:", np.mean(fold_results))

In [None]:
# Fold whose model is inspected (must be a key of lgb_models_fold).
ind = 3

# Feature importances of that fold model (feature_importance() defaults), largest first.
lgb_imp = (
    pd.DataFrame({'feature': indep,
                  'fea_imp': lgb_models_fold[ind].feature_importance()})
    .sort_values(['fea_imp'], ascending=False)
    .reset_index(drop=True)
)
lgb_imp

# Fold Ensemble predictions

In [None]:
def fold_ensemble(model_list, test):
    """
    Average the test-set predictions of all fold models.

    Each model predicts on ``test[indep]`` (``indep`` is the notebook-level feature
    list), its predictions are clipped to [0, 1], and the element-wise mean over
    the models is returned.

    Parameters
    ----------
    model_list : dict mapping a fold key to a fitted model exposing ``predict``.
    test : DataFrame containing the ``indep`` columns.

    Returns
    -------
    Array with one averaged prediction per row of ``test``.
    """
    clipped_by_model = []
    for model_key, fold_model in model_list.items():
        print(f"Prediction for model {model_key}")

        raw_pred = fold_model.predict(test[indep])
        clipped_by_model.append(np.clip(raw_pred, 0, 1))

    return np.array(clipped_by_model).mean(axis=0)
        
# lgb_prod_prediction = fold_ensemble(model_list=lgb_models_fold, test=test_df)
# lgb_prod_prediction

### Prod

In [None]:
# Production model: trained on every training row, so there is no validation set
# and no early stopping.
lgb_train_prod = lgb.Dataset(train_df[indep], train_df[target], free_raw_data=False)

# Number of boosting rounds = average best iteration found during cross-validation.
lgb_best_iteration = avg_iteration
print(f"Training for {lgb_best_iteration}")

np.random.seed(100)
lgb_model_prod = lgb.train(
    lgb_params,
    lgb_train_prod,
    num_boost_round=lgb_best_iteration,
)


In [None]:
# Predict the test set with the production model and clip the result to [0, 1].
lgb_prod_prediction = np.clip(lgb_model_prod.predict(test_df[indep]), 0, 1)

In [None]:
# Single-column submission frame holding the clipped production predictions.
LGB_submission = pd.Series(lgb_prod_prediction, name='PCT_DESAT_TO_ORIG').to_frame()
LGB_submission

In [None]:
# Write the submission without the index column. The output path (including the
# "_20" suffix) is hard-coded, so re-running this cell overwrites the same file.
LGB_submission.to_csv("../sub/LGB_sub_20.csv", index=False)

# Model Explainability using SHAP values

In [None]:
# SHAP is imported here rather than with the other imports at the top of the notebook,
# so it is only required when this section is run.
import shap
# Load SHAP's JavaScript so the interactive force plot below can render in the notebook.
shap.initjs()

In [None]:
# Explain the fold-0 model with SHAP.
explainer = shap.TreeExplainer(lgb_models_fold[0])

# Keep the explained rows in one re-indexed frame so that position i in
# `shap_values` and row i of the feature values refer to the same observation.
# (Indexing the full train_df with .loc[i] can pick a different row than the
# i-th row of this filtered subset.)
# NOTE(review): the explainer wraps the fold-0 model while the rows exclude
# fold 1 - confirm which fold was intended here.
shap_rows = train_df.loc[train_df.fold!=1, indep].reset_index(drop=True)
shap_values = explainer.shap_values(shap_rows)

# Row (position within shap_rows) to explain.
i = 10
shap.force_plot(explainer.expected_value,
                shap_values[i],
                features=shap_rows.loc[i],
                feature_names=shap_rows.columns)

In [None]:
# `shap_values` was computed on the re-indexed `fold != 1` rows only, so the feature
# matrix passed here must be that same subset; the full train_df has a different
# number of rows and would not line up with the SHAP values.
shap.summary_plot(shap_values,
                  features=train_df.loc[train_df.fold!=1, indep].reset_index(drop=True),
                  feature_names=train_df[indep].columns)