In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, GroupKFold
from catboost import CatBoostRegressor, Pool, cv
from datetime import datetime, timedelta
import gc
import optuna
from utils_testing import optuna_logging
from itertools import combinations
from termcolor import colored
import pytz
# UTC timezone object; no other cell visible in this notebook references it.
UTC = pytz.utc  

# Asia/Kolkata timezone, used to timestamp rows appended to the feature-combination log.
timeZ_Kl = pytz.timezone('Asia/Kolkata')

In [None]:
# Load the interim train and test frames from their pickle files.
# NOTE(review): unpickling executes code embedded in the file; only load trusted pickles.
train_df, test_df = (
    pd.read_pickle("../data/train_df_interim.pickle"),
    pd.read_pickle("../data/test_df_interim.pickle"),
)

# Cell output: (rows, columns) of each frame.
(train_df.shape, test_df.shape)

In [None]:
# Columns that are never used as model features.
drop = ['SURV_DTE', 'sand_target_avg', 'CMPL_FAC_ID', 'fold']

# Column the models are trained to predict.
target = 'PCT_DESAT_TO_ORIG'

# Feature set: every training column except the dropped ones and the target.
indep = train_df.columns.difference([*drop, target])

# `indep` is rebound later by the feature-combination search, so keep an
# untouched copy of the full feature list.
indep_master = indep.copy()
indep

# CatBoost

### 5-fold grouped CV

In [None]:
def train_cat_model(train_df, cat_params, features=None):
    """
    Train one CatBoost regressor per fold and report the out-of-fold RMSE.

    For every fold number ``k`` in ``0 .. train_df.fold.max()``, the rows with
    ``fold == k`` are held out as the evaluation set (used for early stopping and
    scoring) and all remaining rows are used for training. Predictions are
    clipped to the range [0, 1] before the RMSE is computed.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain an integer ``fold`` column, the column named by the
        module-level ``target`` and every feature column.
    cat_params : dict
        Keyword arguments forwarded to ``CatBoostRegressor``. ``iterations``,
        ``early_stopping_rounds`` and ``verbose`` are set here and must not be
        present in the dict.
    features : list-like of str, optional
        Feature columns to train on. When omitted, the module-level ``indep``
        is used (the original behaviour).

    Returns
    -------
    fold_iterations : list
        ``best_iteration_`` of each fold model.
    fold_results : list
        Validation RMSE of each fold, rounded to 5 decimals.
    cat_models_fold : dict
        Fitted model of each fold, keyed by fold number.
    """
    if features is None:
        features = indep

    # Upper bound on boosting rounds; early stopping decides the actual count.
    max_rounds = 1000000

    fold_iterations = []
    fold_results = []
    cat_models_fold = {}

    print("")
    for fold_i in range(0, train_df.fold.max()+1):

        # Boolean-mask indexing already returns new frames, so no extra copy is needed.
        train_fold = train_df[train_df.fold != fold_i]
        valid_fold = train_df[train_df.fold == fold_i]

        eval_dataset = Pool(valid_fold[features], valid_fold[target])

        # NOTE(review): this resets NumPy's global RNG on every fold; CatBoost
        # presumably takes its own seed from `random_seed` - confirm.
        np.random.seed(100)
        cat_local_model = CatBoostRegressor(**cat_params,
                                            iterations=max_rounds,
                                            early_stopping_rounds=50,
                                            verbose=0)

        cat_local_model.fit(train_fold[features],
                            train_fold[target],
                            eval_set=eval_dataset)

        # Clamp predictions into [0, 1] before scoring.
        cat_local_prediction = np.clip(cat_local_model.predict(valid_fold[features]), 0, 1)

        fold_rmse = np.sqrt(mean_squared_error(valid_fold[target], cat_local_prediction))
        fold_iteration = cat_local_model.best_iteration_

        fold_iterations.append(fold_iteration)
        # The stored score is rounded; the printed one below is not.
        fold_results.append(np.round(fold_rmse, 5))
        cat_models_fold[fold_i] = cat_local_model

        print(f"Current fold: {fold_i}, iteration {fold_iteration}, RMSE {fold_rmse}")

    return fold_iterations, fold_results, cat_models_fold


### CAT optuna

In [None]:
def train_cat_model_optuna(trial):
    """
    Optuna objective for the CatBoost model.

    Samples learning rate, depth, subsample and colsample_bylevel from `trial`,
    runs the fold training on the module-level `train_df` and returns the mean
    of the per-fold RMSE values (the quantity the study minimises).
    """
    # Sample the tunable hyper-parameters first, in a fixed order.
    learning_rate = trial.suggest_float("learning_rate", 0.006, 0.05, log=True)
    depth = trial.suggest_int("depth", 4, 8)
    subsample = trial.suggest_float("subsample", 0.7, 1.0)
    colsample_bylevel = trial.suggest_float("colsample_bylevel", 0.7, 1.0)

    cat_param = {
        'task_type': "CPU",
        'eval_metric': 'RMSE',
        'learning_rate': learning_rate,
        'depth': depth,
        'subsample': subsample,
        'colsample_bylevel': colsample_bylevel,
    }

    # Only the per-fold scores matter for the objective value.
    _, cat_fold_results, _ = train_cat_model(train_df=train_df, cat_params=cat_param)

    avg_error = np.mean(cat_fold_results)
    print("Avg.Fold results:", avg_error)
    return avg_error

In [None]:
# Optuna hyper-parameter search: 100 sequential trials minimising the mean fold RMSE.
cat_study = optuna.create_study(direction="minimize")
cat_study.optimize(
    train_cat_model_optuna,
    n_trials=100,
    n_jobs=1,
    show_progress_bar=True,
    gc_after_trial=True,
)

# Hand the finished study to the project logging helper.
# NOTE(review): presumably this writes the Optuna log CSV read by the next cell - confirm in utils_testing.
optuna_logging(model='cat', study=cat_study, indep=indep)

# Short summary of the search; the best parameters are the cell output.
print("Number of finished trials: ", len(cat_study.trials))
print("Best trial:", cat_study.best_trial.number)
print("Best Value: {}".format(cat_study.best_trial.value))
print("Params: ")
cat_study.best_params

In [None]:
# Load the logged hyper-parameters and their best RMSE from the Optuna log file.
filename = "../Optuna_logging/cat_optuna_logging.csv"
temp = pd.read_csv(filename)
temp

In [None]:
# Lowest RMSE found in the log, and the parameter set stored on that row.
best_RMSE = temp["best_RMSE"].min()
# NOTE(review): eval() executes whatever text the CSV contains; only run this on
# log files you trust (consider ast.literal_eval if the column is a plain dict literal).
cat_params = eval(temp.loc[temp["best_RMSE"] == best_RMSE, "best_param"].values[0])
print(f"The parameter corresponding to the best RMSE {best_RMSE}")
cat_params

# Independent feature combinations

In [None]:
def get_indep_combination(indep_all_combo, total_combinations_to_try):
    """
    Random search over feature subsets for the CatBoost model.

    Draws `total_combinations_to_try` distinct entries from `indep_all_combo`,
    runs the fold training (`train_cat_model`) on each one with a fixed set of
    CatBoost parameters and, whenever the mean fold RMSE improves on the best
    value seen during this call, appends a row (timestamp, feature list, RMSE,
    parameters) to ../indep_combo/cat_best_indep_combo.csv.

    Parameters
    ----------
    indep_all_combo : sequence of list of str
        Candidate feature subsets.
    total_combinations_to_try : int
        Number of subsets to evaluate. np.random.choice raises ValueError when
        this exceeds len(indep_all_combo), because sampling is without replacement.

    Returns
    -------
    None

    Side effects
    ------------
    Rebinds the module-level `indep` to each subset in turn (it is left at the
    LAST subset tried, not the best one) and the module-level `overall_best`
    to the most recently loaded/written log frame.
    """
    # `indep` is global so the training function picks up each candidate subset.
    global overall_best, indep

    combo_log_path = "../indep_combo/cat_best_indep_combo.csv"

    cat_params = {'learning_rate': 0.04,
                  'depth': 5,
                  'subsample': 0.9,
                  'colsample_bylevel': 0.9,
                  'task_type': "CPU",
                  'eval_metric': 'RMSE'}

    # Reading the log up front fails fast, before any training, if the file is missing.
    overall_best = pd.read_csv(combo_log_path)

    random_index = np.random.choice(len(indep_all_combo),
                                    total_combinations_to_try,
                                    replace=False)

    best = float('inf')
    for i, indep_ind in enumerate(random_index):
        indep = indep_all_combo[indep_ind]
        print(f"{i}/{total_combinations_to_try}")

        fold_iterations, fold_results, _ = train_cat_model(train_df=train_df,
                                                           cat_params=cat_params)
        mean_rmse = np.mean(fold_results)
        avg_iteration = int(np.mean(fold_iterations))

        print("Fold iterations:", fold_iterations)
        print("Average iteration:", avg_iteration)
        print("Fold results:", fold_results)
        print("Avg.Fold results:", mean_rmse)

        if mean_rmse < best:
            best = mean_rmse
            print(colored(f"New best {best}", 'green'))

            # Re-read the log so rows written since this call started are kept.
            overall_best = pd.read_csv(combo_log_path)
            # `params` is stored as well because the cell that later selects the
            # best combination reads a `params` column from this file.
            best_indep = pd.DataFrame({'Date': datetime.now(timeZ_Kl).strftime('%d-%m-%Y %H:%M:%S'),
                                       'indep': str(indep),
                                       'rmse': [best],
                                       'params': str(cat_params)})

            print(colored("writing the indep combos to disk", 'blue'))
            # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
            overall_best = (pd.concat([overall_best, best_indep])
                            .drop_duplicates()
                            .reset_index(drop=True))
            overall_best.to_csv(combo_log_path, index=False)

        else:
            print(colored(f"Best so far {best}", 'yellow'))

In [None]:
print(f"Total actual features: {len(indep_master)}")

features_2_use=30
comb_features = combinations(indep_master, features_2_use)

# WARNING: every subset is held in memory. The count is
# C(len(indep_master), features_2_use), which grows very quickly with the
# number of features - check the size before running on a wide frame.
# Iterate the iterator directly instead of wrapping it in list() first, so the
# full set of combinations is materialised once rather than twice.
indep_all_combo=[]
for indep_combo in comb_features:
    indep_all_combo.append(list(indep_combo))
    
print(f"Total features to use: {features_2_use}")
print(f"Total combo possible : {len(indep_all_combo)}")

In [None]:
# Evaluate 20 randomly drawn feature subsets; improvements are written to disk by the function.
get_indep_combination(
    indep_all_combo=indep_all_combo,
    total_combinations_to_try=20,
)

In [None]:
# Extract the parameters and the independent features with the best metric
# among the combinations logged `days_before` day(s) ago.

days_before = 1

best_indep = pd.read_csv("../indep_combo/cat_best_indep_combo.csv")
# get_indep_combination writes dates as day-month-year ('%d-%m-%Y %H:%M:%S').
# Parse with that exact format so e.g. 05-03-2024 is read as 5 March, not 3 May.
best_indep['Date'] = pd.to_datetime(best_indep.Date,
                                    format='%d-%m-%Y %H:%M:%S').dt.date.astype('str')

# The log is stamped in Asia/Kolkata time, so compute the reference date in
# the same timezone; a naive local clock can be off by one day.
today_date = (datetime.now(timeZ_Kl)-timedelta(days=days_before)).strftime('%Y-%m-%d')
print(today_date)

condition1 = (best_indep.Date==today_date)
best_indep = best_indep[condition1].reset_index(drop=True)
if best_indep.empty:
    raise ValueError(f"No feature combinations were logged on {today_date}")

condition2 = (best_indep.rmse == best_indep.rmse.min())
# NOTE(review): eval() executes whatever text the CSV contains; only use a
# trusted log file here.
indep = eval(best_indep[condition2].indep.values[0])

# Rows may have been logged without a `params` value; in that case keep the
# `cat_params` already defined in the kernel instead of crashing.
if 'params' in best_indep.columns and pd.notna(best_indep[condition2].params.values[0]):
    cat_params = eval(best_indep[condition2].params.values[0])
else:
    print("No params logged for the best combination; keeping the current cat_params")

print(f"Best RMSE : {best_indep.rmse.min()}")
print("Best indep size", len(indep))
print("Best cat params", cat_params)
indep

### local

In [None]:
# Hand-picked CatBoost settings for the local cross-validation run.
# NOTE: this rebinds `cat_params`, replacing whatever earlier cells loaded into it.
cat_params = {
    'learning_rate': 0.04,
    'depth': 5,
    'subsample': 0.9,
    'colsample_bylevel': 0.9,
    'task_type': "CPU",
    'eval_metric': 'RMSE',
}

fold_iterations, fold_results, cat_models_fold = train_cat_model(
    train_df=train_df,
    cat_params=cat_params,
)

# The average best iteration is reused later as the round count of the production model.
avg_iteration = int(np.mean(fold_iterations))
print("Fold iterations:", fold_iterations)
print("Average iteration:", avg_iteration)
print("Fold results:", fold_results)
print("Avg.Fold results:", np.mean(fold_results))

In [None]:
# Fold whose model is inspected.
ind = 2

# Feature importances of that fold's model, most important first.
cat_imp = (
    pd.DataFrame({
        'features': cat_models_fold[ind].feature_names_,
        'imp': cat_models_fold[ind].feature_importances_,
    })
    .sort_values(['imp'], ascending=False)
    .reset_index(drop=True)
)
cat_imp

# Fold Ensemble predictions

In [None]:
def fold_ensemble(model_list, test, features=None):
    """
    Ensemble prediction on `test`: the mean of the per-fold model predictions.

    Each model's prediction is clipped to [0, 1] before averaging.

    Parameters
    ----------
    model_list : dict
        Fitted models keyed by fold number; each must expose ``predict``.
    test : pandas.DataFrame
        Frame containing the feature columns.
    features : list-like of str, optional
        Feature columns passed to each model. When omitted, the module-level
        ``indep`` is used (the original behaviour).

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the clipped fold predictions.

    Raises
    ------
    ValueError
        If `model_list` is empty (the mean would otherwise be NaN).
    """
    if not model_list:
        raise ValueError("model_list is empty; there is nothing to ensemble")
    if features is None:
        features = indep

    ens_pred = []
    for i, model in model_list.items():
        print(f"Prediction for model {i}")
        # Clamp each fold's prediction into [0, 1] before averaging.
        ens_pred.append(np.clip(model.predict(test[features]), 0, 1))

    ensemble_prediction = np.array(ens_pred).mean(axis=0)

    return ensemble_prediction
        
# cat_prod_prediction = fold_ensemble(model_list=cat_models_fold, test=test_df)
# cat_prod_prediction

### Prod

In [None]:
np.random.seed(100)

# Production model: same parameters, trained for the average number of
# rounds the CV folds needed.
# NOTE(review): fit() below receives no eval_set, so early_stopping_rounds
# presumably has nothing to monitor here - confirm against the CatBoost docs.
cat_prod_model = CatBoostRegressor(
    **cat_params,
    iterations=avg_iteration,
    early_stopping_rounds=50,
)

# Training on overall dataset
cat_prod_model.fit(train_df[indep], train_df[target])


In [None]:
# Predict on the test frame and clamp the result into [0, 1].
cat_prod_prediction = np.clip(cat_prod_model.predict(test_df[indep]), 0, 1)
cat_prod_prediction

In [None]:
# One-column submission frame holding the clipped production predictions.
CAT_submission = pd.Series(cat_prod_prediction, name='PCT_DESAT_TO_ORIG').to_frame()
CAT_submission.to_csv("../sub/CAT_sub_47.csv", index=False)
CAT_submission

# Model Explainability using SHAP values

In [None]:
# SHAP is imported here, at first use, rather than in the top import cell.
import shap
# Initialise SHAP's JavaScript visualisation support for the notebook.
shap.initjs()

# Alternative kept for reference: explain a single fold model instead of the production model.
# explainer = shap.TreeExplainer(cat_models_fold[0])
# shap_values = explainer.shap_values(train_df.loc[train_df.fold!=1, indep].reset_index(drop=True))

# Explain the production model on the full training features. The index is
# reset, so row k of shap_values corresponds to the k-th row of train_df by position.
explainer = shap.TreeExplainer(cat_prod_model)
shap_values = explainer.shap_values(train_df[indep].reset_index(drop=True))

In [None]:
# Position of the training row to explain.
i =10
# shap_values was computed on a frame with a reset index, so shap_values[i] is
# the i-th row by POSITION. Select the feature row by position too (.iloc);
# a label lookup (.loc[i]) picks a different row, or raises KeyError, whenever
# train_df's index is not 0..n-1.
shap.force_plot(explainer.expected_value, 
                shap_values[i], 
                features=train_df[indep].iloc[i], 
                feature_names=train_df[indep].columns)


In [None]:
# Global view: SHAP value distribution per feature over the whole training set.
shap.summary_plot(
    shap_values,
    features=train_df[indep].reset_index(drop=True),
    feature_names=train_df[indep].columns,
)