In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, GroupKFold
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
import gc
import optuna
from utils_testing import optuna_logging
from itertools import combinations
from termcolor import colored
import pytz
UTC = pytz.utc  

timeZ_Kl = pytz.timezone('Asia/Kolkata')

In [None]:
# Load the interim train/test frames produced by an earlier step.
# NOTE(review): read_pickle can execute arbitrary code; only load files you trust.
train_df, test_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("../data/train_df_interim.pickle",
                        "../data/test_df_interim.pickle")
)

train_df.shape, test_df.shape

In [None]:
# Map the -999 sentinel to 0 in both frames.
# The source tuple is built before unpacking, so both frames are read before rebinding.
train_df, test_df = (frame.replace({-999: 0}) for frame in (train_df, test_df))

In [None]:
# Label column to predict.
target = 'PCT_DESAT_TO_ORIG'

# Columns that must never be used as model inputs.
drop = ['SURV_DTE', 'sand_target_avg', 'CMPL_FAC_ID', 'fold']

# Every remaining column (excluding the label) is treated as a feature.
excluded_columns = drop + [target]
indep = train_df.columns.difference(excluded_columns)

# Keep an untouched copy: `indep` is rebound later during the feature-subset search.
indep_master = indep.copy()
indep

In [None]:
# scaler = MinMaxScaler()
# scaler.fit(train_df[indep])

# train_df[indep] = scaler.transform(train_df[indep])
# test_df[indep] = scaler.transform(test_df[indep])

# Tabnet

### 5-fold grouped CV

In [None]:
def train_tab_model(train_df, tab_params, tab_fit_params):
    """Train one TabNetRegressor per CV fold and score it on the held-out fold.

    Folds come from the ``fold`` column of ``train_df``: for each fold id from
    0 to ``train_df.fold.max()`` the rows with that id are the validation set
    and all other rows are the training set.

    The feature columns and the label column are read from the notebook-level
    globals ``indep`` and ``target`` at call time.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame holding the ``fold`` column, the ``indep`` columns and ``target``.
    tab_params : dict
        Keyword arguments passed to the ``TabNetRegressor`` constructor.
    tab_fit_params : dict
        Extra keyword arguments passed to ``TabNetRegressor.fit`` alongside
        the train/eval arrays built here.

    Returns
    -------
    fold_iterations : list
        ``best_epoch`` of the fitted model for each fold.
    fold_results : list
        Validation RMSE for each fold, rounded to 5 decimals.
    tab_models_fold : dict
        Maps fold id to the fitted model of that fold.
    """
    fold_iterations = []
    fold_results = []
    tab_models_fold = {}

    print("")
    # int() keeps range() working even if `fold` was stored as a float column.
    n_folds = int(train_df.fold.max()) + 1
    for fold_i in range(n_folds):

        train_fold = train_df[train_df.fold != fold_i].copy().reset_index(drop=True)
        valid_fold = train_df[train_df.fold == fold_i].copy().reset_index(drop=True)

        # Re-seed numpy before every fold so each fold starts from the same RNG state.
        np.random.seed(100)
        tab_local_model = TabNetRegressor(**tab_params)

        # The target is passed as a 2-D column ([[target]]) for both train and eval.
        tab_local_model.fit(**tab_fit_params,
                            X_train=train_fold[indep].values,
                            y_train=train_fold[[target]].values,
                            eval_set=[(valid_fold[indep].values,
                                       valid_fold[[target]].values)]
                           )

        tab_local_prediction = tab_local_model.predict(valid_fold[indep].values)

        # Predictions are limited to [0, 1] before scoring
        # (presumably the target is a fraction -- confirm).
        tab_local_prediction = np.clip(tab_local_prediction, 0, 1)

        fold_rmse = np.sqrt(mean_squared_error(valid_fold[target], tab_local_prediction))

        # Report the epoch that was actually selected, not the configured
        # max_epochs cap, so the log line agrees with the returned list.
        fold_iteration = tab_local_model.best_epoch

        fold_iterations.append(fold_iteration)
        fold_results.append(np.round(fold_rmse, 5))
        tab_models_fold[fold_i] = tab_local_model

        print(f"Current fold: {fold_i}, iteration {fold_iteration}, RMSE {fold_rmse}")

    return fold_iterations, fold_results, tab_models_fold


In [None]:
# tab_local_model = TabNetRegressor()

# tab_local_model.fit(X_train=train_df[indep].values,
#                     y_train=train_df[[target]].values, 
#                     max_epochs=10)


### TabNet hyper-parameter tuning with Optuna

In [None]:
def train_tab_model_optuna(trial):
    """Optuna objective: mean cross-validated RMSE for a sampled (n_d, n_a) pair.

    Samples ``n_d`` and ``n_a`` as integers in [4, 32], runs ``train_tab_model``
    on the notebook-level ``train_df`` with fixed fit settings, and returns the
    mean of the per-fold RMSE values.
    """
    model_kwargs = dict(
        n_d=trial.suggest_int("n_d", 4, 32),
        n_a=trial.suggest_int("n_a", 4, 32),
        # optimizer_params={'lr': 0.02},
        verbose=1,
        seed=42,
    )

    fit_kwargs = dict(
        batch_size=64,
        patience=20,
        max_epochs=1000,
        num_workers=8,
        eval_metric=['rmse'],
    )

    # Only the per-fold scores matter for the objective value.
    _, fold_scores, _ = train_tab_model(train_df=train_df,
                                        tab_params=model_kwargs,
                                        tab_fit_params=fit_kwargs)

    avg_error = np.mean(fold_scores)
    print("Avg.Fold results:", avg_error)
    return avg_error

In [None]:
# Optuna hyper-parameter tuning: minimise the mean fold RMSE returned by the objective.
tab_study = optuna.create_study(direction="minimize")
tab_study.optimize(
    train_tab_model_optuna,
    n_trials=1000,
    n_jobs=1,
    # timeout=600,
    show_progress_bar=True,
    gc_after_trial=True,
)

# Hand the finished study to the project logging helper.
optuna_logging(model='tab', study=tab_study, indep=indep)

best_trial = tab_study.best_trial
print("Number of finished trials: ", len(tab_study.trials))
print("Best trial:", best_trial.number)
print("Best Value: {}".format(best_trial.value))
print("Params: ")
tab_study.best_params

In [None]:
# Load every logged hyper-parameter set together with its best RMSE.
filename = "../Optuna_logging/tab_optuna_logging.csv"
temp = pd.read_csv(filepath_or_buffer=filename)
temp

In [None]:
# Pick the logged parameter set whose RMSE is the lowest (first row on ties).
best_RMSE = temp.best_RMSE.min()
is_best_row = temp.best_RMSE == best_RMSE
best_param_repr = temp.best_param[is_best_row].values[0]

# NOTE(review): eval() executes whatever text is stored in the CSV; this is only
# safe while the log file is trusted. Consider ast.literal_eval for plain dicts.
tab_params = eval(best_param_repr)
print(f"The parameter corresponding to the best RMSE {best_RMSE}")
tab_params

# Independent-feature (indep) combinations

In [None]:
def get_indep_combination(indep_all_combo, total_combinations_to_try):
    """Cross-validate TabNet on randomly sampled feature subsets and log each new best.

    For every sampled subset the notebook-level global ``indep`` is rebound to
    that subset (``train_tab_model`` reads it), the model is cross-validated,
    and whenever the mean fold RMSE beats the best value seen in this call the
    subset is appended to ``../indep_combo/tab_best_indep_combo.csv``.

    Model hyper-parameters come from the notebook-level global ``tab_params``.

    Parameters
    ----------
    indep_all_combo : list of list
        Candidate feature subsets.
    total_combinations_to_try : int
        Number of subsets to sample without replacement. Values larger than
        ``len(indep_all_combo)`` are reduced to that length.

    Returns
    -------
    None

    Side effects
    ------------
    * Rebinds the globals ``indep`` (left at the last subset tried) and
      ``overall_best`` (contents of the best-combo CSV).
    * Rewrites the best-combo CSV on every improvement.
    """
    tab_fit_params = {'batch_size':64,
                      'patience':20,
                      'max_epochs':1000,
                      'num_workers':8,
                      'eval_metric':['rmse']}

    best_combo_path = "../indep_combo/tab_best_indep_combo.csv"

    # reading the iterations ran so far
    global overall_best
    overall_best = pd.read_csv(best_combo_path)

    # declare the indep as global so the changes can be reflected in the training function
    global indep

    # Sampling without replacement cannot draw more items than exist.
    n_to_try = min(total_combinations_to_try, len(indep_all_combo))
    random_index = np.random.choice(len(indep_all_combo), n_to_try, replace=False)

    # Best mean RMSE seen during this call; any finite score improves on it.
    best = float("inf")
    for i, indep_ind in enumerate(random_index):
        indep = indep_all_combo[indep_ind]
        print(f"{i}/{n_to_try}")

        fold_iterations, fold_results, tab_models_fold = train_tab_model(train_df=train_df,
                                                                         tab_params=tab_params,
                                                                         tab_fit_params=tab_fit_params)
        mean_rmse = np.mean(fold_results)
        avg_iteration = int(np.mean(fold_iterations))

        print("Fold iterations:", fold_iterations)
        print("Average iteration:", avg_iteration)
        print("Fold results:", fold_results)
        print("Avg.Fold results:", mean_rmse)

        # Printing the current best
        if mean_rmse < best:
            best = mean_rmse
            print(colored(f"New best {best}", 'green'))

            # Re-read before writing so rows added to the file since the last
            # read are not overwritten.
            overall_best = pd.read_csv(best_combo_path)
            best_indep = pd.DataFrame({'Date':datetime.now(timeZ_Kl).strftime('%d-%m-%Y %H:%M:%S'),
                                       'indep': str(indep),
                                       'rmse': [best]})

            print(colored("writing the indep combos to disk", 'blue'))
            # DataFrame.append no longer exists in pandas >= 2.0; pd.concat works everywhere.
            overall_best = (pd.concat([overall_best, best_indep], ignore_index=True)
                            .drop_duplicates()
                            .reset_index(drop=True))
            overall_best.to_csv(best_combo_path, index=False)

        else:
            print(colored(f"Best so far {best}", 'yellow'))

In [None]:
print(f"Total actual features: {len(indep_master)}")

# Size of each feature subset drawn from the full feature list.
features_2_use = 29

# Enumerate every subset of that size as a plain list of column names.
comb_features = combinations(indep_master, features_2_use)
indep_all_combo = [list(feature_subset) for feature_subset in comb_features]

print(f"Total features to use: {features_2_use}")
print(f"Total combo possible : {len(indep_all_combo)}")

In [None]:
# Evaluate 20 randomly sampled feature subsets.
get_indep_combination(
    indep_all_combo=indep_all_combo,
    total_combinations_to_try=20,
)

In [None]:
# Select today's best feature subset from the log written by get_indep_combination.
# NOTE(review): this used to read "cat_best_indep_combo.csv", but this notebook
# logs to "tab_best_indep_combo.csv"; switched to the file written here --
# revert if reusing the other model's features was intentional.
best_indep = pd.read_csv("../indep_combo/tab_best_indep_combo.csv")

# Timestamps are written day-first ('%d-%m-%Y %H:%M:%S'); parse with the explicit
# format so ambiguous dates such as 05-03 are not read month-first.
best_indep['Date'] = (pd.to_datetime(best_indep.Date, format='%d-%m-%Y %H:%M:%S')
                      .dt.strftime('%Y-%m-%d'))

# The log uses the Asia/Kolkata clock, so "today" must use the same zone.
today_date = datetime.now(timeZ_Kl).strftime('%Y-%m-%d')

condition1 = (best_indep.Date==today_date)
best_indep = best_indep[condition1].reset_index(drop=True)

if best_indep.empty:
    raise ValueError(f"No feature combinations were logged on {today_date}; "
                     "run get_indep_combination first.")

condition2 = (best_indep.rmse == best_indep.rmse.min())
# NOTE(review): eval() executes whatever text is stored in the CSV; this is only
# safe while the file is trusted. Consider ast.literal_eval for plain lists.
indep = eval(best_indep[condition2].indep.values[0])

print(f"Best RMSE : {best_indep.rmse.min()}")
print("Best indep size", len(indep))
# indep[-1]


### local

In [None]:
# Hard-coded model settings for this run; this rebinds `tab_params`,
# replacing whatever was loaded from the Optuna log earlier.
tab_params = dict(
    n_d=32,
    n_a=32,
    # optimizer_params={'lr': 0.02},
    verbose=1,
    seed=42,
)

tab_fit_params = dict(
    batch_size=64,
    patience=20,
    max_epochs=1000,
    num_workers=8,
    eval_metric=['rmse'],
)

# Cross-validate with the current `indep` feature set.
fold_iterations, fold_results, tab_models_fold = train_tab_model(
    train_df, tab_params, tab_fit_params
)

avg_iteration = int(np.mean(fold_iterations))
print("Fold iterations:", fold_iterations)
print("Average iteration:", avg_iteration)
print("Fold results:", fold_results)
print("Avg.Fold results:", np.mean(fold_results))