In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, GroupKFold
from catboost import CatBoostRegressor, Pool, cv
from datetime import datetime, timedelta

from sklearn.preprocessing import MinMaxScaler

import gc
import optuna
from utils_testing import optuna_logging
from itertools import combinations
from termcolor import colored

import os
import random
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Adam, AdamW
from transformers import get_linear_schedule_with_warmup

import pytz
# Timezone handle for UTC.
UTC = pytz.utc  

# Timezone handle for 'Asia/Kolkata'. Neither constant is referenced by the
# cells visible in this notebook export.
timeZ_Kl = pytz.timezone('Asia/Kolkata')

In [None]:
def set_random_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED``, and switches cuDNN into its
    reproducible configuration.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects interpreters/sub-processes started after this point; the
    # hash seed of the running kernel is fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    # Autotuning may pick a different convolution algorithm on each run, so it
    # has to be disabled alongside `deterministic` for reproducible results.
    torch.backends.cudnn.benchmark = False

In [None]:
# Load the interim train / test frames.
# NOTE(review): unpickling executes arbitrary code - only load trusted files.
train_df, test_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("../data/train_df_interim.pickle",
                        "../data/test_df_interim.pickle")
)

# Cell output: the shape of each frame.
(train_df.shape, test_df.shape)

In [None]:
# Swap every -999 for 0 in both frames. The replacement is applied to all
# columns, not only the features.
# NOTE(review): -999 looks like a missing-value sentinel - confirm upstream.
train_df, test_df = (frame.replace({-999: 0}) for frame in (train_df, test_df))

In [None]:
# Columns that are never used as model inputs.
drop = ['SURV_DTE', 'sand_target_avg', 'fold']

# Regression target.
target = 'PCT_DESAT_TO_ORIG'

# Feature columns: everything in train_df except the dropped columns and the target.
indep = train_df.columns.difference([*drop, target])
# Untouched copy of the original feature list, kept for later reference.
indep_master = indep.copy()
indep

In [None]:
# Fit the min-max scaler on the training features only, then apply the same
# fitted transform to both the train and the test features.
scaler = MinMaxScaler().fit(train_df[indep])

train_df[indep] = scaler.transform(train_df[indep])
test_df[indep] = scaler.transform(test_df[indep])

In [None]:
class Regression_NN(nn.Module):
    """Feed-forward regressor: Linear(input_dim, 128) -> ReLU -> Linear(128, 1).

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Attribute names are part of the saved state_dict keys and the
        # creation order decides the seeded initialisation - keep both stable.
        self.input_layer = nn.Linear(in_features=input_dim, out_features=128)
        self.output = nn.Linear(in_features=128, out_features=1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return one raw (unbounded) prediction per input row, shape (..., 1)."""
        hidden = self.relu(self.input_layer(x))
        return self.output(hidden)

In [None]:
class get_input:
    """Map-style dataset yielding ``{'input_X', 'target'}`` float tensors.

    Args:
        input_X: Indexable collection of feature rows.
        target: Indexable collection of targets; only read when
            ``data_type == 'train'``.
        data_type: ``'train'`` to return real targets; any other value makes
            every item carry a constant placeholder target of 1.
    """

    def __init__(self, input_X, target, data_type):
        self.input_X, self.target, self.data_type = input_X, target, data_type

    def __len__(self):
        return len(self.input_X)

    def __getitem__(self, idx):
        # Placeholder target keeps the batch structure identical when no
        # ground truth is available.
        label = self.target[idx] if self.data_type == 'train' else 1
        features = torch.tensor(self.input_X[idx], dtype=torch.float)
        return {'input_X': features,
                'target': torch.tensor(label, dtype=torch.float)}
        

In [None]:
# Mean-squared-error loss shared by the training and evaluation loops.
criterion = nn.MSELoss()

def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    This is the plain RMSE (square root of scikit-learn's
    ``mean_squared_error``), not a percentage error.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


In [None]:
def get_data_loader(df_input, batch_size, shuffle):
    """Wrap ``df_input`` in a ``DataLoader`` with the given batch size and shuffle flag."""
    return DataLoader(dataset=df_input, batch_size=batch_size, shuffle=shuffle)

In [None]:
def train_fn(train_data_loader
             , valid_data_loader
             , model
             , optimizer
             , epoch
             , best_score
             , best_epoch
             , best_step
             , current_fold
             , scheduler=None):
    """Run one training epoch with periodic validation and checkpointing.

    Every ``params_dict['eval_every_step']`` steps (including step 0) the model
    is evaluated on ``valid_data_loader``; predictions are clipped to [0, 1]
    before the validation RMSE is computed. Whenever that RMSE beats
    ``best_score`` the model's ``state_dict`` is saved to
    ``f"{params_dict['model_dir']}_regression_nn_fold_{current_fold}.bin"``.

    Relies on the notebook-level globals ``params_dict``, ``criterion``,
    ``eval_fn`` and ``rmse``.

    Args:
        train_data_loader: Loader yielding dicts with 'input_X' and 'target'.
        valid_data_loader: Loader used for the periodic evaluation.
        model: Network to train; called as ``model(x=...)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Current (1-based) epoch number, used for logging/bookkeeping.
        best_score: Best validation RMSE seen so far (lower is better).
        best_epoch: Epoch at which ``best_score`` was obtained.
        best_step: Step at which ``best_score`` was obtained.
        current_fold: Fold identifier used in the checkpoint file name.
        scheduler: Optional LR scheduler; when given it is stepped after every
            optimizer step. ``None`` (the default) leaves the LR untouched.

    Returns:
        Tuple ``(avg_train_loss, best_score, best_epoch, best_step)`` where
        ``avg_train_loss`` is the square root of the mean per-batch loss.
    """
    model.train()
    train_loss = 0
    eval_at_every = params_dict['eval_every_step']

    gc.collect()
    torch.cuda.empty_cache()

    # Kept as globals (as in the original cell) so the most recent validation
    # targets / predictions remain inspectable from other cells.
    global actual, predicted_output

    for current_step, data in enumerate(train_data_loader):

        optimizer.zero_grad()

        batch_input_X = data['input_X'].to(params_dict['device'], dtype=torch.float)
        batch_target = data['target'].to(params_dict['device'], dtype=torch.float)

        batch_prediction = model(x=batch_input_X).flatten()

        # `.sum()` is a no-op for a mean-reduced criterion but keeps the loss a
        # scalar should the criterion ever return per-element values.
        batch_train_loss = criterion(batch_prediction, batch_target).sum()
        batch_train_loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        # Accumulate a detached value: adding the live loss tensor would keep
        # every batch's autograd graph referenced for the whole epoch.
        train_loss += batch_train_loss.detach()

        # Per-step gc.collect()/empty_cache() calls were dropped: nothing is
        # retained across steps any more and they only slowed the loop down.
        del batch_prediction, batch_train_loss

        if current_step % eval_at_every == 0:
            # Running train loss up to and including this step
            interim_avg_train_loss = torch.sqrt(train_loss / (current_step + 1))

            # Get the Eval results
            eval_loss, actual, predicted_output = eval_fn(data_loader=valid_data_loader,
                                                          model=model)
            # eval_fn leaves the model in eval mode; switch back before the
            # next training step.
            model.train()

            # Get the actual and predicted
            actual = actual.detach().cpu().numpy()
            predicted_output = predicted_output.detach().cpu().numpy()

            # Predictions are clipped to [0, 1] before scoring
            predicted_output = np.clip(predicted_output, 0, 1)

            # Calculate Eval RMSE
            eval_rmse = rmse(y_true=actual, y_pred=predicted_output)

            if params_dict['verbose']:
                print(f"Epoch:{epoch}/{params_dict['epoch']} Step: {current_step}/{len(train_data_loader)}, Train_loss: {interim_avg_train_loss :0.4f}, Eval_loss:{eval_loss:0.4f}, Eval RMSE:{eval_rmse:0.4f}")

            if eval_rmse < best_score:
                if params_dict['verbose']:
                    print(colored(f"Eval RMSE improved from {best_score:0.4f} to {eval_rmse:0.4f}", 'green'))
                best_score = eval_rmse
                best_epoch = epoch
                best_step = current_step

                # Saving the model
                model_name = f"{params_dict['model_dir']}_regression_nn_fold_{current_fold}.bin"

                if params_dict['verbose']:
                    print(f"Saving the model {model_name}")
                torch.save(model.state_dict(), model_name)
            else:
                if params_dict['verbose']:
                    print(f"Eval RMSE did not improve from the {best_score:0.4f} from epoch:{best_epoch} step:{best_step}")
            if params_dict['verbose']:
                print("")

    avg_train_loss = torch.sqrt(train_loss / len(train_data_loader))

    # Deleting the intermediate variables
    del train_loss
    gc.collect()
    torch.cuda.empty_cache()

    return avg_train_loss, best_score, best_epoch, best_step

In [None]:
def eval_fn(data_loader, model):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Relies on the notebook-level globals ``params_dict`` (for the device) and
    ``criterion``. The model is switched to eval mode and left there; callers
    that continue training must call ``model.train()`` again.

    Args:
        data_loader: Loader yielding dicts with 'input_X' and 'target'.
        model: Network to evaluate; called as ``model(x=...)``.

    Returns:
        Tuple ``(avg_eval_loss, actual, predicted_output)``:
        ``avg_eval_loss`` is the square root of the sample-weighted mean of the
        per-batch losses (the exact RMSE over the loader for the mean-reduced
        MSE criterion), ``actual`` and ``predicted_output`` are 1-D tensors of
        all targets / raw predictions in loader order, on
        ``params_dict['device']``.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.eval()
    device = params_dict['device']

    # Collect per-batch tensors and concatenate once at the end; stacking onto
    # a growing tensor inside the loop re-copies all earlier batches each time.
    target_chunks = []
    prediction_chunks = []

    weighted_loss = 0
    n_samples = 0
    with torch.no_grad():
        for data in data_loader:
            batch_input_X = data['input_X'].to(device, dtype=torch.float)
            batch_target = data['target'].to(device, dtype=torch.float)

            batch_prediction = model(x=batch_input_X).flatten()

            # Weight each batch mean by its size so a smaller final batch does
            # not distort the overall average.
            batch_size = batch_target.numel()
            batch_eval_loss = criterion(batch_prediction, batch_target)
            weighted_loss += batch_eval_loss.sum() * batch_size
            n_samples += batch_size

            target_chunks.append(batch_target)
            prediction_chunks.append(batch_prediction)

    if n_samples == 0:
        raise ValueError("eval_fn received an empty data_loader")

    avg_eval_loss = torch.sqrt(weighted_loss / n_samples)
    actual = torch.cat(target_chunks)
    predicted_output = torch.cat(prediction_chunks)

    return avg_eval_loss, actual, predicted_output

In [None]:
def train_engine(train_data_loader, eval_data_loader, current_fold):
    """Train a fresh ``Regression_NN`` for one fold with early stopping.

    Uses the notebook-level globals ``params_dict`` and ``indep``. The seed is
    reset to 100 before the model is built. Training runs for at most
    ``params_dict['epoch']`` epochs and stops once more than
    ``params_dict['early_stopping']`` epochs have passed since the best one.
    Checkpointing itself happens inside ``train_fn``.

    Returns:
        Tuple ``(best_score, best_epoch, best_step)`` for this fold.
    """
    gc.collect()
    torch.cuda.empty_cache()

    set_random_seed(seed=100)
    model = Regression_NN(input_dim=len(indep))
    model.to(params_dict['device'])

    # Plain AdamW with a constant learning rate; no scheduler is attached.
    optimizer = AdamW(model.parameters(), lr=params_dict['learning_rate'])

    # Large sentinel so the first evaluation always counts as an improvement.
    best_score, best_epoch, best_step = 100000, 0, 0

    separator = "--------------------------------------------------------------------------------"
    last_epoch = params_dict['epoch']
    for epoch in range(1, last_epoch + 1):
        _, best_score, best_epoch, best_step = train_fn(
            train_data_loader=train_data_loader,
            valid_data_loader=eval_data_loader,
            model=model,
            optimizer=optimizer,
            epoch=epoch,
            best_score=best_score,
            best_epoch=best_epoch,
            best_step=best_step,
            current_fold=current_fold,
            scheduler=None,
        )

        if params_dict['verbose']:
            epoch_banner = (
                separator,
                f"----------------------Epoch: {epoch} over----------------------",
                separator,
                "",
            )
            for banner_line in epoch_banner:
                print(banner_line)

        epochs_without_improvement = epoch - best_epoch
        if epochs_without_improvement > params_dict['early_stopping']:
            print("The Early Stopping threshold is reached so stop training")
            break

    # Release the model and clear the CUDA cache before the next fold
    del model
    gc.collect()
    torch.cuda.empty_cache()

    return best_score, best_epoch, best_step

In [None]:
# Run configuration, read as a global by train_fn, eval_fn and train_engine.
params_dict = {
    'train_batch_size': 16,
    'valid_batch_size': 1024,
    'learning_rate': 1e-4,
    # 'EPS': 3e-8,
    'weight_dict': 0,          # not read by the cells shown here
    'opt': 'ADAMW',            # MADGRAD, ADAM, ADAMW
    'scheduler': False,
    'epoch': 1000,
    'eval_every_step': 100,
    'early_stopping': 10,
    'verbose': True,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'model_dir': "../output_models/new_",
}

# Cross-validated training: each fold id in turn is held out for validation.
for fold_i in range(train_df.fold.max() + 1):
    train_fold = train_df.loc[train_df.fold != fold_i].copy().reset_index(drop=True)
    valid_fold = train_df.loc[train_df.fold == fold_i].copy().reset_index(drop=True)

    # Datasets - the held-out fold also uses data_type='train' so that its
    # real targets are returned for scoring.
    train_local_input = get_input(train_fold[indep].values,
                                  train_fold[target].values,
                                  'train')
    test_local_input = get_input(valid_fold[indep].values,
                                 valid_fold[target].values,
                                 'train')

    # Loaders - only the training batches are shuffled.
    train_local_data_loader = get_data_loader(train_local_input,
                                              params_dict['train_batch_size'],
                                              shuffle=True)
    test_local_data_loader = get_data_loader(test_local_input,
                                             params_dict['valid_batch_size'],
                                             shuffle=False)

    best_score, best_epoch, best_step = train_engine(train_local_data_loader,
                                                     test_local_data_loader,
                                                     fold_i)

    print(f"Current fold: {fold_i}, best epoch:{best_epoch} step:{best_step}, RMSE {best_score}")
    print("###############################################################")
    print("")

# Fold Ensemble Prediction
### Local: out-of-fold validation RMSE of each fold model

In [None]:
# Score each saved fold model on its own held-out fold (out-of-fold RMSE).
fold_iterations = []   # unused here; kept so the cell defines the same names as before
fold_results = []
tab_models_fold = {}   # unused here; kept so the cell defines the same names as before

# Same fold count as the training loop instead of a hard-coded 5.
n_folds = train_df.fold.max() + 1

model = Regression_NN(input_dim=len(indep))
model.eval()  # inference only
for fold_i in range(n_folds):
    model_path = f"../output_models/new__regression_nn_fold_{fold_i}.bin"
    print(f"{fold_i} : {model_path}")
    # map_location lets weights saved from a GPU run load on a CPU-only machine.
    model.load_state_dict(torch.load(model_path, map_location='cpu'))

    valid_fold = train_df[train_df.fold == fold_i].copy().reset_index(drop=True)

    # No gradients are needed for scoring, so skip building the autograd graph.
    with torch.no_grad():
        prediction = model(torch.tensor(valid_fold[indep].values, dtype=torch.float))
    # Same [0, 1] clipping that is applied during training-time validation.
    pred = np.clip(prediction.numpy().reshape(-1), 0, 1)

    fold_rmse = np.round(rmse(y_true=valid_fold[target], y_pred=pred), 4)
    fold_results.append(fold_rmse)

print("Fold results:", fold_results)
print("Avg.Fold results:", np.mean(fold_results))

# Fold Ensemble Prediction
### Prod: test-set predictions averaged over the fold models

In [None]:
# Predict the test set with every fold model and average the predictions.
fold_iterations = []   # unused here; kept so the cell defines the same names as before
fold_results = []
tab_models_fold = {}   # unused here; kept so the cell defines the same names as before

# Same fold count as the training loop instead of a hard-coded 5.
n_folds = train_df.fold.max() + 1

# The test features do not change between folds, so build the tensor once.
test_features = torch.tensor(test_df[indep].values, dtype=torch.float)

model = Regression_NN(input_dim=len(indep))
model.eval()  # inference only
for fold_i in range(n_folds):
    model_path = f"../output_models/new__regression_nn_fold_{fold_i}.bin"
    print(f"{fold_i} : {model_path}")
    # map_location lets weights saved from a GPU run load on a CPU-only machine.
    model.load_state_dict(torch.load(model_path, map_location='cpu'))

    # No gradients are needed for prediction, so skip building the autograd graph.
    with torch.no_grad():
        prediction = model(test_features)
    # Same [0, 1] clipping that is applied during training-time validation.
    pred = np.clip(prediction.numpy().reshape(-1), 0, 1)

    fold_results.append(pred)

# Ensemble = element-wise mean of the per-fold predictions.
NN_prediction = np.array(fold_results).mean(axis=0)

print("Fold results:", fold_results)

In [None]:
# Single-column submission frame holding the ensemble prediction.
NN_submission = pd.Series(NN_prediction, name='PCT_DESAT_TO_ORIG').to_frame()
# Write without the index so the file contains only the prediction column.
NN_submission.to_csv("../sub/NN_sub_6.csv", index=False)
NN_submission