## 1. Importing Libraries

In [55]:
import shutil
import apiquery
import pandas as pd
import sys
import seaborn as sns
import os
import numpy as np
import random
import torch
import pickle
import gc
DATA_PATH = '../01.Data'
shutil.copy("apiquery_pyc.py", "apiquery.pyc")

module_path = "../src"
if module_path not in sys.path:
    sys.path.append(module_path)
    
from utils.training import *
from utils.encoding import *
from utils.utils import *
from utils.fetch import *
from dataset.dataset import BNPParibasText
from models.models import Roberta_Model
from utils.EarlyStopping import EarlyStopping
from utils.LoopFunctions import train_fn,valid_fn
from utils.prediction import get_prediction,get_embedding
pd.options.display.max_rows = 999
pd.options.display.max_columns = 100
import math
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import math
import time
import lightgbm as lgbm
import matplotlib.pyplot as plt
import torch.nn as nn
import config_ad
import transformers

In [3]:
def calc_oof(df,config):
    df.loc[:,'oof'] = -1
    for fold in np.sort(df.fold.unique()):
        print(f'Predicting Model: {fold}')
        valid       = df[df['fold']==fold]
        valid_index = valid.index.to_list()
        valid = valid.reset_index(drop=True)
        # Defining DataSet
        col_unique = generate_col_unique(valid,config.COLUMNS_ENCODE)
        tokenizer = transformers.RobertaTokenizer.from_pretrained(config.PRETRAINED)
        valid_dataset = BNPParibasText(valid,config.MAX_LENGTH,tokenizer,col_unique)
        valid_loader = torch.utils.data.DataLoader(
            valid_dataset,
            batch_size  = config.BATCH_SIZE,
            num_workers = config.NUM_WORKERS,
            shuffle     = False,
            pin_memory  = True,
        )

       # Defining Device
        model = Roberta_Model(pretrained_model=config.PRETRAINED,dropout = config.DROPOUT)
        model.load_state_dict(torch.load(f'../03.Models/BNP_PARIBAS_ROBERTA_FOLD_{fold}'))
        model.to(config.DEVICE)
        preds = get_prediction(valid_loader, model,config.DEVICE)
        df.loc[valid_index,'oof'] = preds
    oof_score = np.sqrt(mean_squared_error(df['target'],df['oof']))
    print('OOF_SCORE (RMSE): ',oof_score)
    return oof_score


# Calculating predictions for test
def calculate_test(test,config):
    col_unique = generate_col_unique(test,config.COLUMNS_ENCODE)   
    tokenizer     = transformers.RobertaTokenizer.from_pretrained(config.PRETRAINED)
    test_dataset = BNPParibasText(test,config.MAX_LENGTH,tokenizer,col_unique)
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size  = config.BATCH_SIZE,
        pin_memory  = True,
        num_workers = config.NUM_WORKERS
    )
    preds = 0
    for fold in range(0,5):
        model = Roberta_Model(pretrained_model=config.PRETRAINED,dropout = config.DROPOUT)
        model.load_state_dict(torch.load(f'../03.Models/BNP_PARIBAS_ROBERTA_FOLD_{fold}'))
        model.to(config.DEVICE)
        preds = preds + get_prediction(test_loader, model,config.DEVICE)
    test['preds'] = preds/5


In [2]:
%%time
df_train     = pd.read_csv(os.path.join("../01.Data",'fold.csv'))
y_submission = pd.read_csv(os.path.join(DATA_PATH,'y_test_submission_example.tsv'), index_col='Index', encoding='utf-8', sep='\t')

CPU times: user 1.73 s, sys: 234 ms, total: 1.97 s
Wall time: 2.31 s


## 2. Fine Tuning

In [4]:
def run(data,fold,output_path,config,run=None):
    print(f'******************** Model Fold {fold}  *****************')
    seed_everything(seed=config.SEED)
    train = data[data['fold']!=fold].reset_index(drop=True)
    valid = data[data['fold']==fold].reset_index(drop=True)
    col_unique = generate_col_unique(train,config.COLUMNS_ENCODE)
    col_unique = generate_col_unique(valid,config.COLUMNS_ENCODE)
    
    
    print('Train: ',train.shape[0], 'Valid: ',valid.shape[0])
    # Defining DataSet
    tokenizer     = transformers.RobertaTokenizer.from_pretrained(config.PRETRAINED)
    train_dataset = BNPParibasText(train,config.MAX_LENGTH,tokenizer,col_unique)
    valid_dataset = BNPParibasText(valid,config.MAX_LENGTH,tokenizer,col_unique)
        
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size  = config.BATCH_SIZE,
        pin_memory  = True,
        num_workers = config.NUM_WORKERS
    )

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size  = config.BATCH_SIZE,
        num_workers = config.NUM_WORKERS,
        shuffle     = False,
        pin_memory  = True,
    )
    
    # Defining Device
    model = Roberta_Model(pretrained_model=config.PRETRAINED,dropout = config.DROPOUT)
    model.to(config.DEVICE)
    criterion = nn.MSELoss()
    criterion.to(config.DEVICE)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": config.WEIGHT_DECAY,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int((len(train) / config.BATCH_SIZE )* config.EPOCHS)
    config.SCHEDULER_PARAMETERS['NUM_TRAIN_STEPS'] = num_train_steps
    print(f'num_train_steps: {num_train_steps}')
    optimizer = fetch_optimizer(config.OPTIMIZER_NAME,config.LEARNING_RATE,optimizer_parameters)
    scheduler = fetch_scheduler(config.SCHEDULER_NAME,optimizer,config.SCHEDULER_PARAMETERS)
    
    es = EarlyStopping (patience = config.EARLY_STOPPING, mode = config.MODE,delta=0)
      
    for epoch in range(config.EPOCHS):
        print('Epoch {}, lr {}'.format(epoch, optimizer.param_groups[0]['lr']))        
        training_loss = train_fn(train_loader,model,criterion,optimizer,config.DEVICE,scheduler,mode_sched = config.MODE_SCHEDULER)
        valid_loss    = valid_fn(valid_loader,model,criterion,config.DEVICE)
        if run:
            run.log({'training_loss':training_loss,'valid_loss':valid_loss})
            
        es(valid_loss, model,output_path)
        
        if es.early_stop:
            print('Meet early stopping')
            return es.get_best_val_score()
        gc.collect()
        torch.cuda.empty_cache()
    print("Didn't meet early stopping")
    return es.get_best_val_score()

In [5]:
for i in range(0,5):
    output_path = f'../03.Models/Roberta-Pretrained/BNP_PARIBAS_ROBERTA_ADDITIVE_FOLD_{i}'
    run(df_train,i,output_path,config_ad)

******************** Model Fold 0  *****************
Train:  81622 Valid:  20406


  0%|          | 0/1276 [00:00<?, ?it/s]

num_train_steps: 15304
Epoch 0, lr 0.0
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=1.67e-5, Train_Loss=68]  
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 68.00697775096356


100%|██████████| 319/319 [00:28<00:00, 11.16it/s, Eval_Loss=33.4]


Validation -> Loss: 33.38989837789984
Validation score improved (inf --> 33.38989837789984). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 1, lr 1.6679738562091504e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=1.85e-5, Train_Loss=29.7]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 29.745953457482557


100%|██████████| 319/319 [00:28<00:00, 11.18it/s, Eval_Loss=26.3]


Validation -> Loss: 26.25095771323177
Validation score improved (33.38989837789984 --> 26.25095771323177). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 2, lr 1.8516044721939887e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:01<00:00,  4.24it/s, LR=1.67e-5, Train_Loss=24.5]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 24.54152354700812


100%|██████████| 319/319 [00:29<00:00, 10.93it/s, Eval_Loss=24.9]


Validation -> Loss: 24.906518236596757
Validation score improved (26.25095771323177 --> 24.906518236596757). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 3, lr 1.6663278640917674e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:01<00:00,  4.24it/s, LR=1.48e-5, Train_Loss=22]  
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 22.031593575372966


100%|██████████| 319/319 [00:29<00:00, 10.92it/s, Eval_Loss=23.8]


Validation -> Loss: 23.76212262957821
Validation score improved (24.906518236596757 --> 23.76212262957821). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 4, lr 1.4810512559895456e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:01<00:00,  4.23it/s, LR=1.3e-5, Train_Loss=20.2] 
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 20.201210182662294


100%|██████████| 319/319 [00:28<00:00, 11.06it/s, Eval_Loss=23.5]


Validation -> Loss: 23.4995545324487
Validation score improved (23.76212262957821 --> 23.4995545324487). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 5, lr 1.2957746478873242e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:00<00:00,  4.24it/s, LR=1.11e-5, Train_Loss=18.8]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 18.815732654359273


100%|██████████| 319/319 [00:29<00:00, 10.89it/s, Eval_Loss=23.3]


Validation -> Loss: 23.285865714931187
Validation score improved (23.4995545324487 --> 23.285865714931187). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 6, lr 1.1104980397851025e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:01<00:00,  4.23it/s, LR=9.25e-6, Train_Loss=17.8]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 17.805843653350042


100%|██████████| 319/319 [00:29<00:00, 10.89it/s, Eval_Loss=23.8]
  0%|          | 0/1276 [00:00<?, ?it/s]

Validation -> Loss: 23.78731407117694
EarlyStopping counter: 1 out of 5
Epoch 7, lr 9.252214316828807e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:03<00:00,  4.21it/s, LR=7.4e-6, Train_Loss=16.9] 
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 16.927461096672427


100%|██████████| 319/319 [00:30<00:00, 10.59it/s, Eval_Loss=23.2]


Validation -> Loss: 23.22477491372805
Validation score improved (23.285865714931187 --> 23.22477491372805). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 8, lr 7.399448235806593e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:01<00:00,  4.23it/s, LR=5.55e-6, Train_Loss=16.3]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 16.273143233924078


100%|██████████| 319/319 [00:29<00:00, 10.80it/s, Eval_Loss=23.4]
  0%|          | 0/1276 [00:00<?, ?it/s]

Validation -> Loss: 23.42048592403017
EarlyStopping counter: 1 out of 5
Epoch 9, lr 5.546682154784376e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=3.69e-6, Train_Loss=15.7]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 15.700256390818234


100%|██████████| 319/319 [00:28<00:00, 11.06it/s, Eval_Loss=22.9]


Validation -> Loss: 22.90965392372825
Validation score improved (23.22477491372805 --> 22.90965392372825). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 10, lr 3.693916073762161e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=1.84e-6, Train_Loss=15.3]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 15.27596096484265


100%|██████████| 319/319 [00:30<00:00, 10.43it/s, Eval_Loss=22.7]


Validation -> Loss: 22.66348245076625
Validation score improved (22.90965392372825 --> 22.66348245076625). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 11, lr 1.841149992739945e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:01<00:00,  4.24it/s, LR=0, Train_Loss=14.9]      
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 14.949389297199847


100%|██████████| 319/319 [00:29<00:00, 10.99it/s, Eval_Loss=22.6]


Validation -> Loss: 22.566729470853897
Validation score improved (22.66348245076625 --> 22.566729470853897). Saving model!
Didn't meet early stopping
******************** Model Fold 1  *****************
Train:  81622 Valid:  20406


  0%|          | 0/1276 [00:00<?, ?it/s]

num_train_steps: 15304
Epoch 0, lr 0.0
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=1.67e-5, Train_Loss=68.4]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 68.44055353063027


100%|██████████| 319/319 [00:30<00:00, 10.45it/s, Eval_Loss=32.9]


Validation -> Loss: 32.91006547455503
Validation score improved (inf --> 32.91006547455503). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 1, lr 1.6679738562091504e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:05<00:00,  4.17it/s, LR=1.85e-5, Train_Loss=29.9]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 29.85158812588659


100%|██████████| 319/319 [00:30<00:00, 10.46it/s, Eval_Loss=26.3]


Validation -> Loss: 26.285617200558463
Validation score improved (32.91006547455503 --> 26.285617200558463). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 2, lr 1.8516044721939887e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:04<00:00,  4.19it/s, LR=1.67e-5, Train_Loss=24.6]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 24.64155409926531


100%|██████████| 319/319 [00:30<00:00, 10.35it/s, Eval_Loss=25]  


Validation -> Loss: 24.99150720733834
Validation score improved (26.285617200558463 --> 24.99150720733834). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 3, lr 1.6663278640917674e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:03<00:00,  4.20it/s, LR=1.48e-5, Train_Loss=22]  
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 21.97710532902924


100%|██████████| 319/319 [00:30<00:00, 10.43it/s, Eval_Loss=24.3]


Validation -> Loss: 24.26881038881021
Validation score improved (24.99150720733834 --> 24.26881038881021). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 4, lr 1.4810512559895456e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:04<00:00,  4.19it/s, LR=1.3e-5, Train_Loss=20.2] 
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 20.151521967122548


100%|██████████| 319/319 [00:30<00:00, 10.48it/s, Eval_Loss=23.6]


Validation -> Loss: 23.557359581830735
Validation score improved (24.26881038881021 --> 23.557359581830735). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 5, lr 1.2957746478873242e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:05<00:00,  4.18it/s, LR=1.11e-5, Train_Loss=18.9]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 18.85803728193325


100%|██████████| 319/319 [00:31<00:00, 10.20it/s, Eval_Loss=23.3]


Validation -> Loss: 23.264226276672748
Validation score improved (23.557359581830735 --> 23.264226276672748). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 6, lr 1.1104980397851025e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:06<00:00,  4.17it/s, LR=9.25e-6, Train_Loss=17.7]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 17.70229396270734


100%|██████████| 319/319 [00:30<00:00, 10.51it/s, Eval_Loss=23.2]


Validation -> Loss: 23.204936099276647
Validation score improved (23.264226276672748 --> 23.204936099276647). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 7, lr 9.252214316828807e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:05<00:00,  4.17it/s, LR=7.4e-6, Train_Loss=16.8] 
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 16.846363781948448


100%|██████████| 319/319 [00:30<00:00, 10.41it/s, Eval_Loss=23.2]


Validation -> Loss: 23.189819389749843
Validation score improved (23.204936099276647 --> 23.189819389749843). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 8, lr 7.399448235806593e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:05<00:00,  4.17it/s, LR=5.55e-6, Train_Loss=16.2]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 16.164178113485203


100%|██████████| 319/319 [00:30<00:00, 10.58it/s, Eval_Loss=22.8]


Validation -> Loss: 22.766169867919157
Validation score improved (23.189819389749843 --> 22.766169867919157). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 9, lr 5.546682154784376e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:07<00:00,  4.15it/s, LR=3.69e-6, Train_Loss=15.7]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 15.656845564191991


100%|██████████| 319/319 [00:30<00:00, 10.58it/s, Eval_Loss=23]  
  0%|          | 0/1276 [00:00<?, ?it/s]

Validation -> Loss: 23.022405235744944
EarlyStopping counter: 1 out of 5
Epoch 10, lr 3.693916073762161e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:04<00:00,  4.19it/s, LR=1.84e-6, Train_Loss=15.2]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 15.197378664928545


100%|██████████| 319/319 [00:30<00:00, 10.49it/s, Eval_Loss=22.7]


Validation -> Loss: 22.667208247050223
Validation score improved (22.766169867919157 --> 22.667208247050223). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 11, lr 1.841149992739945e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:05<00:00,  4.18it/s, LR=0, Train_Loss=14.9]      
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 14.894868367050883


100%|██████████| 319/319 [00:30<00:00, 10.37it/s, Eval_Loss=22.6]


Validation -> Loss: 22.633750287716666
Validation score improved (22.667208247050223 --> 22.633750287716666). Saving model!
Didn't meet early stopping
******************** Model Fold 2  *****************
Train:  81622 Valid:  20406


  0%|          | 0/1276 [00:00<?, ?it/s]

num_train_steps: 15304
Epoch 0, lr 0.0
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:04<00:00,  4.19it/s, LR=1.67e-5, Train_Loss=67.4]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 67.44226939551135


100%|██████████| 319/319 [00:30<00:00, 10.35it/s, Eval_Loss=32.4]


Validation -> Loss: 32.43990907101048
Validation score improved (inf --> 32.43990907101048). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 1, lr 1.6679738562091504e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:05<00:00,  4.17it/s, LR=1.85e-5, Train_Loss=29.9]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 29.85000348165865


100%|██████████| 319/319 [00:29<00:00, 10.76it/s, Eval_Loss=25.6]


Validation -> Loss: 25.602959211343506
Validation score improved (32.43990907101048 --> 25.602959211343506). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 2, lr 1.8516044721939887e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:05<00:00,  4.18it/s, LR=1.67e-5, Train_Loss=24.7]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 24.683336984027516


100%|██████████| 319/319 [00:29<00:00, 10.79it/s, Eval_Loss=24.2]


Validation -> Loss: 24.200891306407772
Validation score improved (25.602959211343506 --> 24.200891306407772). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 3, lr 1.6663278640917674e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:04<00:00,  4.19it/s, LR=1.48e-5, Train_Loss=22.3]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 22.317987915864187


100%|██████████| 319/319 [00:30<00:00, 10.53it/s, Eval_Loss=23.5]


Validation -> Loss: 23.53151334266304
Validation score improved (24.200891306407772 --> 23.53151334266304). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 4, lr 1.4810512559895456e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:05<00:00,  4.18it/s, LR=1.3e-5, Train_Loss=20.4] 
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 20.421523943590145


100%|██████████| 319/319 [00:30<00:00, 10.54it/s, Eval_Loss=22.8]


Validation -> Loss: 22.808058230480803
Validation score improved (23.53151334266304 --> 22.808058230480803). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 5, lr 1.2957746478873242e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:05<00:00,  4.18it/s, LR=1.11e-5, Train_Loss=19]  
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 18.97804765342545


100%|██████████| 319/319 [00:29<00:00, 10.90it/s, Eval_Loss=22.8]


Validation -> Loss: 22.796519955123852
Validation score improved (22.808058230480803 --> 22.796519955123852). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 6, lr 1.1104980397851025e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:06<00:00,  4.16it/s, LR=9.25e-6, Train_Loss=18]  
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 17.98353244817369


100%|██████████| 319/319 [00:29<00:00, 10.69it/s, Eval_Loss=23.1]
  0%|          | 0/1276 [00:00<?, ?it/s]

Validation -> Loss: 23.116281503420264
EarlyStopping counter: 1 out of 5
Epoch 7, lr 9.252214316828807e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:04<00:00,  4.19it/s, LR=7.4e-6, Train_Loss=17.2] 
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 17.197536032775353


100%|██████████| 319/319 [00:30<00:00, 10.56it/s, Eval_Loss=22.7]


Validation -> Loss: 22.684481321831107
Validation score improved (22.796519955123852 --> 22.684481321831107). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 8, lr 7.399448235806593e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:05<00:00,  4.18it/s, LR=5.55e-6, Train_Loss=16.5]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 16.46393825968605


100%|██████████| 319/319 [00:29<00:00, 10.80it/s, Eval_Loss=22.5]


Validation -> Loss: 22.50192280772338
Validation score improved (22.684481321831107 --> 22.50192280772338). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 9, lr 5.546682154784376e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:05<00:00,  4.17it/s, LR=3.69e-6, Train_Loss=15.9]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 15.890581269809818


100%|██████████| 319/319 [00:30<00:00, 10.55it/s, Eval_Loss=22.3]


Validation -> Loss: 22.311984154871638
Validation score improved (22.50192280772338 --> 22.311984154871638). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 10, lr 3.693916073762161e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:04<00:00,  4.19it/s, LR=1.84e-6, Train_Loss=15.5]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 15.456671341459579


100%|██████████| 319/319 [00:29<00:00, 10.73it/s, Eval_Loss=22]  


Validation -> Loss: 22.007583791559394
Validation score improved (22.311984154871638 --> 22.007583791559394). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 11, lr 1.841149992739945e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:05<00:00,  4.18it/s, LR=0, Train_Loss=15.1]      
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 15.122548420787979


100%|██████████| 319/319 [00:29<00:00, 10.75it/s, Eval_Loss=21.9]


Validation -> Loss: 21.93970409979267
Validation score improved (22.007583791559394 --> 21.93970409979267). Saving model!
Didn't meet early stopping
******************** Model Fold 3  *****************
Train:  81623 Valid:  20405


  0%|          | 0/1276 [00:00<?, ?it/s]

num_train_steps: 15304
Epoch 0, lr 0.0
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:03<00:00,  4.21it/s, LR=1.67e-5, Train_Loss=67.5]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 67.51813169108662


100%|██████████| 319/319 [00:30<00:00, 10.57it/s, Eval_Loss=35.1]


Validation -> Loss: 35.05412807285225
Validation score improved (inf --> 35.05412807285225). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 1, lr 1.6679738562091504e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:04<00:00,  4.19it/s, LR=1.85e-5, Train_Loss=29.7]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 29.709441410710447


100%|██████████| 319/319 [00:30<00:00, 10.43it/s, Eval_Loss=27.4]


Validation -> Loss: 27.435747852146065
Validation score improved (35.05412807285225 --> 27.435747852146065). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 2, lr 1.8516044721939887e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:05<00:00,  4.18it/s, LR=1.67e-5, Train_Loss=24.6]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 24.631230520230474


100%|██████████| 319/319 [00:29<00:00, 10.66it/s, Eval_Loss=25.4]


Validation -> Loss: 25.439675453688285
Validation score improved (27.435747852146065 --> 25.439675453688285). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 3, lr 1.6663278640917674e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:04<00:00,  4.20it/s, LR=1.48e-5, Train_Loss=21.9]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 21.937425701595775


100%|██████████| 319/319 [00:30<00:00, 10.61it/s, Eval_Loss=24.9]


Validation -> Loss: 24.933309761334364
Validation score improved (25.439675453688285 --> 24.933309761334364). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 4, lr 1.4810512559895456e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:06<00:00,  4.17it/s, LR=1.3e-5, Train_Loss=20.1] 
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 20.136313857329675


100%|██████████| 319/319 [00:30<00:00, 10.49it/s, Eval_Loss=24.3]


Validation -> Loss: 24.3086216517003
Validation score improved (24.933309761334364 --> 24.3086216517003). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 5, lr 1.2957746478873242e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:05<00:00,  4.18it/s, LR=1.11e-5, Train_Loss=18.8]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 18.821231679109196


100%|██████████| 319/319 [00:29<00:00, 10.76it/s, Eval_Loss=24.1]


Validation -> Loss: 24.08404503944899
Validation score improved (24.3086216517003 --> 24.08404503944899). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 6, lr 1.1104980397851025e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=9.25e-6, Train_Loss=17.7]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 17.677973025644835


100%|██████████| 319/319 [00:29<00:00, 10.95it/s, Eval_Loss=24]  


Validation -> Loss: 23.98093361989084
Validation score improved (24.08404503944899 --> 23.98093361989084). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 7, lr 9.252214316828807e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=7.4e-6, Train_Loss=16.9] 
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 16.867251094605855


100%|██████████| 319/319 [00:29<00:00, 10.69it/s, Eval_Loss=24.1]
  0%|          | 0/1276 [00:00<?, ?it/s]

Validation -> Loss: 24.138138687349038
EarlyStopping counter: 1 out of 5
Epoch 8, lr 7.399448235806593e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=5.55e-6, Train_Loss=16.3]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 16.29249244704142


100%|██████████| 319/319 [00:29<00:00, 10.84it/s, Eval_Loss=24.1]
  0%|          | 0/1276 [00:00<?, ?it/s]

Validation -> Loss: 24.064586513842162
EarlyStopping counter: 2 out of 5
Epoch 9, lr 5.546682154784376e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=3.69e-6, Train_Loss=15.7]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 15.650145706151346


100%|██████████| 319/319 [00:29<00:00, 10.96it/s, Eval_Loss=23.4]


Validation -> Loss: 23.404559464290223
Validation score improved (23.98093361989084 --> 23.404559464290223). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 10, lr 3.693916073762161e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=1.84e-6, Train_Loss=15.2]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 15.225294433043668


100%|██████████| 319/319 [00:29<00:00, 10.72it/s, Eval_Loss=23.3]


Validation -> Loss: 23.307876204248505
Validation score improved (23.404559464290223 --> 23.307876204248505). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 11, lr 1.841149992739945e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:01<00:00,  4.23it/s, LR=0, Train_Loss=14.9]      
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 14.942477258208404


100%|██████████| 319/319 [00:29<00:00, 10.81it/s, Eval_Loss=23.1]


Validation -> Loss: 23.130541046585034
Validation score improved (23.307876204248505 --> 23.130541046585034). Saving model!
Didn't meet early stopping
******************** Model Fold 4  *****************
Train:  81623 Valid:  20405


  0%|          | 0/1276 [00:00<?, ?it/s]

num_train_steps: 15304
Epoch 0, lr 0.0
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=1.67e-5, Train_Loss=68.5]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 68.50139272026134


100%|██████████| 319/319 [00:29<00:00, 10.96it/s, Eval_Loss=33.2]


Validation -> Loss: 33.19418364258769
Validation score improved (inf --> 33.19418364258769). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 1, lr 1.6679738562091504e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:01<00:00,  4.23it/s, LR=1.85e-5, Train_Loss=29.8]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 29.847240975657975


100%|██████████| 319/319 [00:29<00:00, 10.83it/s, Eval_Loss=26.4]


Validation -> Loss: 26.397076854885185
Validation score improved (33.19418364258769 --> 26.397076854885185). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 2, lr 1.8516044721939887e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:01<00:00,  4.23it/s, LR=1.67e-5, Train_Loss=24.7]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 24.718438025925973


100%|██████████| 319/319 [00:29<00:00, 10.65it/s, Eval_Loss=24.4]


Validation -> Loss: 24.44479826625238
Validation score improved (26.397076854885185 --> 24.44479826625238). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 3, lr 1.6663278640917674e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:01<00:00,  4.23it/s, LR=1.48e-5, Train_Loss=22.1]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 22.098102164866408


100%|██████████| 319/319 [00:28<00:00, 11.17it/s, Eval_Loss=25]  
  0%|          | 0/1276 [00:00<?, ?it/s]

Validation -> Loss: 25.013477800781825
EarlyStopping counter: 1 out of 5
Epoch 4, lr 1.4810512559895456e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=1.3e-5, Train_Loss=20.2] 
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 20.232161164283752


100%|██████████| 319/319 [00:28<00:00, 11.14it/s, Eval_Loss=24]  


Validation -> Loss: 24.03768321189761
Validation score improved (24.44479826625238 --> 24.03768321189761). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 5, lr 1.2957746478873242e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=1.11e-5, Train_Loss=18.8]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 18.840414776884277


100%|██████████| 319/319 [00:29<00:00, 10.71it/s, Eval_Loss=22.7]


Validation -> Loss: 22.68735703853978
Validation score improved (24.03768321189761 --> 22.68735703853978). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 6, lr 1.1104980397851025e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=9.25e-6, Train_Loss=17.7]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 17.72562751613067


100%|██████████| 319/319 [00:29<00:00, 10.67it/s, Eval_Loss=23.2]
  0%|          | 0/1276 [00:00<?, ?it/s]

Validation -> Loss: 23.174696569532436
EarlyStopping counter: 1 out of 5
Epoch 7, lr 9.252214316828807e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=7.4e-6, Train_Loss=16.9] 
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 16.851493262758822


100%|██████████| 319/319 [00:29<00:00, 10.78it/s, Eval_Loss=23.4]
  0%|          | 0/1276 [00:00<?, ?it/s]

Validation -> Loss: 23.44645991370222
EarlyStopping counter: 2 out of 5
Epoch 8, lr 7.399448235806593e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.21it/s, LR=5.55e-6, Train_Loss=16.2]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 16.216899175722396


100%|██████████| 319/319 [00:28<00:00, 11.10it/s, Eval_Loss=23.1]
  0%|          | 0/1276 [00:00<?, ?it/s]

Validation -> Loss: 23.102727049944168
EarlyStopping counter: 3 out of 5
Epoch 9, lr 5.546682154784376e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:01<00:00,  4.23it/s, LR=3.69e-6, Train_Loss=15.7]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 15.71641726889954


100%|██████████| 319/319 [00:29<00:00, 10.89it/s, Eval_Loss=22.5]


Validation -> Loss: 22.46399335054021
Validation score improved (22.68735703853978 --> 22.46399335054021). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 10, lr 3.693916073762161e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.21it/s, LR=1.84e-6, Train_Loss=15.3]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 15.257453742258974


100%|██████████| 319/319 [00:28<00:00, 11.13it/s, Eval_Loss=22.2]


Validation -> Loss: 22.229657197073337
Validation score improved (22.46399335054021 --> 22.229657197073337). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 11, lr 1.841149992739945e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=0, Train_Loss=14.9]      
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 14.923077658052355


100%|██████████| 319/319 [00:29<00:00, 10.75it/s, Eval_Loss=22.2]


Validation -> Loss: 22.19135571721953
Validation score improved (22.229657197073337 --> 22.19135571721953). Saving model!
Didn't meet early stopping


In [6]:
calc_oof(df_train,config_ad)

Predicting Model: 0


HBox(children=(FloatProgress(value=0.0, max=319.0), HTML(value='')))


Predicting Model: 1


HBox(children=(FloatProgress(value=0.0, max=319.0), HTML(value='')))


Predicting Model: 2


HBox(children=(FloatProgress(value=0.0, max=319.0), HTML(value='')))


Predicting Model: 3


HBox(children=(FloatProgress(value=0.0, max=319.0), HTML(value='')))


Predicting Model: 4


HBox(children=(FloatProgress(value=0.0, max=319.0), HTML(value='')))


OOF_SCORE (RMSE):  4.78286983163525


4.78286983163525

## 3.Extracting Embedding Layer

In [47]:
df_train     = pd.read_csv(os.path.join("../01.Data",'fold.csv'))
embedding_all = 0
for fold in np.sort(df_train['fold'].unique()):
    valid       = df_train[df_train['fold']==fold]
    valid_index = valid.index.to_list()
    valid = valid.reset_index(drop=True)
    col_unique = generate_col_unique(valid,config_ad.COLUMNS_ENCODE)
    tokenizer     = transformers.RobertaTokenizer.from_pretrained(config_ad.PRETRAINED)
    valid_dataset = BNPParibasText(valid,config_ad.MAX_LENGTH,tokenizer,col_unique)
    valid_loader = torch.utils.data.DataLoader(
            valid_dataset,
            batch_size  = 32,
            pin_memory  = True,
            num_workers = 72
        )
    
    
    model         = Roberta_Model(pretrained_model=config_ad.PRETRAINED)
    model.load_state_dict(torch.load(f'../03.Models/Roberta-Pretrained/BNP_PARIBAS_ROBERTA_ADDITIVE_FOLD_{fold}'))
    df_train.loc[valid_index,[f'emb_{col_unique}_{i}' for i in range(embedding_all_train.shape[1])]] = get_embedding(valid_loader, model, config_ad.DEVICE)
    del model
    torch.cuda.empty_cache()

HBox(children=(FloatProgress(value=0.0, max=638.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=638.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=638.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=638.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=638.0), HTML(value='')))




In [48]:
df_test      = pd.read_csv(os.path.join(DATA_PATH,'test_preprocessed.csv'))
df_test['target'] = -1

col_unique = generate_col_unique(df_test,config_ad.COLUMNS_ENCODE)
tokenizer     = transformers.RobertaTokenizer.from_pretrained(config_ad.PRETRAINED)
test_dataset = BNPParibasText(df_test,config_ad.MAX_LENGTH,tokenizer,col_unique)
test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size  = 32,
            pin_memory  = True,
            num_workers = 72
        )
embedding_all = 0
for fold in np.sort(df_train['fold'].unique()):
    model         = Roberta_Model(pretrained_model=config_ad.PRETRAINED)
    model.load_state_dict(torch.load(f'../03.Models/Roberta-Pretrained/BNP_PARIBAS_ROBERTA_ADDITIVE_FOLD_{fold}'))
    embedding_all += get_embedding(test_loader, model, config_ad.DEVICE)
    del model
    torch.cuda.empty_cache()
embedding_all_test = embedding_all/len(df_train['fold'].unique())
df_test[[f'emb_{col_unique}_{i}' for i in range(embedding_all_test.shape[1])]] = embedding_all_test

HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))




In [49]:
# saving dataset
df_train.to_csv(os.path.join(DATA_PATH,'train_embeddings.csv'),index = False)
df_test.to_csv(os.path.join(DATA_PATH,'test_embeddings.csv'),index = False)
del df_train,df_test

## 4. Modeling Lightgbm

In [56]:
df_train = pd.read_csv(os.path.join(DATA_PATH,'train_embeddings.csv'))
df_test  = pd.read_csv(os.path.join(DATA_PATH,'test_embeddings.csv'))
columns_modeling = ['additives_n','ingredients_from_palm_oil_n',
                    'ingredients_that_may_be_from_palm_oil_n','target',
                    'states_en_brands','states_en_categories','states_en_characteristics','states_en_expiration date',
                    'states_en_general_complete','states_en_ingredients','pnns_groups_1','pnns_groups_2',
                    'states_en_packaging','states_en_packaging-code-','states_en_photo_upload',
                    'states_en_photo_validate','states_en_product name','states_en_quantity','diff_t'] + [f'emb_{col_unique}_{i}' for i in range(embedding_all.shape[1])]
columns_label = df_train[columns_modeling].select_dtypes(include=['object']).columns.to_list()
print(columns_label)


['states_en_brands', 'states_en_categories', 'states_en_characteristics', 'states_en_expiration date', 'states_en_general_complete', 'states_en_ingredients', 'pnns_groups_1', 'pnns_groups_2', 'states_en_packaging', 'states_en_packaging-code-', 'states_en_photo_upload', 'states_en_photo_validate', 'states_en_product name', 'states_en_quantity']


In [57]:
df_train,dict_le = label_encoding(df_train,label_cols = columns_label, drop_original = True, missing_new_cat = True)
a_file = open("../03.Models/General/label_encoding.pkl", "wb")
pickle.dump(dict_le, a_file)
a_file.close()
del a_file,dict_le

Mode: Missing as new category
Label Encoding:  label_states_en_brands
Label Encoding:  label_states_en_categories
Label Encoding:  label_states_en_characteristics
Label Encoding:  label_states_en_expiration date
Label Encoding:  label_states_en_general_complete
Label Encoding:  label_states_en_ingredients
Label Encoding:  label_pnns_groups_1
Label Encoding:  label_pnns_groups_2
Label Encoding:  label_states_en_packaging
Label Encoding:  label_states_en_packaging-code-
Label Encoding:  label_states_en_photo_upload
Label Encoding:  label_states_en_photo_validate
Label Encoding:  label_states_en_product name
Label Encoding:  label_states_en_quantity


In [58]:
a_file = open("../03.Models/General/label_encoding.pkl", "rb")
dict_le = pickle.load(a_file)
df_test = apply_label_encoder(df_test,dict_le,drop_original = True, missing_new_cat = True)
del dict_le,a_file

Mode: Missing as new category
Applying Label Encoding:  label_states_en_brands
Applying Label Encoding:  label_states_en_categories
Applying Label Encoding:  label_states_en_characteristics
Applying Label Encoding:  label_states_en_expiration date
Applying Label Encoding:  label_states_en_general_complete
Applying Label Encoding:  label_states_en_ingredients
Applying Label Encoding:  label_pnns_groups_1
Applying Label Encoding:  label_pnns_groups_2
Applying Label Encoding:  label_states_en_packaging
Applying Label Encoding:  label_states_en_packaging-code-
Applying Label Encoding:  label_states_en_photo_upload
Applying Label Encoding:  label_states_en_photo_validate
Applying Label Encoding:  label_states_en_product name
Applying Label Encoding:  label_states_en_quantity


In [59]:
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'rmse'},
        'num_leaves':16,
        'learning_rate': 0.01,
        "min_child_samples": 150,
        "max_depth" : 5,
        'feature_fraction':  0.5,
        "bagging_freq": 1,
        'bagging_fraction': 0.75,
        "is_unbalance" : False,
        'force_col_wise':True,
        'num_threads':18,
        #"scale_pos_weight":5 -> Generally  is the ratio of number of negative class to the positive class.
        'bagging_seed':42,
        'lambda_l1':1.5,
        'lambda_l2':1,
        'verbose': 1

}
cat_columns = [i for i in df_train.columns.to_list() if i.startswith('label_')]
columns_modeling_last = list(set(columns_modeling)-set(columns_label)) + ['fold'] + cat_columns 
#columns_modeling_last.remove('len_countries')

In [60]:
results,models,importances,oof,feature_list = Training_Lightgbm(df_train[columns_modeling_last],params,fold_column = 'fold',target_column = 'target',cat_vars = cat_columns ,metric = 'RMSE',early_stopping = 200,max_boost_round = 8000)

Columns: ['emb_product_name_ingredients_text_brands_tags_additives__539', 'emb_product_name_ingredients_text_brands_tags_additives__385', 'ingredients_from_palm_oil_n', 'emb_product_name_ingredients_text_brands_tags_additives__165', 'emb_product_name_ingredients_text_brands_tags_additives__572', 'emb_product_name_ingredients_text_brands_tags_additives__589', 'emb_product_name_ingredients_text_brands_tags_additives__211', 'emb_product_name_ingredients_text_brands_tags_additives__601', 'emb_product_name_ingredients_text_brands_tags_additives__300', 'emb_product_name_ingredients_text_brands_tags_additives__237', 'emb_product_name_ingredients_text_brands_tags_additives__22', 'emb_product_name_ingredients_text_brands_tags_additives__73', 'emb_product_name_ingredients_text_brands_tags_additives__167', 'emb_product_name_ingredients_text_brands_tags_additives__203', 'emb_product_name_ingredients_text_brands_tags_additives__574', 'emb_product_name_ingredients_text_brands_tags_additives__112', '



[LightGBM] [Info] Start training from score 9.171473
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.63612	valid_1's rmse: 6.74194
[100]	training's rmse: 5.45671	valid_1's rmse: 5.60711
[150]	training's rmse: 4.92274	valid_1's rmse: 5.07422
[200]	training's rmse: 4.68737	valid_1's rmse: 4.83513
[250]	training's rmse: 4.58305	valid_1's rmse: 4.72692
[300]	training's rmse: 4.53311	valid_1's rmse: 4.67098
[350]	training's rmse: 4.50716	valid_1's rmse: 4.64303
[400]	training's rmse: 4.49114	valid_1's rmse: 4.62586
[450]	training's rmse: 4.48004	valid_1's rmse: 4.61482
[500]	training's rmse: 4.47084	valid_1's rmse: 4.6078
[550]	training's rmse: 4.46296	valid_1's rmse: 4.60184
[600]	training's rmse: 4.45624	valid_1's rmse: 4.59875
[650]	training's rmse: 4.44992	valid_1's rmse: 4.5959
[700]	training's rmse: 4.44415	valid_1's rmse: 4.59447
[750]	training's rmse: 4.43871	valid_1's rmse: 4.59304
[800]	training's rmse: 4.43361	valid_1's rmse: 4.59182
[850]	tr



[LightGBM] [Info] Start training from score 9.169930
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.62929	valid_1's rmse: 6.7207
[100]	training's rmse: 5.44793	valid_1's rmse: 5.61264
[150]	training's rmse: 4.91584	valid_1's rmse: 5.10965
[200]	training's rmse: 4.68206	valid_1's rmse: 4.87825
[250]	training's rmse: 4.57937	valid_1's rmse: 4.76964
[300]	training's rmse: 4.53063	valid_1's rmse: 4.71659
[350]	training's rmse: 4.50492	valid_1's rmse: 4.68899
[400]	training's rmse: 4.48906	valid_1's rmse: 4.67336
[450]	training's rmse: 4.47788	valid_1's rmse: 4.66398
[500]	training's rmse: 4.46884	valid_1's rmse: 4.65686
[550]	training's rmse: 4.46174	valid_1's rmse: 4.65207
[600]	training's rmse: 4.45465	valid_1's rmse: 4.64716
[650]	training's rmse: 4.44856	valid_1's rmse: 4.64393
[700]	training's rmse: 4.44264	valid_1's rmse: 4.64054
[750]	training's rmse: 4.43737	valid_1's rmse: 4.63934
[800]	training's rmse: 4.4321	valid_1's rmse: 4.63806
[850]	tr



[LightGBM] [Info] Start training from score 9.171253
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.64344	valid_1's rmse: 6.85049
[100]	training's rmse: 5.46751	valid_1's rmse: 5.72401
[150]	training's rmse: 4.93517	valid_1's rmse: 5.18179
[200]	training's rmse: 4.7004	valid_1's rmse: 4.90465
[250]	training's rmse: 4.59709	valid_1's rmse: 4.76762
[300]	training's rmse: 4.5482	valid_1's rmse: 4.69593
[350]	training's rmse: 4.52206	valid_1's rmse: 4.65731
[400]	training's rmse: 4.50566	valid_1's rmse: 4.63329
[450]	training's rmse: 4.4941	valid_1's rmse: 4.61926
[500]	training's rmse: 4.48464	valid_1's rmse: 4.60937
[550]	training's rmse: 4.47712	valid_1's rmse: 4.60402
[600]	training's rmse: 4.47015	valid_1's rmse: 4.59898
[650]	training's rmse: 4.46385	valid_1's rmse: 4.59777
[700]	training's rmse: 4.45809	valid_1's rmse: 4.59639
[750]	training's rmse: 4.45256	valid_1's rmse: 4.59518
[800]	training's rmse: 4.4472	valid_1's rmse: 4.59252
[850]	trai



[LightGBM] [Info] Start training from score 9.170344
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.63308	valid_1's rmse: 6.88231
[100]	training's rmse: 5.45133	valid_1's rmse: 5.762
[150]	training's rmse: 4.91604	valid_1's rmse: 5.21873
[200]	training's rmse: 4.67957	valid_1's rmse: 4.95332
[250]	training's rmse: 4.57499	valid_1's rmse: 4.82635
[300]	training's rmse: 4.5254	valid_1's rmse: 4.75884
[350]	training's rmse: 4.49843	valid_1's rmse: 4.72314
[400]	training's rmse: 4.48208	valid_1's rmse: 4.70244
[450]	training's rmse: 4.46997	valid_1's rmse: 4.68786
[500]	training's rmse: 4.46083	valid_1's rmse: 4.67864
[550]	training's rmse: 4.45279	valid_1's rmse: 4.67226
[600]	training's rmse: 4.44574	valid_1's rmse: 4.66909
[650]	training's rmse: 4.43953	valid_1's rmse: 4.667
[700]	training's rmse: 4.43347	valid_1's rmse: 4.66458
[750]	training's rmse: 4.42826	valid_1's rmse: 4.6644
[800]	training's rmse: 4.42291	valid_1's rmse: 4.66483
[850]	traini



[LightGBM] [Info] Start training from score 9.170246
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.63636	valid_1's rmse: 6.82241
[100]	training's rmse: 5.45878	valid_1's rmse: 5.69936
[150]	training's rmse: 4.92578	valid_1's rmse: 5.1704
[200]	training's rmse: 4.6909	valid_1's rmse: 4.93411
[250]	training's rmse: 4.5873	valid_1's rmse: 4.81846
[300]	training's rmse: 4.53824	valid_1's rmse: 4.76592
[350]	training's rmse: 4.51233	valid_1's rmse: 4.7337
[400]	training's rmse: 4.49638	valid_1's rmse: 4.71525
[450]	training's rmse: 4.48475	valid_1's rmse: 4.70097
[500]	training's rmse: 4.47535	valid_1's rmse: 4.69218
[550]	training's rmse: 4.46771	valid_1's rmse: 4.68492
[600]	training's rmse: 4.46071	valid_1's rmse: 4.68008
[650]	training's rmse: 4.45451	valid_1's rmse: 4.67736
[700]	training's rmse: 4.4485	valid_1's rmse: 4.67401
[750]	training's rmse: 4.44273	valid_1's rmse: 4.67168
[800]	training's rmse: 4.43722	valid_1's rmse: 4.67098
[850]	train

In [64]:
textfile = open("../03.Models/General/feature_list.txt", "w")
for element in feature_list:
    textfile.write(element + "\n")
textfile.close()

In [65]:
results

Unnamed: 0,Model_Name,Mean Valid RMSE,Std Valid RMSE,Mean Train RMSE,Std Train RMSE,OOF RMSE,Diff RMSE,Time
0,Lgbm Model,4.627371,0.033401,4.402904,0.01872,4.627491,-0.224467,114901 s


In [62]:
# Saving Models
for fold,model in enumerate(models):
    filename = f'../03.Models/Lightgbm/lgbm_fold_{fold}'
    pickle.dump(model, open(filename, 'wb'))
del models