## 1. Importing Libraries

In [1]:
import shutil
import apiquery
import pandas as pd
import sys
import seaborn as sns
import os
import numpy as np
import random
import torch
import gc
DATA_PATH = '../01.Data'
shutil.copy("apiquery_pyc.py", "apiquery.pyc")

module_path = "../src"
if module_path not in sys.path:
    sys.path.append(module_path)
    
from utils.training import *
from utils.encoding import *
from utils.utils import *
from utils.fetch import *
from dataset.dataset import BNPParibasText
from models.models import Roberta_Model
from utils.EarlyStopping import EarlyStopping
from utils.LoopFunctions import train_fn,valid_fn
from utils.prediction import get_prediction,get_embedding
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
import math
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import math
import time
import lightgbm as lgbm
import matplotlib.pyplot as plt
import torch.nn as nn
import config
import transformers

In [6]:
%%time
df_train     = pd.read_csv(os.path.join("../01.Data",'fold.csv'))
y_submission = pd.read_csv(os.path.join(DATA_PATH,'y_test_submission_example.tsv'), index_col='Index', encoding='utf-8', sep='\t')

CPU times: user 2.45 s, sys: 301 ms, total: 2.75 s
Wall time: 2.75 s


In [2]:
def calc_oof(df,config):
    df.loc[:,'oof'] = -1
    for fold in np.sort(df.fold.unique()):
        print(f'Predicting Model: {fold}')
        valid       = df[df['fold']==fold]
        valid_index = valid.index.to_list()
        valid = valid.reset_index(drop=True)
        # Defining DataSet
        col_unique = generate_col_unique(valid,config.COLUMNS_ENCODE)
        tokenizer = transformers.RobertaTokenizer.from_pretrained(config.PRETRAINED)
        valid_dataset = BNPParibasText(valid,config.MAX_LENGTH,tokenizer,col_unique)
        valid_loader = torch.utils.data.DataLoader(
            valid_dataset,
            batch_size  = config.BATCH_SIZE,
            num_workers = config.NUM_WORKERS,
            shuffle     = False,
            pin_memory  = True,
        )

       # Defining Device
        model = Roberta_Model(pretrained_model=config.PRETRAINED,dropout = config.DROPOUT)
        model.load_state_dict(torch.load(f'../03.Models/BNP_PARIBAS_ROBERTA_FOLD_{fold}'))
        model.to(config.DEVICE)
        preds = get_prediction(valid_loader, model,config.DEVICE)
        df.loc[valid_index,'oof'] = preds
    oof_score = np.sqrt(mean_squared_error(df['target'],df['oof']))
    print('OOF_SCORE (RMSE): ',oof_score)
    return oof_score


# Calculating predictions for test
def calculate_test(test,config):
    col_unique = generate_col_unique(test,config.COLUMNS_ENCODE)   
    tokenizer     = transformers.RobertaTokenizer.from_pretrained(config.PRETRAINED)
    test_dataset = BNPParibasText(test,config.MAX_LENGTH,tokenizer,col_unique)
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size  = config.BATCH_SIZE,
        pin_memory  = True,
        num_workers = config.NUM_WORKERS
    )
    preds = 0
    for fold in range(0,5):
        model = Roberta_Model(pretrained_model=config.PRETRAINED,dropout = config.DROPOUT)
        model.load_state_dict(torch.load(f'../03.Models/BNP_PARIBAS_ROBERTA_FOLD_{fold}'))
        model.to(config.DEVICE)
        preds = preds + get_prediction(test_loader, model,config.DEVICE)
    test['preds'] = preds/5
    print(f'Real RMSE: ',math.sqrt(mean_squared_error(test['preds'].values,test['Target'].values)))

In [3]:
def run(data,fold,output_path,config,run=None):
    print(f'******************** Model Fold {fold}  *****************')
    seed_everything(seed=config.SEED)
    train = data[data['fold']!=fold].reset_index(drop=True)
    valid = data[data['fold']==fold].reset_index(drop=True)
    col_unique = generate_col_unique(train,config.COLUMNS_ENCODE)
    col_unique = generate_col_unique(valid,config.COLUMNS_ENCODE)
    
    
    print('Train: ',train.shape[0], 'Valid: ',valid.shape[0])
    # Defining DataSet
    tokenizer     = transformers.RobertaTokenizer.from_pretrained(config.PRETRAINED)
    train_dataset = BNPParibasText(train,config.MAX_LENGTH,tokenizer,col_unique)
    valid_dataset = BNPParibasText(valid,config.MAX_LENGTH,tokenizer,col_unique)
        
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size  = config.BATCH_SIZE,
        pin_memory  = True,
        num_workers = config.NUM_WORKERS
    )

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size  = config.BATCH_SIZE,
        num_workers = config.NUM_WORKERS,
        shuffle     = False,
        pin_memory  = True,
    )
    
    # Defining Device
    model = Roberta_Model(pretrained_model=config.PRETRAINED,dropout = config.DROPOUT)
    model.to(config.DEVICE)
    criterion = nn.MSELoss()
    criterion.to(config.DEVICE)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": config.WEIGHT_DECAY,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int((len(train) / config.BATCH_SIZE )* config.EPOCHS)
    config.SCHEDULER_PARAMETERS['NUM_TRAIN_STEPS'] = num_train_steps
    print(f'num_train_steps: {num_train_steps}')
    optimizer = fetch_optimizer(config.OPTIMIZER_NAME,config.LEARNING_RATE,optimizer_parameters)
    scheduler = fetch_scheduler(config.SCHEDULER_NAME,optimizer,config.SCHEDULER_PARAMETERS)
    
    es = EarlyStopping (patience = config.EARLY_STOPPING, mode = config.MODE,delta=0)
      
    for epoch in range(config.EPOCHS):
        print('Epoch {}, lr {}'.format(epoch, optimizer.param_groups[0]['lr']))        
        training_loss = train_fn(train_loader,model,criterion,optimizer,config.DEVICE,scheduler,mode_sched = config.MODE_SCHEDULER)
        valid_loss    = valid_fn(valid_loader,model,criterion,config.DEVICE)
        if run:
            run.log({'training_loss':training_loss,'valid_loss':valid_loss})
            
        es(valid_loss, model,output_path)
        
        if es.early_stop:
            print('Meet early stopping')
            return es.get_best_val_score()
        gc.collect()
        torch.cuda.empty_cache()
    print("Didn't meet early stopping")
    return es.get_best_val_score()

In [7]:
for i in range(0,5):
    output_path = f'../03.Models/BNP_PARIBAS_ROBERTA_FOLD_{i}'
    run(df_train,i,output_path,config)

******************** Model Fold 4  *****************
Train:  81623 Valid:  20405


  0%|          | 0/1276 [00:00<?, ?it/s]

num_train_steps: 15304
Epoch 0, lr 0.0
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:04<00:00,  4.19it/s, LR=1.67e-5, Train_Loss=67.2]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 67.19512297292488


100%|██████████| 319/319 [00:30<00:00, 10.48it/s, Eval_Loss=33.4]


Validation -> Loss: 33.36113730044948
Validation score improved (inf --> 33.36113730044948). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 1, lr 1.6679738562091504e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:06<00:00,  4.16it/s, LR=1.85e-5, Train_Loss=29.7]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 29.724486125300295


100%|██████████| 319/319 [00:30<00:00, 10.53it/s, Eval_Loss=26]  


Validation -> Loss: 26.022469039247326
Validation score improved (33.36113730044948 --> 26.022469039247326). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 2, lr 1.8516044721939887e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:04<00:00,  4.19it/s, LR=1.67e-5, Train_Loss=24.5]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 24.513245080332023


100%|██████████| 319/319 [00:29<00:00, 10.65it/s, Eval_Loss=25.4]


Validation -> Loss: 25.440321551594987
Validation score improved (26.022469039247326 --> 25.440321551594987). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 3, lr 1.6663278640917674e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:05<00:00,  4.18it/s, LR=1.48e-5, Train_Loss=22]  
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 21.967160881873582


100%|██████████| 319/319 [00:29<00:00, 10.65it/s, Eval_Loss=24.1]


Validation -> Loss: 24.118778252676364
Validation score improved (25.440321551594987 --> 24.118778252676364). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 4, lr 1.4810512559895456e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:01<00:00,  4.23it/s, LR=1.3e-5, Train_Loss=20.2] 
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 20.168962823559873


100%|██████████| 319/319 [00:29<00:00, 10.93it/s, Eval_Loss=23.1]


Validation -> Loss: 23.065489057463164
Validation score improved (24.118778252676364 --> 23.065489057463164). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 5, lr 1.2957746478873242e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:01<00:00,  4.23it/s, LR=1.11e-5, Train_Loss=18.8]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 18.785607548529825


100%|██████████| 319/319 [00:29<00:00, 10.99it/s, Eval_Loss=23.1]


Validation -> Loss: 23.056060429277093
Validation score improved (23.065489057463164 --> 23.056060429277093). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 6, lr 1.1104980397851025e-05
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:01<00:00,  4.23it/s, LR=9.25e-6, Train_Loss=17.8]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 17.76202331347899


100%|██████████| 319/319 [00:29<00:00, 10.77it/s, Eval_Loss=22.9]


Validation -> Loss: 22.942585176808706
Validation score improved (23.056060429277093 --> 22.942585176808706). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 7, lr 9.252214316828807e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:01<00:00,  4.23it/s, LR=7.4e-6, Train_Loss=16.9] 
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 16.89215257063181


100%|██████████| 319/319 [00:29<00:00, 10.78it/s, Eval_Loss=23.3]
  0%|          | 0/1276 [00:00<?, ?it/s]

Validation -> Loss: 23.323868285152233
EarlyStopping counter: 1 out of 5
Epoch 8, lr 7.399448235806593e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=5.55e-6, Train_Loss=16.2]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 16.22758826697508


100%|██████████| 319/319 [00:29<00:00, 10.81it/s, Eval_Loss=22.7]


Validation -> Loss: 22.71935713179059
Validation score improved (22.942585176808706 --> 22.71935713179059). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 9, lr 5.546682154784376e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:01<00:00,  4.23it/s, LR=3.69e-6, Train_Loss=15.6]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 15.600954898098793


100%|██████████| 319/319 [00:29<00:00, 10.74it/s, Eval_Loss=22.3]


Validation -> Loss: 22.314896108214757
Validation score improved (22.71935713179059 --> 22.314896108214757). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 10, lr 3.693916073762161e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:01<00:00,  4.23it/s, LR=1.84e-6, Train_Loss=15.2]
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 15.21357053181968


100%|██████████| 319/319 [00:29<00:00, 10.75it/s, Eval_Loss=22.3]


Validation -> Loss: 22.253296053895383
Validation score improved (22.314896108214757 --> 22.253296053895383). Saving model!


  0%|          | 0/1276 [00:00<?, ?it/s]

Epoch 11, lr 1.841149992739945e-06
Mode Scheduler: OK


100%|██████████| 1276/1276 [05:02<00:00,  4.22it/s, LR=0, Train_Loss=14.9]      
  0%|          | 0/319 [00:00<?, ?it/s]

Training -> Loss: 14.920602123752284


100%|██████████| 319/319 [00:29<00:00, 10.75it/s, Eval_Loss=22.1]


Validation -> Loss: 22.061710692498377
Validation score improved (22.253296053895383 --> 22.061710692498377). Saving model!
Didn't meet early stopping


In [17]:
calc_oof(df_train,config)

Predicting Model: 0


HBox(children=(FloatProgress(value=0.0, max=319.0), HTML(value='')))


Predicting Model: 1


HBox(children=(FloatProgress(value=0.0, max=319.0), HTML(value='')))


Predicting Model: 2


HBox(children=(FloatProgress(value=0.0, max=319.0), HTML(value='')))


Predicting Model: 3


HBox(children=(FloatProgress(value=0.0, max=319.0), HTML(value='')))


Predicting Model: 4


HBox(children=(FloatProgress(value=0.0, max=319.0), HTML(value='')))


OOF_SCORE (RMSE):  4.740007182216003


4.740007182216003

In [22]:
test           = pd.read_csv(os.path.join(DATA_PATH,'test_preprocessed.csv'))
test['target'] = -1
calculate_test(test,config)

HBox(children=(FloatProgress(value=0.0, max=399.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=399.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=399.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=399.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=399.0), HTML(value='')))


Real RMSE:  4.6244272231443775


## Combining Word Embeddings with More Features

In [39]:
col_unique = generate_col_unique(df_train,config.COLUMNS_ENCODE)
tokenizer     = transformers.RobertaTokenizer.from_pretrained(config.PRETRAINED)
train_dataset = BNPParibasText(df_train,config.MAX_LENGTH,tokenizer,col_unique)
train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size  = 32,
            pin_memory  = True,
            num_workers = 72
        )
embedding_all = 0
for fold in np.sort(df_train['fold'].unique()):
    model         = Roberta_Model(pretrained_model=config.PRETRAINED)
    model.load_state_dict(torch.load(f'../03.Models/BNP_PARIBAS_ROBERTA_FOLD_{fold}'))
    embedding_all += get_embedding(train_loader, model, 'cuda')
    del model
    torch.cuda.empty_cache()
embedding_all = embedding_all/len(df_train['fold'].unique())
df_train[[f'emb_{col_unique}_{i}' for i in range(embedding_all.shape[1])]] = embedding_all

HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))




In [47]:
df_test      = pd.read_csv(os.path.join(DATA_PATH,'test_preprocessed.csv'))
df_test['target'] = -1

col_unique = generate_col_unique(df_test,config.COLUMNS_ENCODE)
tokenizer     = transformers.RobertaTokenizer.from_pretrained(config.PRETRAINED)
test_dataset = BNPParibasText(df_test,config.MAX_LENGTH,tokenizer,col_unique)
test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size  = 32,
            pin_memory  = True,
            num_workers = 72
        )
embedding_all = 0
for fold in np.sort(df_train['fold'].unique()):
    model         = Roberta_Model(pretrained_model=config.PRETRAINED)
    model.load_state_dict(torch.load(f'../03.Models/BNP_PARIBAS_ROBERTA_FOLD_{fold}'))
    embedding_all += get_embedding(test_loader, model, 'cuda')
    del model
    torch.cuda.empty_cache()
embedding_all = embedding_all/len(df_train['fold'].unique())
df_test[[f'emb_{col_unique}_{i}' for i in range(embedding_all.shape[1])]] = embedding_all

HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))




In [43]:
columns_modeling = ['additives_n','ingredients_from_palm_oil_n',
                    'ingredients_that_may_be_from_palm_oil_n','target',
                    'states_en_brands','states_en_categories','states_en_characteristics','states_en_expiration date',
                    'states_en_general_complete','states_en_ingredients','pnns_groups_1','pnns_groups_2',
                    'states_en_packaging','states_en_packaging-code-','states_en_photo_upload',
                    'states_en_photo_validate','states_en_product name','states_en_quantity','diff_t'] + [f'emb_{col_unique}_{i}' for i in range(embedding_all.shape[1])]
columns_label = df_train[columns_modeling].select_dtypes(include=['object']).columns.to_list()
print(columns_label)

['states_en_brands', 'states_en_categories', 'states_en_characteristics', 'states_en_expiration date', 'states_en_general_complete', 'states_en_ingredients', 'pnns_groups_1', 'pnns_groups_2', 'states_en_packaging', 'states_en_packaging-code-', 'states_en_photo_upload', 'states_en_photo_validate', 'states_en_product name', 'states_en_quantity']


In [44]:
df_train,dict_le = label_encoding(df_train,label_cols = columns_label, drop_original = True, missing_new_cat = True)

Mode: Missing as new category
Label Encoding:  label_states_en_brands
Label Encoding:  label_states_en_categories
Label Encoding:  label_states_en_characteristics
Label Encoding:  label_states_en_expiration date
Label Encoding:  label_states_en_general_complete
Label Encoding:  label_states_en_ingredients
Label Encoding:  label_pnns_groups_1
Label Encoding:  label_pnns_groups_2
Label Encoding:  label_states_en_packaging
Label Encoding:  label_states_en_packaging-code-
Label Encoding:  label_states_en_photo_upload
Label Encoding:  label_states_en_photo_validate
Label Encoding:  label_states_en_product name
Label Encoding:  label_states_en_quantity


In [48]:
df_test = apply_label_encoder(df_test,dict_le,drop_original = True, missing_new_cat = True)

Mode: Missing as new category
Applying Label Encoding:  label_states_en_brands
Applying Label Encoding:  label_states_en_categories
Applying Label Encoding:  label_states_en_characteristics
Applying Label Encoding:  label_states_en_expiration date
Applying Label Encoding:  label_states_en_general_complete
Applying Label Encoding:  label_states_en_ingredients
Applying Label Encoding:  label_pnns_groups_1
Applying Label Encoding:  label_pnns_groups_2
Applying Label Encoding:  label_states_en_packaging
Applying Label Encoding:  label_states_en_packaging-code-
Applying Label Encoding:  label_states_en_photo_upload
Applying Label Encoding:  label_states_en_photo_validate
Applying Label Encoding:  label_states_en_product name
Applying Label Encoding:  label_states_en_quantity


In [52]:
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'rmse'},
        'num_leaves':12,
        'learning_rate': 0.001,
        "min_child_samples": 150,
        "max_depth" : 5,
        'feature_fraction':  0.5,
        "bagging_freq": 1,
        'bagging_fraction': 0.75,
        "is_unbalance" : False,
        'force_col_wise':True,
        'num_threads':18,
        #"scale_pos_weight":5 -> Generally  is the ratio of number of negative class to the positive class.
        'bagging_seed':42,
        'lambda_l1':1.5,
        'lambda_l2':1,
        'verbose': 1

}
cat_columns = [i for i in df_train.columns.to_list() if i.startswith('label_')]
columns_modeling_last = list(set(columns_modeling)-set(columns_label)) + ['fold'] + cat_columns 

In [53]:
results,models,importances,oof,feature_list = Training_Lightgbm(df_train[columns_modeling_last],params,fold_column = 'fold',target_column = 'target',cat_vars = cat_columns ,metric = 'RMSE',early_stopping = 200,max_boost_round = 8000)

Columns: ['emb_product_name_ingredients_text_brands_tags__557', 'emb_product_name_ingredients_text_brands_tags__730', 'emb_product_name_ingredients_text_brands_tags__746', 'emb_product_name_ingredients_text_brands_tags__2', 'emb_product_name_ingredients_text_brands_tags__638', 'emb_product_name_ingredients_text_brands_tags__393', 'emb_product_name_ingredients_text_brands_tags__315', 'emb_product_name_ingredients_text_brands_tags__550', 'emb_product_name_ingredients_text_brands_tags__717', 'emb_product_name_ingredients_text_brands_tags__757', 'emb_product_name_ingredients_text_brands_tags__324', 'emb_product_name_ingredients_text_brands_tags__64', 'emb_product_name_ingredients_text_brands_tags__45', 'emb_product_name_ingredients_text_brands_tags__274', 'emb_product_name_ingredients_text_brands_tags__498', 'emb_product_name_ingredients_text_brands_tags__173', 'emb_product_name_ingredients_text_brands_tags__715', 'emb_product_name_ingredients_text_brands_tags__210', 'emb_product_name_ingr

[LightGBM] [Info] Total Bins 196221
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 786




[LightGBM] [Info] Start training from score 9.171473
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 8.66406	valid_1's rmse: 8.65719
[100]	training's rmse: 8.32912	valid_1's rmse: 8.32015
[150]	training's rmse: 8.01344	valid_1's rmse: 8.00235
[200]	training's rmse: 7.71541	valid_1's rmse: 7.70243
[250]	training's rmse: 7.43425	valid_1's rmse: 7.41936
[300]	training's rmse: 7.16972	valid_1's rmse: 7.15305
[350]	training's rmse: 6.92061	valid_1's rmse: 6.90218
[400]	training's rmse: 6.68639	valid_1's rmse: 6.66638
[450]	training's rmse: 6.46655	valid_1's rmse: 6.4448
[500]	training's rmse: 6.25991	valid_1's rmse: 6.23659
[550]	training's rmse: 6.06628	valid_1's rmse: 6.04158
[600]	training's rmse: 5.88465	valid_1's rmse: 5.85848
[650]	training's rmse: 5.71458	valid_1's rmse: 5.68714
[700]	training's rmse: 5.55565	valid_1's rmse: 5.52705
[750]	training's rmse: 5.40722	valid_1's rmse: 5.37746
[800]	training's rmse: 5.26827	valid_1's rmse: 5.23746
[850]	t

[7150]	training's rmse: 3.49025	valid_1's rmse: 3.50769
[7200]	training's rmse: 3.48983	valid_1's rmse: 3.50767
[7250]	training's rmse: 3.48938	valid_1's rmse: 3.50759
[7300]	training's rmse: 3.48893	valid_1's rmse: 3.50752
[7350]	training's rmse: 3.4885	valid_1's rmse: 3.5075
[7400]	training's rmse: 3.48804	valid_1's rmse: 3.5074
[7450]	training's rmse: 3.48763	valid_1's rmse: 3.50735
[7500]	training's rmse: 3.48722	valid_1's rmse: 3.50737
[7550]	training's rmse: 3.48684	valid_1's rmse: 3.50735
[7600]	training's rmse: 3.48643	valid_1's rmse: 3.50731
[7650]	training's rmse: 3.48603	valid_1's rmse: 3.50728
[7700]	training's rmse: 3.48565	valid_1's rmse: 3.50729
[7750]	training's rmse: 3.48524	valid_1's rmse: 3.50726
[7800]	training's rmse: 3.48485	valid_1's rmse: 3.50723
[7850]	training's rmse: 3.48446	valid_1's rmse: 3.50721
[7900]	training's rmse: 3.48406	valid_1's rmse: 3.50717
[7950]	training's rmse: 3.48368	valid_1's rmse: 3.50719
[8000]	training's rmse: 3.48326	valid_1's rmse: 3.5

[LightGBM] [Info] Total Bins 196221
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 786




[LightGBM] [Info] Start training from score 9.169930
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 8.66061	valid_1's rmse: 8.66939
[100]	training's rmse: 8.32525	valid_1's rmse: 8.33301
[150]	training's rmse: 8.00917	valid_1's rmse: 8.01585
[200]	training's rmse: 7.71063	valid_1's rmse: 7.71689
[250]	training's rmse: 7.42911	valid_1's rmse: 7.43512
[300]	training's rmse: 7.16412	valid_1's rmse: 7.16992
[350]	training's rmse: 6.91461	valid_1's rmse: 6.92008
[400]	training's rmse: 6.67988	valid_1's rmse: 6.68534
[450]	training's rmse: 6.45957	valid_1's rmse: 6.46514
[500]	training's rmse: 6.25247	valid_1's rmse: 6.25807
[550]	training's rmse: 6.05847	valid_1's rmse: 6.06436
[600]	training's rmse: 5.8764	valid_1's rmse: 5.8827
[650]	training's rmse: 5.70588	valid_1's rmse: 5.71283
[700]	training's rmse: 5.54653	valid_1's rmse: 5.55423
[750]	training's rmse: 5.39768	valid_1's rmse: 5.40624
[800]	training's rmse: 5.25844	valid_1's rmse: 5.26801
[850]	tr

[LightGBM] [Info] Total Bins 196221
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 786




[LightGBM] [Info] Start training from score 9.171253
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 8.66487	valid_1's rmse: 8.65381
[100]	training's rmse: 8.32973	valid_1's rmse: 8.3181
[150]	training's rmse: 8.01376	valid_1's rmse: 8.00161
[200]	training's rmse: 7.71564	valid_1's rmse: 7.7029
[250]	training's rmse: 7.43439	valid_1's rmse: 7.42072
[300]	training's rmse: 7.16949	valid_1's rmse: 7.1552
[350]	training's rmse: 6.9202	valid_1's rmse: 6.90543
[400]	training's rmse: 6.68581	valid_1's rmse: 6.67048
[450]	training's rmse: 6.46567	valid_1's rmse: 6.44971
[500]	training's rmse: 6.2588	valid_1's rmse: 6.2423
[550]	training's rmse: 6.06505	valid_1's rmse: 6.04841
[600]	training's rmse: 5.88313	valid_1's rmse: 5.86605
[650]	training's rmse: 5.71286	valid_1's rmse: 5.69539
[700]	training's rmse: 5.55371	valid_1's rmse: 5.5358
[750]	training's rmse: 5.40513	valid_1's rmse: 5.38695
[800]	training's rmse: 5.26612	valid_1's rmse: 5.2479
[850]	training

[LightGBM] [Info] Total Bins 196221
[LightGBM] [Info] Number of data points in the train set: 81623, number of used features: 786




[LightGBM] [Info] Start training from score 9.170344
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 8.66146	valid_1's rmse: 8.66793
[100]	training's rmse: 8.32458	valid_1's rmse: 8.33943
[150]	training's rmse: 8.00703	valid_1's rmse: 8.03032
[200]	training's rmse: 7.70727	valid_1's rmse: 7.73857
[250]	training's rmse: 7.42448	valid_1's rmse: 7.4632
[300]	training's rmse: 7.15818	valid_1's rmse: 7.20444
[350]	training's rmse: 6.90742	valid_1's rmse: 6.96077
[400]	training's rmse: 6.67152	valid_1's rmse: 6.73206
[450]	training's rmse: 6.45009	valid_1's rmse: 6.51771
[500]	training's rmse: 6.24193	valid_1's rmse: 6.31657
[550]	training's rmse: 6.04678	valid_1's rmse: 6.12846
[600]	training's rmse: 5.86383	valid_1's rmse: 5.95184
[650]	training's rmse: 5.69243	valid_1's rmse: 5.78652
[700]	training's rmse: 5.53226	valid_1's rmse: 5.63254
[750]	training's rmse: 5.38252	valid_1's rmse: 5.48862
[800]	training's rmse: 5.24252	valid_1's rmse: 5.3541
[850]	tr

[LightGBM] [Info] Total Bins 196221
[LightGBM] [Info] Number of data points in the train set: 81623, number of used features: 786




[LightGBM] [Info] Start training from score 9.170246
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 8.66088	valid_1's rmse: 8.669
[100]	training's rmse: 8.32513	valid_1's rmse: 8.33438
[150]	training's rmse: 8.00877	valid_1's rmse: 8.01938
[200]	training's rmse: 7.7099	valid_1's rmse: 7.72185
[250]	training's rmse: 7.42797	valid_1's rmse: 7.44136
[300]	training's rmse: 7.16261	valid_1's rmse: 7.17768
[350]	training's rmse: 6.91289	valid_1's rmse: 6.92955
[400]	training's rmse: 6.67797	valid_1's rmse: 6.69606
[450]	training's rmse: 6.45742	valid_1's rmse: 6.47712
[500]	training's rmse: 6.25017	valid_1's rmse: 6.2717
[550]	training's rmse: 6.05586	valid_1's rmse: 6.07886
[600]	training's rmse: 5.87354	valid_1's rmse: 5.89854
[650]	training's rmse: 5.70287	valid_1's rmse: 5.72969
[700]	training's rmse: 5.54328	valid_1's rmse: 5.57209
[750]	training's rmse: 5.39428	valid_1's rmse: 5.42503
[800]	training's rmse: 5.2548	valid_1's rmse: 5.28737
[850]	train

In [55]:
probs = 0
for i in models:
    probs = probs + (i.predict(df_test[feature_list]))
    print('fin_predict')
y_test_pred = probs/5.0
print(f'Real: ',math.sqrt(mean_squared_error(y_test_pred,df_test['Target'].values)))

fin_predict
fin_predict
fin_predict
fin_predict
fin_predict
Real:  4.4556653551634025
