In [24]:
import shutil
import apiquery
import pandas as pd
import sys
import seaborn as sns
import os
import torch
import numpy as np
import random
DATA_PATH = '../01.Data'
shutil.copy("apiquery_pyc.py", "apiquery.pyc")
module_path = "../src"
if module_path not in sys.path:
    sys.path.append(module_path)
    
from utils.training import *
from utils.encoding import *
from utils.utils import *
from models.models import Roberta_Model
from dataset.dataset import BNPParibasText
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
import math
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import math
import time
import matplotlib.pyplot as plt
import transformers
pd.set_option('display.max_rows', 900)


In [None]:
df_train['in']

In [6]:
def get_embedding(data_loader, model, device):
    from tqdm.notebook import tqdm
    # Put the model in eval mode
    model.to(device)
    model.eval()
    # List for store final predictions
    final_predictions = []
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for b_idx, data in enumerate(tk0):
            for key,value in data.items():
                data[key] = value.to(device)
            predictions = model._embeddings(data['ids'],data['mask'])
            predictions = predictions.cpu()
            final_predictions.append(predictions)
    return np.vstack(final_predictions)

In [2]:
MAX_LENGTH   = 16
PRETRAINED   = 'roberta-base'
SEED         = 42

In [3]:
%%time
df_train     = pd.read_csv(os.path.join("../01.Data",'fold.csv'))
y_submission = pd.read_csv(os.path.join(DATA_PATH,'y_test_submission_example.tsv'), index_col='Index', encoding='utf-8', sep='\t')

CPU times: user 2.26 s, sys: 217 ms, total: 2.48 s
Wall time: 2.48 s


In [7]:
%%time
COLUMN_NAME  = 'product_name' 
tokenizer     = transformers.RobertaTokenizer.from_pretrained(PRETRAINED)
train_dataset = BNPParibasText(df_train,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_train = get_embedding(train_loader, model, 'cuda')
df_train[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_train.shape[1])]] = emb_sentence_train

HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))


CPU times: user 1min 19s, sys: 12.4 s, total: 1min 31s
Wall time: 1min 29s


In [None]:
#COLUMN_NAME  = 'ingredients_text' 
#tokenizer     = transformers.RobertaTokenizer.from_pretrained(PRETRAINED)
#train_dataset = BNPParibasText(df_train,MAX_LENGTH,tokenizer,COLUMN_NAME)
#model         = Roberta_Model(pretrained_model=PRETRAINED)
#train_loader = torch.utils.data.DataLoader(
#        train_dataset,
#        batch_size  = 32,
#        pin_memory  = True,
#        num_workers = 72
#    )
#emb_sentence_train = get_embedding(train_loader, model, 'cuda')
#df_train[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_train.shape[1])]] = emb_sentence_train

In [8]:
columns_modeling = ['additives_n','ingredients_from_palm_oil_n',
                    'ingredients_that_may_be_from_palm_oil_n','target',
                    'states_en_brands','states_en_categories','states_en_characteristics','states_en_expiration date',
                    'states_en_general_complete','states_en_ingredients','pnns_groups_1','pnns_groups_2',
                    'states_en_packaging','states_en_packaging-code-','states_en_photo_upload',
                    'states_en_photo_validate','states_en_product name','states_en_quantity','diff_t'] + [f'emb_{i}' for i in range(emb_sentence_train.shape[1])]
columns_label = df_train[columns_modeling].select_dtypes(include=['object']).columns.to_list()
print(columns_label)

['states_en_brands', 'states_en_categories', 'states_en_characteristics', 'states_en_expiration date', 'states_en_general_complete', 'states_en_ingredients', 'pnns_groups_1', 'pnns_groups_2', 'states_en_packaging', 'states_en_packaging-code-', 'states_en_photo_upload', 'states_en_photo_validate', 'states_en_product name', 'states_en_quantity']


In [9]:
df_train,dict_le = label_encoding(df_train,label_cols = columns_label, drop_original = True, missing_new_cat = True)

Mode: Missing as new category
Label Encoding:  label_states_en_brands
Label Encoding:  label_states_en_categories
Label Encoding:  label_states_en_characteristics
Label Encoding:  label_states_en_expiration date
Label Encoding:  label_states_en_general_complete
Label Encoding:  label_states_en_ingredients
Label Encoding:  label_pnns_groups_1
Label Encoding:  label_pnns_groups_2
Label Encoding:  label_states_en_packaging
Label Encoding:  label_states_en_packaging-code-
Label Encoding:  label_states_en_photo_upload
Label Encoding:  label_states_en_photo_validate
Label Encoding:  label_states_en_product name
Label Encoding:  label_states_en_quantity


In [17]:
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'rmse'},
        'num_leaves':10,
        'learning_rate': 0.1,
        "min_child_samples": 150,
        "max_depth" : 5,
        'feature_fraction':  0.7,
        "bagging_freq": 1,
        'bagging_fraction': 0.75,
        "is_unbalance" : False,
        'force_col_wise':True,
        'num_threads':18,
        #"scale_pos_weight":5 -> Generally  is the ratio of number of negative class to the positive class.
        'bagging_seed':42,
        'lambda_l1':1.5,
        'lambda_l2':1,
        'verbose': 1

}
cat_columns = [i for i in df_train.columns.to_list() if i.startswith('label_')]
columns_modeling_last = list(set(columns_modeling)-set(columns_label)) + ['fold'] + cat_columns 

In [18]:
results,models,importances,oof,feature_list = Training_Lightgbm(df_train[columns_modeling_last],params,fold_column = 'fold',target_column = 'target',cat_vars = cat_columns ,metric = 'RMSE',early_stopping = 200,max_boost_round = 8000)

Columns: ['emb_714', 'emb_432', 'emb_139', 'emb_587', 'emb_306', 'emb_321', 'emb_433', 'emb_45', 'emb_511', 'emb_168', 'emb_156', 'emb_177', 'emb_151', 'emb_600', 'emb_162', 'emb_295', 'emb_318', 'emb_326', 'emb_249', 'emb_583', 'emb_328', 'emb_29', 'emb_649', 'emb_665', 'emb_69', 'emb_580', 'emb_598', 'emb_495', 'emb_647', 'emb_554', 'emb_485', 'emb_541', 'emb_348', 'emb_675', 'emb_264', 'emb_193', 'emb_115', 'emb_64', 'diff_t', 'emb_288', 'emb_72', 'emb_155', 'emb_585', 'emb_59', 'emb_100', 'emb_97', 'emb_722', 'emb_88', 'emb_104', 'emb_559', 'emb_3', 'emb_429', 'emb_142', 'emb_407', 'emb_467', 'emb_451', 'emb_682', 'emb_522', 'emb_292', 'emb_458', 'emb_687', 'emb_444', 'emb_691', 'emb_514', 'emb_277', 'emb_323', 'emb_693', 'emb_241', 'emb_255', 'emb_91', 'emb_362', 'emb_82', 'emb_297', 'emb_259', 'emb_582', 'emb_99', 'emb_285', 'emb_570', 'emb_41', 'emb_105', 'emb_409', 'emb_445', 'emb_534', 'emb_110', 'emb_557', 'emb_388', 'emb_669', 'emb_590', 'emb_305', 'emb_86', 'emb_159', 'emb_

[LightGBM] [Info] Total Bins 196221
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 786




[LightGBM] [Info] Start training from score 9.171473
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 7.15048	valid_1's rmse: 7.19671
[100]	training's rmse: 6.829	valid_1's rmse: 6.91856
[150]	training's rmse: 6.6475	valid_1's rmse: 6.78574
[200]	training's rmse: 6.51129	valid_1's rmse: 6.69623
[250]	training's rmse: 6.39836	valid_1's rmse: 6.62652
[300]	training's rmse: 6.30048	valid_1's rmse: 6.57082
[350]	training's rmse: 6.21597	valid_1's rmse: 6.53202
[400]	training's rmse: 6.1356	valid_1's rmse: 6.49608
[450]	training's rmse: 6.06237	valid_1's rmse: 6.46456
[500]	training's rmse: 5.99195	valid_1's rmse: 6.43434
[550]	training's rmse: 5.92712	valid_1's rmse: 6.40972
[600]	training's rmse: 5.86328	valid_1's rmse: 6.38479
[650]	training's rmse: 5.80185	valid_1's rmse: 6.36345
[700]	training's rmse: 5.74489	valid_1's rmse: 6.34442
[750]	training's rmse: 5.68924	valid_1's rmse: 6.32766
[800]	training's rmse: 5.63713	valid_1's rmse: 6.31062
[850]	trai

[LightGBM] [Info] Total Bins 196221
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 786




[LightGBM] [Info] Start training from score 9.169930
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 7.14542	valid_1's rmse: 7.22901
[100]	training's rmse: 6.82198	valid_1's rmse: 6.96796
[150]	training's rmse: 6.63491	valid_1's rmse: 6.82724
[200]	training's rmse: 6.50186	valid_1's rmse: 6.74207
[250]	training's rmse: 6.39226	valid_1's rmse: 6.6741
[300]	training's rmse: 6.2954	valid_1's rmse: 6.6148
[350]	training's rmse: 6.20914	valid_1's rmse: 6.56807
[400]	training's rmse: 6.12831	valid_1's rmse: 6.52763
[450]	training's rmse: 6.05491	valid_1's rmse: 6.49649
[500]	training's rmse: 5.98832	valid_1's rmse: 6.46949
[550]	training's rmse: 5.92	valid_1's rmse: 6.43991
[600]	training's rmse: 5.8585	valid_1's rmse: 6.4174
[650]	training's rmse: 5.79987	valid_1's rmse: 6.3966
[700]	training's rmse: 5.74296	valid_1's rmse: 6.37641
[750]	training's rmse: 5.68963	valid_1's rmse: 6.35868
[800]	training's rmse: 5.6377	valid_1's rmse: 6.34144
[850]	training's

[7200]	training's rmse: 2.84674	valid_1's rmse: 5.92315
[7250]	training's rmse: 2.83721	valid_1's rmse: 5.92298
[7300]	training's rmse: 2.82681	valid_1's rmse: 5.92213
[7350]	training's rmse: 2.81743	valid_1's rmse: 5.92207
[7400]	training's rmse: 2.80714	valid_1's rmse: 5.92162
[7450]	training's rmse: 2.79732	valid_1's rmse: 5.9218
[7500]	training's rmse: 2.78777	valid_1's rmse: 5.92111
[7550]	training's rmse: 2.7785	valid_1's rmse: 5.92095
[7600]	training's rmse: 2.76921	valid_1's rmse: 5.92102
[7650]	training's rmse: 2.76002	valid_1's rmse: 5.9196
[7700]	training's rmse: 2.75058	valid_1's rmse: 5.91847
[7750]	training's rmse: 2.74152	valid_1's rmse: 5.91763
[7800]	training's rmse: 2.73277	valid_1's rmse: 5.91784
[7850]	training's rmse: 2.72378	valid_1's rmse: 5.91681
[7900]	training's rmse: 2.71482	valid_1's rmse: 5.91623
[7950]	training's rmse: 2.70595	valid_1's rmse: 5.91559
[8000]	training's rmse: 2.69722	valid_1's rmse: 5.91486
Did not meet early stopping. Best iteration is:
[80

[LightGBM] [Info] Total Bins 196221
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 786




[LightGBM] [Info] Start training from score 9.171253
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 7.15605	valid_1's rmse: 7.20285
[100]	training's rmse: 6.83253	valid_1's rmse: 6.92401
[150]	training's rmse: 6.65122	valid_1's rmse: 6.78626
[200]	training's rmse: 6.51673	valid_1's rmse: 6.69902
[250]	training's rmse: 6.40788	valid_1's rmse: 6.63087
[300]	training's rmse: 6.30966	valid_1's rmse: 6.57792
[350]	training's rmse: 6.22475	valid_1's rmse: 6.53299
[400]	training's rmse: 6.14233	valid_1's rmse: 6.49271
[450]	training's rmse: 6.06788	valid_1's rmse: 6.4577
[500]	training's rmse: 5.99605	valid_1's rmse: 6.4287
[550]	training's rmse: 5.93071	valid_1's rmse: 6.40362
[600]	training's rmse: 5.86737	valid_1's rmse: 6.37744
[650]	training's rmse: 5.80526	valid_1's rmse: 6.35132
[700]	training's rmse: 5.74824	valid_1's rmse: 6.33386
[750]	training's rmse: 5.69346	valid_1's rmse: 6.31412
[800]	training's rmse: 5.63625	valid_1's rmse: 6.29351
[850]	tr

[7100]	training's rmse: 2.85368	valid_1's rmse: 5.88202
[7150]	training's rmse: 2.84311	valid_1's rmse: 5.88091
[7200]	training's rmse: 2.83382	valid_1's rmse: 5.8808
[7250]	training's rmse: 2.82374	valid_1's rmse: 5.87976
[7300]	training's rmse: 2.81375	valid_1's rmse: 5.8796
[7350]	training's rmse: 2.80395	valid_1's rmse: 5.87891
[7400]	training's rmse: 2.79406	valid_1's rmse: 5.87882
[7450]	training's rmse: 2.78456	valid_1's rmse: 5.87784
[7500]	training's rmse: 2.77543	valid_1's rmse: 5.87753
[7550]	training's rmse: 2.76585	valid_1's rmse: 5.87756
[7600]	training's rmse: 2.75649	valid_1's rmse: 5.8778
[7650]	training's rmse: 2.74746	valid_1's rmse: 5.8765
[7700]	training's rmse: 2.73781	valid_1's rmse: 5.87578
[7750]	training's rmse: 2.72837	valid_1's rmse: 5.87562
[7800]	training's rmse: 2.71935	valid_1's rmse: 5.87517
[7850]	training's rmse: 2.71044	valid_1's rmse: 5.8759
[7900]	training's rmse: 2.70142	valid_1's rmse: 5.87583
[7950]	training's rmse: 2.69258	valid_1's rmse: 5.874

[LightGBM] [Info] Total Bins 196221
[LightGBM] [Info] Number of data points in the train set: 81623, number of used features: 786




[LightGBM] [Info] Start training from score 9.170344
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 7.14322	valid_1's rmse: 7.21582
[100]	training's rmse: 6.82372	valid_1's rmse: 6.9549
[150]	training's rmse: 6.63726	valid_1's rmse: 6.82102
[200]	training's rmse: 6.4998	valid_1's rmse: 6.73171
[250]	training's rmse: 6.38862	valid_1's rmse: 6.66664
[300]	training's rmse: 6.2924	valid_1's rmse: 6.62009
[350]	training's rmse: 6.20667	valid_1's rmse: 6.57454
[400]	training's rmse: 6.12872	valid_1's rmse: 6.53721
[450]	training's rmse: 6.05685	valid_1's rmse: 6.50609
[500]	training's rmse: 5.98803	valid_1's rmse: 6.47827
[550]	training's rmse: 5.92074	valid_1's rmse: 6.44804
[600]	training's rmse: 5.85986	valid_1's rmse: 6.42755
[650]	training's rmse: 5.79921	valid_1's rmse: 6.40353
[700]	training's rmse: 5.74222	valid_1's rmse: 6.38549
[750]	training's rmse: 5.68775	valid_1's rmse: 6.36416
[800]	training's rmse: 5.63764	valid_1's rmse: 6.34903
[850]	tra

[7200]	training's rmse: 2.83871	valid_1's rmse: 5.9385
[7250]	training's rmse: 2.82808	valid_1's rmse: 5.93804
[7300]	training's rmse: 2.81794	valid_1's rmse: 5.93809
[7350]	training's rmse: 2.80801	valid_1's rmse: 5.9376
[7400]	training's rmse: 2.798	valid_1's rmse: 5.93683
[7450]	training's rmse: 2.78885	valid_1's rmse: 5.93717
[7500]	training's rmse: 2.77914	valid_1's rmse: 5.93693
[7550]	training's rmse: 2.76938	valid_1's rmse: 5.93584
[7600]	training's rmse: 2.75992	valid_1's rmse: 5.93573
[7650]	training's rmse: 2.75102	valid_1's rmse: 5.93565
[7700]	training's rmse: 2.74215	valid_1's rmse: 5.93502
[7750]	training's rmse: 2.7332	valid_1's rmse: 5.93446
[7800]	training's rmse: 2.72379	valid_1's rmse: 5.934
[7850]	training's rmse: 2.71471	valid_1's rmse: 5.9334
[7900]	training's rmse: 2.70572	valid_1's rmse: 5.93296
[7950]	training's rmse: 2.69735	valid_1's rmse: 5.93173
[8000]	training's rmse: 2.68854	valid_1's rmse: 5.93156
Did not meet early stopping. Best iteration is:
[8000]	t

[LightGBM] [Info] Total Bins 196221
[LightGBM] [Info] Number of data points in the train set: 81623, number of used features: 786




[LightGBM] [Info] Start training from score 9.170246
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 7.16241	valid_1's rmse: 7.18803
[100]	training's rmse: 6.83579	valid_1's rmse: 6.91048
[150]	training's rmse: 6.6518	valid_1's rmse: 6.77142
[200]	training's rmse: 6.51427	valid_1's rmse: 6.67706
[250]	training's rmse: 6.40554	valid_1's rmse: 6.61535
[300]	training's rmse: 6.30597	valid_1's rmse: 6.56082
[350]	training's rmse: 6.21958	valid_1's rmse: 6.5162
[400]	training's rmse: 6.14002	valid_1's rmse: 6.4803
[450]	training's rmse: 6.06542	valid_1's rmse: 6.44491
[500]	training's rmse: 5.99462	valid_1's rmse: 6.41173
[550]	training's rmse: 5.92895	valid_1's rmse: 6.38833
[600]	training's rmse: 5.8686	valid_1's rmse: 6.36837
[650]	training's rmse: 5.81047	valid_1's rmse: 6.34591
[700]	training's rmse: 5.75227	valid_1's rmse: 6.32412
[750]	training's rmse: 5.69784	valid_1's rmse: 6.30488
[800]	training's rmse: 5.64605	valid_1's rmse: 6.28518
[850]	trai

[6950]	training's rmse: 2.88673	valid_1's rmse: 5.87186
[7000]	training's rmse: 2.87594	valid_1's rmse: 5.8715
[7050]	training's rmse: 2.86492	valid_1's rmse: 5.87079
[7100]	training's rmse: 2.85507	valid_1's rmse: 5.87071
[7150]	training's rmse: 2.84462	valid_1's rmse: 5.87044
[7200]	training's rmse: 2.83424	valid_1's rmse: 5.86994
[7250]	training's rmse: 2.82359	valid_1's rmse: 5.86899
[7300]	training's rmse: 2.81351	valid_1's rmse: 5.86759
[7350]	training's rmse: 2.80328	valid_1's rmse: 5.86739
[7400]	training's rmse: 2.79306	valid_1's rmse: 5.86638
[7450]	training's rmse: 2.78327	valid_1's rmse: 5.86513
[7500]	training's rmse: 2.77377	valid_1's rmse: 5.86538
[7550]	training's rmse: 2.76394	valid_1's rmse: 5.86491
[7600]	training's rmse: 2.75467	valid_1's rmse: 5.86445
[7650]	training's rmse: 2.7449	valid_1's rmse: 5.86444
[7700]	training's rmse: 2.73509	valid_1's rmse: 5.86328
[7750]	training's rmse: 2.72567	valid_1's rmse: 5.86264
[7800]	training's rmse: 2.71616	valid_1's rmse: 5.

## Evaluando

In [19]:
df_test      = pd.read_csv(os.path.join(DATA_PATH,'test_preprocessed.csv'))
df_test['target'] = -1
test_dataset = BNPParibasText(df_test,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_test = get_embedding(test_loader, model, 'cuda')
df_test[[f'emb_{i}' for i in range(emb_sentence_train.shape[1])]] = emb_sentence_test
df_test = apply_label_encoder(df_test,dict_le,drop_original = True, missing_new_cat = True)

HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))


Mode: Missing as new category
Applying Label Encoding:  label_states_en_brands
Applying Label Encoding:  label_states_en_categories
Applying Label Encoding:  label_states_en_characteristics
Applying Label Encoding:  label_states_en_expiration date
Applying Label Encoding:  label_states_en_general_complete
Applying Label Encoding:  label_states_en_ingredients
Applying Label Encoding:  label_pnns_groups_1
Applying Label Encoding:  label_pnns_groups_2
Applying Label Encoding:  label_states_en_packaging
Applying Label Encoding:  label_states_en_packaging-code-
Applying Label Encoding:  label_states_en_photo_upload
Applying Label Encoding:  label_states_en_photo_validate
Applying Label Encoding:  label_states_en_product name
Applying Label Encoding:  label_states_en_quantity


In [20]:
probs = 0
for i in models:
    probs = probs + (i.predict(df_test[feature_list]))
    
    print('fin_predict')
y_test_pred = probs/5.0
print(f'Real: ',math.sqrt(mean_squared_error(y_test_pred,df_test['Target'].values)))

fin_predict
fin_predict
fin_predict
fin_predict
fin_predict
Real:  5.748087686708285


In [21]:
y_submission['target'] = y_test_pred
y_submission.head()

Unnamed: 0_level_0,target
Index,Unnamed: 1_level_1
37320,14.072644
3913,20.0384
112180,10.424357
128820,12.515797
16037,19.302766


In [22]:
#Enviar los resultados
apiquery.submit_api(y_submission,
       competition_name='food',
        subname='test_v2', # Pueden cambiar esto sin problemas, poner el nombre que quieran.
        holdout_key='None',
        update_ldb=True,
        username="Insight ML - DD" # Poner el nombre de su equipo como un string. 
                                  # El mejor de los resultados dentro de sus envios es el que aparecera en la tabla de posiciones.
)

requests number 1
200
{'Date': 'Tue, 18 May 2021 20:58:56 GMT', 'Content-Type': 'application/json', 'Content-Length': '495', 'Connection': 'keep-alive', 'X-Request-ID': '9VDYQEXOTIL4RSGH', 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Methods': 'POST', 'Access-Control-Allow-Headers': 'authorization,content-type'}


{'competition_name': 'food',
 'file_path': 'none',
 'message': 'Submission validated.',
 'name': 'Insight ML - DD',
 'result_csv_file': 'test_v2',
 'score': 5.748294411988524,
 'score2': None,
 'score3': None,
 'sub_name': 'test_v2',
 'sub_uid': '8ff2732f-f618-4572-912d-bfd4d0799d1d',
 'submission_time': '2021/05/18, 20:58:56'}