In [1]:
import shutil
import apiquery
import pandas as pd
import sys
import seaborn as sns
import os
import torch
import numpy as np
import random
DATA_PATH = '../01.Data'
shutil.copy("apiquery_pyc.py", "apiquery.pyc")
module_path = "../src"
if module_path not in sys.path:
    sys.path.append(module_path)
    
from utils.training import *
from utils.encoding import *
from utils.utils import *
from models.models import XLMRoberta
from dataset.dataset import BNPParibasText
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
import math
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import math
import time
import matplotlib.pyplot as plt
import transformers
pd.set_option('display.max_rows',100 )


In [2]:
def get_embedding(data_loader, model, device):
    from tqdm.notebook import tqdm
    # Put the model in eval mode
    model.to(device)
    model.eval()
    # List for store final predictions
    final_predictions = []
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for b_idx, data in enumerate(tk0):
            for key,value in data.items():
                data[key] = value.to(device)
            predictions = model._embeddings(data['ids'],data['mask'])
            predictions = predictions.cpu()
            final_predictions.append(predictions)
    return np.vstack(final_predictions)

In [3]:
MAX_LENGTH   = 16
PRETRAINED   = 'xlm-roberta-base'
SEED         = 42

In [4]:
%%time
df_train     = pd.read_csv(os.path.join("../01.Data",'fold.csv'))
y_submission = pd.read_csv(os.path.join(DATA_PATH,'y_test_submission_example.tsv'), index_col='Index', encoding='utf-8', sep='\t')

CPU times: user 2.22 s, sys: 227 ms, total: 2.44 s
Wall time: 2.45 s


In [5]:
seed_everything(SEED)

In [6]:
%%time
COLUMN_NAME  = 'product_name' 
tokenizer     = transformers.XLMRobertaTokenizer.from_pretrained(PRETRAINED)
train_dataset = BNPParibasText(df_train,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = XLMRoberta(pretrained_model=PRETRAINED)
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_train = get_embedding(train_loader, model, 'cuda')
df_train[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_train.shape[1])]] = emb_sentence_train

HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))


CPU times: user 1min 45s, sys: 14.5 s, total: 1min 59s
Wall time: 1min 37s


In [7]:
COLUMN_NAME  = 'ingredients_text' 
tokenizer     = transformers.XLMRobertaTokenizer.from_pretrained(PRETRAINED)
train_dataset = BNPParibasText(df_train,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = XLMRoberta(pretrained_model=PRETRAINED)
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_train = get_embedding(train_loader, model, 'cuda')
df_train[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_train.shape[1])]] = emb_sentence_train

HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))




In [8]:
COLUMN_NAME  = 'brands_tags' 
tokenizer     = transformers.XLMRobertaTokenizer.from_pretrained(PRETRAINED)
train_dataset = BNPParibasText(df_train,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = XLMRoberta(pretrained_model=PRETRAINED)
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_train = get_embedding(train_loader, model, 'cuda')
df_train[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_train.shape[1])]] = emb_sentence_train

HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))




In [9]:
columns_modeling = ['additives_n','ingredients_from_palm_oil_n',
                    'ingredients_that_may_be_from_palm_oil_n','target',
                    'states_en_brands','states_en_categories','states_en_characteristics','states_en_expiration date',
                    'states_en_general_complete','states_en_ingredients','pnns_groups_1','pnns_groups_2',
                    'states_en_packaging','states_en_packaging-code-','states_en_photo_upload',
                    'states_en_photo_validate','states_en_product name','states_en_quantity','diff_t'] + [f'emb_product_name_{i}' for i in range(emb_sentence_train.shape[1])] + [f'emb_ingredients_text_{i}' for i in range(emb_sentence_train.shape[1])]+[f'emb_brands_tags_{i}' for i in range(emb_sentence_train.shape[1])]
columns_label = df_train[columns_modeling].select_dtypes(include=['object']).columns.to_list()
print(columns_label)

['states_en_brands', 'states_en_categories', 'states_en_characteristics', 'states_en_expiration date', 'states_en_general_complete', 'states_en_ingredients', 'pnns_groups_1', 'pnns_groups_2', 'states_en_packaging', 'states_en_packaging-code-', 'states_en_photo_upload', 'states_en_photo_validate', 'states_en_product name', 'states_en_quantity']


In [10]:
df_train,dict_le = label_encoding(df_train,label_cols = columns_label, drop_original = True, missing_new_cat = True)

Mode: Missing as new category
Label Encoding:  label_states_en_brands
Label Encoding:  label_states_en_categories
Label Encoding:  label_states_en_characteristics
Label Encoding:  label_states_en_expiration date
Label Encoding:  label_states_en_general_complete
Label Encoding:  label_states_en_ingredients
Label Encoding:  label_pnns_groups_1
Label Encoding:  label_pnns_groups_2
Label Encoding:  label_states_en_packaging
Label Encoding:  label_states_en_packaging-code-
Label Encoding:  label_states_en_photo_upload
Label Encoding:  label_states_en_photo_validate
Label Encoding:  label_states_en_product name
Label Encoding:  label_states_en_quantity


In [11]:
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'rmse'},
        'num_leaves':12,
        'learning_rate': 0.08,
        "min_child_samples": 150,
        "max_depth" : 5,
        'feature_fraction':  0.5,
        "bagging_freq": 1,
        'bagging_fraction': 0.75,
        "is_unbalance" : False,
        'force_col_wise':True,
        'num_threads':18,
        #"scale_pos_weight":5 -> Generally  is the ratio of number of negative class to the positive class.
        'bagging_seed':42,
        'lambda_l1':1.5,
        'lambda_l2':1,
        'verbose': 1

}
cat_columns = [i for i in df_train.columns.to_list() if i.startswith('label_')]
columns_modeling_last = list(set(columns_modeling)-set(columns_label)) + ['fold'] + cat_columns 

In [12]:
results,models,importances,oof,feature_list = Training_Lightgbm(df_train[columns_modeling_last],params,fold_column = 'fold',target_column = 'target',cat_vars = cat_columns ,metric = 'RMSE',early_stopping = 200,max_boost_round = 8000)

Columns: ['emb_product_name_250', 'emb_product_name_41', 'emb_product_name_633', 'emb_ingredients_text_275', 'emb_brands_tags_563', 'emb_brands_tags_650', 'emb_ingredients_text_685', 'emb_ingredients_text_285', 'emb_ingredients_text_84', 'emb_ingredients_text_356', 'emb_brands_tags_494', 'emb_ingredients_text_102', 'emb_ingredients_text_71', 'emb_brands_tags_619', 'emb_ingredients_text_591', 'emb_ingredients_text_409', 'emb_product_name_550', 'emb_ingredients_text_351', 'emb_ingredients_text_122', 'emb_ingredients_text_100', 'emb_brands_tags_267', 'emb_ingredients_text_47', 'emb_brands_tags_300', 'emb_product_name_422', 'emb_brands_tags_432', 'emb_ingredients_text_761', 'emb_product_name_291', 'emb_product_name_599', 'emb_ingredients_text_211', 'emb_brands_tags_628', 'emb_ingredients_text_180', 'emb_brands_tags_324', 'emb_brands_tags_642', 'emb_ingredients_text_33', 'emb_brands_tags_537', 'emb_brands_tags_349', 'emb_product_name_125', 'emb_product_name_429', 'emb_ingredients_text_246',

[LightGBM] [Info] Total Bins 587901
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 2322




[LightGBM] [Info] Start training from score 9.171473
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.91828	valid_1's rmse: 6.97418
[100]	training's rmse: 6.51149	valid_1's rmse: 6.60104
[150]	training's rmse: 6.2683	valid_1's rmse: 6.39501
[200]	training's rmse: 6.09244	valid_1's rmse: 6.25838
[250]	training's rmse: 5.94772	valid_1's rmse: 6.15288
[300]	training's rmse: 5.82874	valid_1's rmse: 6.07191
[350]	training's rmse: 5.72272	valid_1's rmse: 6.0028
[400]	training's rmse: 5.62702	valid_1's rmse: 5.94577
[450]	training's rmse: 5.54017	valid_1's rmse: 5.89683
[500]	training's rmse: 5.4614	valid_1's rmse: 5.85434
[550]	training's rmse: 5.38585	valid_1's rmse: 5.81381
[600]	training's rmse: 5.31384	valid_1's rmse: 5.77972
[650]	training's rmse: 5.246	valid_1's rmse: 5.75121
[700]	training's rmse: 5.18468	valid_1's rmse: 5.72624
[750]	training's rmse: 5.1257	valid_1's rmse: 5.70504
[800]	training's rmse: 5.06844	valid_1's rmse: 5.68313
[850]	traini

[3750]	training's rmse: 3.14277	valid_1's rmse: 5.26926
[3800]	training's rmse: 3.12138	valid_1's rmse: 5.2669
[3850]	training's rmse: 3.10015	valid_1's rmse: 5.26501
[3900]	training's rmse: 3.07946	valid_1's rmse: 5.26224
[3950]	training's rmse: 3.05883	valid_1's rmse: 5.26034
[4000]	training's rmse: 3.0383	valid_1's rmse: 5.25789
[4050]	training's rmse: 3.0189	valid_1's rmse: 5.2556
[4100]	training's rmse: 2.9986	valid_1's rmse: 5.2534
[4150]	training's rmse: 2.97846	valid_1's rmse: 5.25262
[4200]	training's rmse: 2.95917	valid_1's rmse: 5.25048
[4250]	training's rmse: 2.9399	valid_1's rmse: 5.24814
[4300]	training's rmse: 2.92034	valid_1's rmse: 5.24634
[4350]	training's rmse: 2.90058	valid_1's rmse: 5.24425
[4400]	training's rmse: 2.88147	valid_1's rmse: 5.24169
[4450]	training's rmse: 2.8626	valid_1's rmse: 5.24034
[4500]	training's rmse: 2.84385	valid_1's rmse: 5.23823
[4550]	training's rmse: 2.82606	valid_1's rmse: 5.23623
[4600]	training's rmse: 2.80801	valid_1's rmse: 5.23539


[LightGBM] [Info] Total Bins 587901
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 2322




[LightGBM] [Info] Start training from score 9.169930
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.91068	valid_1's rmse: 6.98658
[100]	training's rmse: 6.50595	valid_1's rmse: 6.63667
[150]	training's rmse: 6.26724	valid_1's rmse: 6.44882
[200]	training's rmse: 6.08576	valid_1's rmse: 6.30903
[250]	training's rmse: 5.94276	valid_1's rmse: 6.21092
[300]	training's rmse: 5.81841	valid_1's rmse: 6.12679
[350]	training's rmse: 5.71041	valid_1's rmse: 6.05741
[400]	training's rmse: 5.61615	valid_1's rmse: 6.00038
[450]	training's rmse: 5.52954	valid_1's rmse: 5.95351
[500]	training's rmse: 5.45027	valid_1's rmse: 5.9097
[550]	training's rmse: 5.37382	valid_1's rmse: 5.87267
[600]	training's rmse: 5.30474	valid_1's rmse: 5.83831
[650]	training's rmse: 5.23918	valid_1's rmse: 5.80824
[700]	training's rmse: 5.17563	valid_1's rmse: 5.77955
[750]	training's rmse: 5.11413	valid_1's rmse: 5.75179
[800]	training's rmse: 5.05788	valid_1's rmse: 5.73154
[850]	t

[3100]	training's rmse: 3.43875	valid_1's rmse: 5.35014
[3150]	training's rmse: 3.41453	valid_1's rmse: 5.34751
[3200]	training's rmse: 3.38989	valid_1's rmse: 5.34379
[3250]	training's rmse: 3.36644	valid_1's rmse: 5.34039
[3300]	training's rmse: 3.34288	valid_1's rmse: 5.33636
[3350]	training's rmse: 3.31937	valid_1's rmse: 5.3328
[3400]	training's rmse: 3.29649	valid_1's rmse: 5.33021
[3450]	training's rmse: 3.27421	valid_1's rmse: 5.32693
[3500]	training's rmse: 3.25174	valid_1's rmse: 5.32295
[3550]	training's rmse: 3.22919	valid_1's rmse: 5.32031
[3600]	training's rmse: 3.20674	valid_1's rmse: 5.31728
[3650]	training's rmse: 3.1846	valid_1's rmse: 5.31457
[3700]	training's rmse: 3.16319	valid_1's rmse: 5.31131
[3750]	training's rmse: 3.14124	valid_1's rmse: 5.30849
[3800]	training's rmse: 3.11956	valid_1's rmse: 5.30572
[3850]	training's rmse: 3.09869	valid_1's rmse: 5.3033
[3900]	training's rmse: 3.07675	valid_1's rmse: 5.301
[3950]	training's rmse: 3.05543	valid_1's rmse: 5.298

[LightGBM] [Info] Total Bins 587901
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 2322




[LightGBM] [Info] Start training from score 9.171253
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.91366	valid_1's rmse: 6.95196
[100]	training's rmse: 6.51201	valid_1's rmse: 6.60358
[150]	training's rmse: 6.27279	valid_1's rmse: 6.41217
[200]	training's rmse: 6.09241	valid_1's rmse: 6.27678
[250]	training's rmse: 5.94543	valid_1's rmse: 6.17076
[300]	training's rmse: 5.82641	valid_1's rmse: 6.0931
[350]	training's rmse: 5.71883	valid_1's rmse: 6.02454
[400]	training's rmse: 5.62296	valid_1's rmse: 5.96682
[450]	training's rmse: 5.53534	valid_1's rmse: 5.91633
[500]	training's rmse: 5.4527	valid_1's rmse: 5.87068
[550]	training's rmse: 5.37884	valid_1's rmse: 5.83476
[600]	training's rmse: 5.30903	valid_1's rmse: 5.80339
[650]	training's rmse: 5.24383	valid_1's rmse: 5.77433
[700]	training's rmse: 5.18163	valid_1's rmse: 5.74628
[750]	training's rmse: 5.12364	valid_1's rmse: 5.7227
[800]	training's rmse: 5.06656	valid_1's rmse: 5.69922
[850]	tra

[3750]	training's rmse: 3.13982	valid_1's rmse: 5.27128
[3800]	training's rmse: 3.11856	valid_1's rmse: 5.26763
[3850]	training's rmse: 3.09784	valid_1's rmse: 5.26425
[3900]	training's rmse: 3.07695	valid_1's rmse: 5.26175
[3950]	training's rmse: 3.05721	valid_1's rmse: 5.25985
[4000]	training's rmse: 3.03612	valid_1's rmse: 5.2584
[4050]	training's rmse: 3.01604	valid_1's rmse: 5.25541
[4100]	training's rmse: 2.99589	valid_1's rmse: 5.25298
[4150]	training's rmse: 2.97584	valid_1's rmse: 5.25103
[4200]	training's rmse: 2.95582	valid_1's rmse: 5.24943
[4250]	training's rmse: 2.93614	valid_1's rmse: 5.24708
[4300]	training's rmse: 2.91664	valid_1's rmse: 5.24523
[4350]	training's rmse: 2.89881	valid_1's rmse: 5.24314
[4400]	training's rmse: 2.87906	valid_1's rmse: 5.24138
[4450]	training's rmse: 2.86002	valid_1's rmse: 5.23954
[4500]	training's rmse: 2.84109	valid_1's rmse: 5.23772
[4550]	training's rmse: 2.82343	valid_1's rmse: 5.23505
[4600]	training's rmse: 2.80435	valid_1's rmse: 5

[LightGBM] [Info] Total Bins 587901
[LightGBM] [Info] Number of data points in the train set: 81623, number of used features: 2322




[LightGBM] [Info] Start training from score 9.170344
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.90086	valid_1's rmse: 6.96241
[100]	training's rmse: 6.49625	valid_1's rmse: 6.61451
[150]	training's rmse: 6.25324	valid_1's rmse: 6.4258
[200]	training's rmse: 6.07085	valid_1's rmse: 6.29185
[250]	training's rmse: 5.92855	valid_1's rmse: 6.19471
[300]	training's rmse: 5.80954	valid_1's rmse: 6.12087
[350]	training's rmse: 5.69966	valid_1's rmse: 6.05563
[400]	training's rmse: 5.60461	valid_1's rmse: 5.99785
[450]	training's rmse: 5.51656	valid_1's rmse: 5.95183
[500]	training's rmse: 5.43727	valid_1's rmse: 5.91228
[550]	training's rmse: 5.36293	valid_1's rmse: 5.87675
[600]	training's rmse: 5.29424	valid_1's rmse: 5.84633
[650]	training's rmse: 5.2283	valid_1's rmse: 5.82092
[700]	training's rmse: 5.16599	valid_1's rmse: 5.79588
[750]	training's rmse: 5.10598	valid_1's rmse: 5.7696
[800]	training's rmse: 5.04918	valid_1's rmse: 5.75135
[850]	tra

[3500]	training's rmse: 3.24141	valid_1's rmse: 5.36113
[3550]	training's rmse: 3.21879	valid_1's rmse: 5.35889
[3600]	training's rmse: 3.19709	valid_1's rmse: 5.35738
[3650]	training's rmse: 3.17566	valid_1's rmse: 5.3559
[3700]	training's rmse: 3.15375	valid_1's rmse: 5.35465
[3750]	training's rmse: 3.13195	valid_1's rmse: 5.35287
[3800]	training's rmse: 3.11076	valid_1's rmse: 5.34966
[3850]	training's rmse: 3.0901	valid_1's rmse: 5.34679
[3900]	training's rmse: 3.06947	valid_1's rmse: 5.3445
[3950]	training's rmse: 3.04928	valid_1's rmse: 5.34199
[4000]	training's rmse: 3.02814	valid_1's rmse: 5.33971
[4050]	training's rmse: 3.00745	valid_1's rmse: 5.33728
[4100]	training's rmse: 2.9877	valid_1's rmse: 5.33552
[4150]	training's rmse: 2.96816	valid_1's rmse: 5.33292
[4200]	training's rmse: 2.94847	valid_1's rmse: 5.33159
[4250]	training's rmse: 2.92916	valid_1's rmse: 5.33067
[4300]	training's rmse: 2.9098	valid_1's rmse: 5.32823
[4350]	training's rmse: 2.89049	valid_1's rmse: 5.326

[LightGBM] [Info] Total Bins 587901
[LightGBM] [Info] Number of data points in the train set: 81623, number of used features: 2322




[LightGBM] [Info] Start training from score 9.170246
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.91718	valid_1's rmse: 6.98047
[100]	training's rmse: 6.5087	valid_1's rmse: 6.62989
[150]	training's rmse: 6.26314	valid_1's rmse: 6.43861
[200]	training's rmse: 6.08208	valid_1's rmse: 6.30534
[250]	training's rmse: 5.93703	valid_1's rmse: 6.20659
[300]	training's rmse: 5.81831	valid_1's rmse: 6.12867
[350]	training's rmse: 5.71171	valid_1's rmse: 6.06152
[400]	training's rmse: 5.6158	valid_1's rmse: 6.00615
[450]	training's rmse: 5.52808	valid_1's rmse: 5.95885
[500]	training's rmse: 5.44974	valid_1's rmse: 5.91775
[550]	training's rmse: 5.3776	valid_1's rmse: 5.88233
[600]	training's rmse: 5.30564	valid_1's rmse: 5.84729
[650]	training's rmse: 5.23915	valid_1's rmse: 5.81645
[700]	training's rmse: 5.17669	valid_1's rmse: 5.7912
[750]	training's rmse: 5.1174	valid_1's rmse: 5.76539
[800]	training's rmse: 5.0585	valid_1's rmse: 5.74126
[850]	traini

[3400]	training's rmse: 3.2933	valid_1's rmse: 5.35808
[3450]	training's rmse: 3.2705	valid_1's rmse: 5.35527
[3500]	training's rmse: 3.24802	valid_1's rmse: 5.35239
[3550]	training's rmse: 3.22562	valid_1's rmse: 5.34902
[3600]	training's rmse: 3.2044	valid_1's rmse: 5.34613
[3650]	training's rmse: 3.1824	valid_1's rmse: 5.34457
[3700]	training's rmse: 3.16066	valid_1's rmse: 5.34167
[3750]	training's rmse: 3.13834	valid_1's rmse: 5.33946
[3800]	training's rmse: 3.11705	valid_1's rmse: 5.33579
[3850]	training's rmse: 3.09645	valid_1's rmse: 5.33228
[3900]	training's rmse: 3.07585	valid_1's rmse: 5.33056
[3950]	training's rmse: 3.05534	valid_1's rmse: 5.3287
[4000]	training's rmse: 3.03485	valid_1's rmse: 5.32679
[4050]	training's rmse: 3.01458	valid_1's rmse: 5.32445
[4100]	training's rmse: 2.9944	valid_1's rmse: 5.32164
[4150]	training's rmse: 2.97411	valid_1's rmse: 5.31925
[4200]	training's rmse: 2.95477	valid_1's rmse: 5.31799
[4250]	training's rmse: 2.93553	valid_1's rmse: 5.3164

## Evaluando

In [13]:
df_test      = pd.read_csv(os.path.join(DATA_PATH,'test_preprocessed.csv'))
df_test['target'] = -1

COLUMN_NAME  = 'product_name' 

test_dataset = BNPParibasText(df_test,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = XLMRoberta(pretrained_model=PRETRAINED)
test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_test = get_embedding(test_loader, model, 'cuda')
df_test[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_test.shape[1])]] = emb_sentence_test


COLUMN_NAME  = 'ingredients_text' 

test_dataset = BNPParibasText(df_test,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = XLMRoberta(pretrained_model=PRETRAINED)
test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_test = get_embedding(test_loader, model, 'cuda')
df_test[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_test.shape[1])]] = emb_sentence_test

COLUMN_NAME  = 'brands_tags' 

test_dataset = BNPParibasText(df_test,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = XLMRoberta(pretrained_model=PRETRAINED)
test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_test = get_embedding(test_loader, model, 'cuda')
df_test[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_test.shape[1])]] = emb_sentence_test

df_test = apply_label_encoder(df_test,dict_le,drop_original = True, missing_new_cat = True)

HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))


Mode: Missing as new category
Applying Label Encoding:  label_states_en_brands
Applying Label Encoding:  label_states_en_categories
Applying Label Encoding:  label_states_en_characteristics
Applying Label Encoding:  label_states_en_expiration date
Applying Label Encoding:  label_states_en_general_complete
Applying Label Encoding:  label_states_en_ingredients
Applying Label Encoding:  label_pnns_groups_1
Applying Label Encoding:  label_pnns_groups_2
Applying Label Encoding:  label_states_en_packaging
Applying Label Encoding:  label_states_en_packaging-code-
Applying Label Encoding:  label_states_en_photo_upload
Applying Label Encoding:  label_states_en_photo_validate
Applying Label Encoding:  label_states_en_product name
Applying Label Encoding:  label_states_en_quantity


In [37]:
probs = 0
for i in models:
    probs = probs + (i.predict(df_test[feature_list]))
    print('fin_predict')
y_test_pred = probs/5.0
print(f'Real: ',math.sqrt(mean_squared_error(y_test_pred,df_test['Target'].values)))

fin_predict
fin_predict
fin_predict
fin_predict
fin_predict
Real:  5.029986946556404


In [34]:
print(f'Real: ',math.sqrt(mean_squared_error(df_test['preds'],df_test['Target'].values)))

Real:  4.949090622538854


In [35]:
y_submission['target'] = df_test['preds'].values
y_submission.head()

Unnamed: 0_level_0,target
Index,Unnamed: 1_level_1
37320,14.0
3913,23.0
112180,8.0
128820,12.0
16037,22.0


In [36]:
#Enviar los resultados
apiquery.submit_api(y_submission,
       competition_name='food',
        subname='test_v8', # Pueden cambiar esto sin problemas, poner el nombre que quieran.
        holdout_key='None',
        update_ldb=True,
        username="Insight ML - DD" # Poner el nombre de su equipo como un string. 
                                  # El mejor de los resultados dentro de sus envios es el que aparecera en la tabla de posiciones.
)

requests number 1
200
{'Date': 'Thu, 20 May 2021 01:12:51 GMT', 'Content-Type': 'application/json', 'Content-Length': '496', 'Connection': 'keep-alive', 'X-Request-ID': '38TKR5SBNUC2OIEL', 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Methods': 'POST', 'Access-Control-Allow-Headers': 'authorization,content-type'}


{'competition_name': 'food',
 'file_path': 'none',
 'message': 'Submission validated.',
 'name': 'Insight ML - DD',
 'result_csv_file': 'test_v8',
 'score': 4.949135680861754,
 'score2': None,
 'score3': None,
 'sub_name': 'test_v8',
 'sub_uid': '3c742345-a4bd-4d05-9512-1c905b235010',
 'submission_time': '2021/05/20, 01:12:51'}