In [3]:
import shutil
import apiquery
import pandas as pd
import sys
import seaborn as sns
import os
import torch
import numpy as np
import random
DATA_PATH = '../01.Data'
shutil.copy("apiquery_pyc.py", "apiquery.pyc")
module_path = "../src"
if module_path not in sys.path:
    sys.path.append(module_path)
    
from utils.training import *
from utils.encoding import *
from utils.utils import *
from models.models import Roberta_Model
from dataset.dataset import BNPParibasText
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
import math
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import math
import time
import matplotlib.pyplot as plt
import transformers
pd.set_option('display.max_rows',100 )


In [4]:
def get_embedding(data_loader, model, device):
    from tqdm.notebook import tqdm
    # Put the model in eval mode
    model.to(device)
    model.eval()
    # List for store final predictions
    final_predictions = []
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for b_idx, data in enumerate(tk0):
            for key,value in data.items():
                data[key] = value.to(device)
            predictions = model._embeddings(data['ids'],data['mask'])
            predictions = predictions.cpu()
            final_predictions.append(predictions)
    return np.vstack(final_predictions)

In [5]:
MAX_LENGTH   = 16
PRETRAINED   = 'roberta-base'
SEED         = 42

In [6]:
%%time
df_train     = pd.read_csv(os.path.join("../01.Data",'fold.csv'))
y_submission = pd.read_csv(os.path.join(DATA_PATH,'y_test_submission_example.tsv'), index_col='Index', encoding='utf-8', sep='\t')

CPU times: user 2.34 s, sys: 214 ms, total: 2.56 s
Wall time: 2.56 s


In [79]:
seed_everything(SEED)

In [80]:
%%time
COLUMN_NAME  = 'product_name' 
tokenizer     = transformers.RobertaTokenizer.from_pretrained(PRETRAINED)
train_dataset = BNPParibasText(df_train,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_train = get_embedding(train_loader, model, 'cuda')
df_train[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_train.shape[1])]] = emb_sentence_train

HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))


CPU times: user 1min 8s, sys: 17.7 s, total: 1min 25s
Wall time: 1min 25s


In [81]:
COLUMN_NAME  = 'ingredients_text' 
tokenizer     = transformers.RobertaTokenizer.from_pretrained(PRETRAINED)
train_dataset = BNPParibasText(df_train,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_train = get_embedding(train_loader, model, 'cuda')
df_train[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_train.shape[1])]] = emb_sentence_train

HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))




In [82]:
COLUMN_NAME  = 'countries_en' 
tokenizer     = transformers.RobertaTokenizer.from_pretrained(PRETRAINED)
train_dataset = BNPParibasText(df_train,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_train = get_embedding(train_loader, model, 'cuda')
df_train[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_train.shape[1])]] = emb_sentence_train

HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))




In [84]:
columns_modeling = ['additives_n','ingredients_from_palm_oil_n',
                    'ingredients_that_may_be_from_palm_oil_n','target',
                    'states_en_brands','states_en_categories','states_en_characteristics','states_en_expiration date',
                    'states_en_general_complete','states_en_ingredients','pnns_groups_1','pnns_groups_2',
                    'states_en_packaging','states_en_packaging-code-','states_en_photo_upload',
                    'states_en_photo_validate','states_en_product name','states_en_quantity','diff_t'] + [f'emb_product_name_{i}' for i in range(emb_sentence_train.shape[1])] + [f'emb_ingredients_text_{i}' for i in range(emb_sentence_train.shape[1])]+[f'emb_countries_en_{i}' for i in range(emb_sentence_train.shape[1])]
columns_label = df_train[columns_modeling].select_dtypes(include=['object']).columns.to_list()
print(columns_label)

['states_en_brands', 'states_en_categories', 'states_en_characteristics', 'states_en_expiration date', 'states_en_general_complete', 'states_en_ingredients', 'pnns_groups_1', 'pnns_groups_2', 'states_en_packaging', 'states_en_packaging-code-', 'states_en_photo_upload', 'states_en_photo_validate', 'states_en_product name', 'states_en_quantity']


In [85]:
df_train,dict_le = label_encoding(df_train,label_cols = columns_label, drop_original = True, missing_new_cat = True)

Mode: Missing as new category
Label Encoding:  label_states_en_brands
Label Encoding:  label_states_en_categories
Label Encoding:  label_states_en_characteristics
Label Encoding:  label_states_en_expiration date
Label Encoding:  label_states_en_general_complete
Label Encoding:  label_states_en_ingredients
Label Encoding:  label_pnns_groups_1
Label Encoding:  label_pnns_groups_2
Label Encoding:  label_states_en_packaging
Label Encoding:  label_states_en_packaging-code-
Label Encoding:  label_states_en_photo_upload
Label Encoding:  label_states_en_photo_validate
Label Encoding:  label_states_en_product name
Label Encoding:  label_states_en_quantity


In [86]:
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'rmse'},
        'num_leaves':12,
        'learning_rate': 0.08,
        "min_child_samples": 150,
        "max_depth" : 5,
        'feature_fraction':  0.5,
        "bagging_freq": 1,
        'bagging_fraction': 0.75,
        "is_unbalance" : False,
        'force_col_wise':True,
        'num_threads':18,
        #"scale_pos_weight":5 -> Generally  is the ratio of number of negative class to the positive class.
        'bagging_seed':42,
        'lambda_l1':1.5,
        'lambda_l2':1,
        'verbose': 1

}
cat_columns = [i for i in df_train.columns.to_list() if i.startswith('label_')]
columns_modeling_last = list(set(columns_modeling)-set(columns_label)) + ['fold'] + cat_columns 

In [87]:
results,models,importances,oof,feature_list = Training_Lightgbm(df_train[columns_modeling_last],params,fold_column = 'fold',target_column = 'target',cat_vars = cat_columns ,metric = 'RMSE',early_stopping = 200,max_boost_round = 8000)

Columns: ['emb_ingredients_text_456', 'emb_product_name_287', 'emb_ingredients_text_67', 'emb_product_name_525', 'emb_ingredients_text_511', 'emb_product_name_39', 'emb_countries_en_523', 'emb_ingredients_text_559', 'emb_ingredients_text_431', 'emb_countries_en_738', 'emb_product_name_718', 'emb_countries_en_225', 'emb_product_name_270', 'emb_countries_en_365', 'emb_product_name_155', 'emb_product_name_715', 'emb_countries_en_545', 'emb_countries_en_181', 'emb_product_name_103', 'emb_countries_en_367', 'emb_product_name_164', 'emb_ingredients_text_373', 'emb_ingredients_text_185', 'emb_product_name_83', 'emb_ingredients_text_65', 'emb_countries_en_63', 'emb_countries_en_139', 'emb_product_name_246', 'emb_ingredients_text_45', 'emb_ingredients_text_425', 'emb_countries_en_536', 'emb_product_name_162', 'emb_ingredients_text_368', 'emb_countries_en_322', 'emb_product_name_632', 'emb_product_name_315', 'emb_countries_en_54', 'emb_countries_en_102', 'emb_ingredients_text_556', 'emb_ingredie

[LightGBM] [Info] Total Bins 494507
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 2322




[LightGBM] [Info] Start training from score 9.171473
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.72899	valid_1's rmse: 6.76435
[100]	training's rmse: 6.31385	valid_1's rmse: 6.3863
[150]	training's rmse: 6.08837	valid_1's rmse: 6.20117
[200]	training's rmse: 5.92679	valid_1's rmse: 6.08004
[250]	training's rmse: 5.79935	valid_1's rmse: 5.99347
[300]	training's rmse: 5.69055	valid_1's rmse: 5.92113
[350]	training's rmse: 5.59584	valid_1's rmse: 5.86334
[400]	training's rmse: 5.50879	valid_1's rmse: 5.81379
[450]	training's rmse: 5.43124	valid_1's rmse: 5.77359
[500]	training's rmse: 5.35987	valid_1's rmse: 5.74011
[550]	training's rmse: 5.29092	valid_1's rmse: 5.70713
[600]	training's rmse: 5.22481	valid_1's rmse: 5.67737
[650]	training's rmse: 5.16425	valid_1's rmse: 5.65098
[700]	training's rmse: 5.10682	valid_1's rmse: 5.63053
[750]	training's rmse: 5.05203	valid_1's rmse: 5.61211
[800]	training's rmse: 4.99894	valid_1's rmse: 5.5945
[850]	tr

[4750]	training's rmse: 2.84922	valid_1's rmse: 5.20461
[4800]	training's rmse: 2.83328	valid_1's rmse: 5.20299
[4850]	training's rmse: 2.81673	valid_1's rmse: 5.20094
[4900]	training's rmse: 2.80043	valid_1's rmse: 5.20038
[4950]	training's rmse: 2.78465	valid_1's rmse: 5.19956
[5000]	training's rmse: 2.76879	valid_1's rmse: 5.1976
[5050]	training's rmse: 2.75347	valid_1's rmse: 5.19685
[5100]	training's rmse: 2.73802	valid_1's rmse: 5.19484
[5150]	training's rmse: 2.7226	valid_1's rmse: 5.19333
[5200]	training's rmse: 2.70803	valid_1's rmse: 5.19159
[5250]	training's rmse: 2.69304	valid_1's rmse: 5.19029
[5300]	training's rmse: 2.67805	valid_1's rmse: 5.18889
[5350]	training's rmse: 2.6626	valid_1's rmse: 5.18787
[5400]	training's rmse: 2.64763	valid_1's rmse: 5.18676
[5450]	training's rmse: 2.63327	valid_1's rmse: 5.18578
[5500]	training's rmse: 2.61917	valid_1's rmse: 5.18468
[5550]	training's rmse: 2.60479	valid_1's rmse: 5.18427
[5600]	training's rmse: 2.59042	valid_1's rmse: 5.1

[LightGBM] [Info] Total Bins 495441
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 2322




[LightGBM] [Info] Start training from score 9.169930
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.72035	valid_1's rmse: 6.7885
[100]	training's rmse: 6.30565	valid_1's rmse: 6.42523
[150]	training's rmse: 6.0789	valid_1's rmse: 6.24508
[200]	training's rmse: 5.92195	valid_1's rmse: 6.12965
[250]	training's rmse: 5.79573	valid_1's rmse: 6.04434
[300]	training's rmse: 5.68342	valid_1's rmse: 5.96743
[350]	training's rmse: 5.59008	valid_1's rmse: 5.91412
[400]	training's rmse: 5.50527	valid_1's rmse: 5.86611
[450]	training's rmse: 5.42573	valid_1's rmse: 5.82416
[500]	training's rmse: 5.35422	valid_1's rmse: 5.78806
[550]	training's rmse: 5.2865	valid_1's rmse: 5.75925
[600]	training's rmse: 5.22193	valid_1's rmse: 5.72949
[650]	training's rmse: 5.16126	valid_1's rmse: 5.70336
[700]	training's rmse: 5.1043	valid_1's rmse: 5.67758
[750]	training's rmse: 5.04766	valid_1's rmse: 5.6548
[800]	training's rmse: 4.99446	valid_1's rmse: 5.63369
[850]	train

[4000]	training's rmse: 3.11374	valid_1's rmse: 5.24833
[4050]	training's rmse: 3.09497	valid_1's rmse: 5.24644
[4100]	training's rmse: 3.07633	valid_1's rmse: 5.24441
[4150]	training's rmse: 3.05827	valid_1's rmse: 5.24204
[4200]	training's rmse: 3.03992	valid_1's rmse: 5.23957
[4250]	training's rmse: 3.0211	valid_1's rmse: 5.23706
[4300]	training's rmse: 3.003	valid_1's rmse: 5.23457
[4350]	training's rmse: 2.98466	valid_1's rmse: 5.2321
[4400]	training's rmse: 2.96714	valid_1's rmse: 5.23009
[4450]	training's rmse: 2.94901	valid_1's rmse: 5.2287
[4500]	training's rmse: 2.93188	valid_1's rmse: 5.22733
[4550]	training's rmse: 2.91465	valid_1's rmse: 5.22517
[4600]	training's rmse: 2.89765	valid_1's rmse: 5.22359
[4650]	training's rmse: 2.88095	valid_1's rmse: 5.22165
[4700]	training's rmse: 2.86485	valid_1's rmse: 5.22016
[4750]	training's rmse: 2.84845	valid_1's rmse: 5.21727
[4800]	training's rmse: 2.83239	valid_1's rmse: 5.2165
[4850]	training's rmse: 2.81616	valid_1's rmse: 5.2142

[LightGBM] [Info] Total Bins 497084
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 2322




[LightGBM] [Info] Start training from score 9.171253
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.73679	valid_1's rmse: 6.78169
[100]	training's rmse: 6.32076	valid_1's rmse: 6.40293
[150]	training's rmse: 6.09532	valid_1's rmse: 6.20922
[200]	training's rmse: 5.93748	valid_1's rmse: 6.09446
[250]	training's rmse: 5.80764	valid_1's rmse: 6.00132
[300]	training's rmse: 5.69721	valid_1's rmse: 5.928
[350]	training's rmse: 5.60006	valid_1's rmse: 5.87067
[400]	training's rmse: 5.51372	valid_1's rmse: 5.81982
[450]	training's rmse: 5.43462	valid_1's rmse: 5.77924
[500]	training's rmse: 5.36101	valid_1's rmse: 5.74147
[550]	training's rmse: 5.29367	valid_1's rmse: 5.71045
[600]	training's rmse: 5.22851	valid_1's rmse: 5.68469
[650]	training's rmse: 5.16815	valid_1's rmse: 5.65702
[700]	training's rmse: 5.11019	valid_1's rmse: 5.63166
[750]	training's rmse: 5.05694	valid_1's rmse: 5.60968
[800]	training's rmse: 5.00454	valid_1's rmse: 5.5936
[850]	tra

[4600]	training's rmse: 2.89546	valid_1's rmse: 5.20529
[4650]	training's rmse: 2.87835	valid_1's rmse: 5.20433
[4700]	training's rmse: 2.86112	valid_1's rmse: 5.20264
[4750]	training's rmse: 2.84368	valid_1's rmse: 5.20012
[4800]	training's rmse: 2.8279	valid_1's rmse: 5.19898
[4850]	training's rmse: 2.81178	valid_1's rmse: 5.19741
[4900]	training's rmse: 2.79627	valid_1's rmse: 5.19712
[4950]	training's rmse: 2.78015	valid_1's rmse: 5.19673
[5000]	training's rmse: 2.76451	valid_1's rmse: 5.19523
[5050]	training's rmse: 2.74892	valid_1's rmse: 5.19385
[5100]	training's rmse: 2.7328	valid_1's rmse: 5.19249
[5150]	training's rmse: 2.71769	valid_1's rmse: 5.19218
[5200]	training's rmse: 2.70187	valid_1's rmse: 5.19085
[5250]	training's rmse: 2.68757	valid_1's rmse: 5.18922
[5300]	training's rmse: 2.67305	valid_1's rmse: 5.18813
[5350]	training's rmse: 2.6575	valid_1's rmse: 5.1868
[5400]	training's rmse: 2.64223	valid_1's rmse: 5.18598
[5450]	training's rmse: 2.62756	valid_1's rmse: 5.18

[LightGBM] [Info] Total Bins 495744
[LightGBM] [Info] Number of data points in the train set: 81623, number of used features: 2322




[LightGBM] [Info] Start training from score 9.170344
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.72413	valid_1's rmse: 6.77784
[100]	training's rmse: 6.30645	valid_1's rmse: 6.41832
[150]	training's rmse: 6.08111	valid_1's rmse: 6.24339
[200]	training's rmse: 5.92402	valid_1's rmse: 6.13848
[250]	training's rmse: 5.79266	valid_1's rmse: 6.05052
[300]	training's rmse: 5.68208	valid_1's rmse: 5.98297
[350]	training's rmse: 5.58738	valid_1's rmse: 5.92797
[400]	training's rmse: 5.50062	valid_1's rmse: 5.87804
[450]	training's rmse: 5.423	valid_1's rmse: 5.83953
[500]	training's rmse: 5.35138	valid_1's rmse: 5.80363
[550]	training's rmse: 5.28363	valid_1's rmse: 5.77328
[600]	training's rmse: 5.22283	valid_1's rmse: 5.75067
[650]	training's rmse: 5.16047	valid_1's rmse: 5.72482
[700]	training's rmse: 5.10282	valid_1's rmse: 5.7015
[750]	training's rmse: 5.04729	valid_1's rmse: 5.68163
[800]	training's rmse: 4.99345	valid_1's rmse: 5.66194
[850]	tra

[5100]	training's rmse: 2.7337	valid_1's rmse: 5.2479
[5150]	training's rmse: 2.71795	valid_1's rmse: 5.24665
[5200]	training's rmse: 2.70266	valid_1's rmse: 5.24621
[5250]	training's rmse: 2.68747	valid_1's rmse: 5.2445
[5300]	training's rmse: 2.67233	valid_1's rmse: 5.24272
[5350]	training's rmse: 2.65727	valid_1's rmse: 5.24171
[5400]	training's rmse: 2.64216	valid_1's rmse: 5.24039
[5450]	training's rmse: 2.62726	valid_1's rmse: 5.24029
[5500]	training's rmse: 2.61295	valid_1's rmse: 5.23914
[5550]	training's rmse: 2.59833	valid_1's rmse: 5.23787
[5600]	training's rmse: 2.58417	valid_1's rmse: 5.23669
[5650]	training's rmse: 2.57024	valid_1's rmse: 5.23455
[5700]	training's rmse: 2.55594	valid_1's rmse: 5.23247
[5750]	training's rmse: 2.54246	valid_1's rmse: 5.23104
[5800]	training's rmse: 2.52881	valid_1's rmse: 5.22972
[5850]	training's rmse: 2.51522	valid_1's rmse: 5.2284
[5900]	training's rmse: 2.5017	valid_1's rmse: 5.22726
[5950]	training's rmse: 2.48821	valid_1's rmse: 5.226

[LightGBM] [Info] Total Bins 495776
[LightGBM] [Info] Number of data points in the train set: 81623, number of used features: 2322




[LightGBM] [Info] Start training from score 9.170246
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.73085	valid_1's rmse: 6.7972
[100]	training's rmse: 6.31549	valid_1's rmse: 6.43203
[150]	training's rmse: 6.0899	valid_1's rmse: 6.25521
[200]	training's rmse: 5.92949	valid_1's rmse: 6.13825
[250]	training's rmse: 5.79925	valid_1's rmse: 6.05134
[300]	training's rmse: 5.68657	valid_1's rmse: 5.97698
[350]	training's rmse: 5.58863	valid_1's rmse: 5.91801
[400]	training's rmse: 5.50493	valid_1's rmse: 5.87441
[450]	training's rmse: 5.42755	valid_1's rmse: 5.83237
[500]	training's rmse: 5.35435	valid_1's rmse: 5.79592
[550]	training's rmse: 5.28644	valid_1's rmse: 5.76374
[600]	training's rmse: 5.22006	valid_1's rmse: 5.73284
[650]	training's rmse: 5.15795	valid_1's rmse: 5.70559
[700]	training's rmse: 5.10029	valid_1's rmse: 5.68495
[750]	training's rmse: 5.04548	valid_1's rmse: 5.66164
[800]	training's rmse: 4.99213	valid_1's rmse: 5.64254
[850]	tr

[5000]	training's rmse: 2.76282	valid_1's rmse: 5.21858
[5050]	training's rmse: 2.74778	valid_1's rmse: 5.21762
[5100]	training's rmse: 2.7321	valid_1's rmse: 5.21686
[5150]	training's rmse: 2.71664	valid_1's rmse: 5.21521
[5200]	training's rmse: 2.70158	valid_1's rmse: 5.21519
[5250]	training's rmse: 2.68744	valid_1's rmse: 5.21407
[5300]	training's rmse: 2.6727	valid_1's rmse: 5.21195
[5350]	training's rmse: 2.65807	valid_1's rmse: 5.21085
[5400]	training's rmse: 2.64292	valid_1's rmse: 5.20986
[5450]	training's rmse: 2.62837	valid_1's rmse: 5.20898
[5500]	training's rmse: 2.61375	valid_1's rmse: 5.20795
[5550]	training's rmse: 2.59951	valid_1's rmse: 5.20683
[5600]	training's rmse: 2.58555	valid_1's rmse: 5.20553
[5650]	training's rmse: 2.57074	valid_1's rmse: 5.20438
[5700]	training's rmse: 2.55698	valid_1's rmse: 5.20326
[5750]	training's rmse: 2.54271	valid_1's rmse: 5.2019
[5800]	training's rmse: 2.52884	valid_1's rmse: 5.20044
[5850]	training's rmse: 2.51448	valid_1's rmse: 5.1

## Evaluando

In [88]:
df_test      = pd.read_csv(os.path.join(DATA_PATH,'test_preprocessed.csv'))
df_test['target'] = -1

COLUMN_NAME  = 'product_name' 

test_dataset = BNPParibasText(df_test,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_test = get_embedding(test_loader, model, 'cuda')
df_test[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_test.shape[1])]] = emb_sentence_test


COLUMN_NAME  = 'ingredients_text' 

test_dataset = BNPParibasText(df_test,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_test = get_embedding(test_loader, model, 'cuda')
df_test[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_test.shape[1])]] = emb_sentence_test

COLUMN_NAME  = 'countries_en' 

test_dataset = BNPParibasText(df_test,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_test = get_embedding(test_loader, model, 'cuda')
df_test[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_test.shape[1])]] = emb_sentence_test

df_test = apply_label_encoder(df_test,dict_le,drop_original = True, missing_new_cat = True)

HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))


Mode: Missing as new category
Applying Label Encoding:  label_states_en_brands
Applying Label Encoding:  label_states_en_categories
Applying Label Encoding:  label_states_en_characteristics
Applying Label Encoding:  label_states_en_expiration date
Applying Label Encoding:  label_states_en_general_complete
Applying Label Encoding:  label_states_en_ingredients
Applying Label Encoding:  label_pnns_groups_1
Applying Label Encoding:  label_pnns_groups_2
Applying Label Encoding:  label_states_en_packaging
Applying Label Encoding:  label_states_en_packaging-code-
Applying Label Encoding:  label_states_en_photo_upload
Applying Label Encoding:  label_states_en_photo_validate
Applying Label Encoding:  label_states_en_product name
Applying Label Encoding:  label_states_en_quantity


In [89]:
probs = 0
for i in models:
    probs = probs + (i.predict(df_test[feature_list]))
    print('fin_predict')
y_test_pred = probs/5.0
print(f'Real: ',math.sqrt(mean_squared_error(y_test_pred,df_test['Target'].values)))

fin_predict
fin_predict
fin_predict
fin_predict
fin_predict
Real:  5.0183674573216654


In [None]:
y_submission['target'] = y_test_pred
y_submission.head()

In [22]:
#Enviar los resultados
#apiquery.submit_api(y_submission,
#       competition_name='food',
#        subname='test_v2', # Pueden cambiar esto sin problemas, poner el nombre que quieran.
#        holdout_key='None',
#        update_ldb=True,
#        username="Insight ML - DD" # Poner el nombre de su equipo como un string. 
                                  # El mejor de los resultados dentro de sus envios es el que aparecera en la tabla de posiciones.
#)

requests number 1
200
{'Date': 'Tue, 18 May 2021 20:58:56 GMT', 'Content-Type': 'application/json', 'Content-Length': '495', 'Connection': 'keep-alive', 'X-Request-ID': '9VDYQEXOTIL4RSGH', 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Methods': 'POST', 'Access-Control-Allow-Headers': 'authorization,content-type'}


{'competition_name': 'food',
 'file_path': 'none',
 'message': 'Submission validated.',
 'name': 'Insight ML - DD',
 'result_csv_file': 'test_v2',
 'score': 5.748294411988524,
 'score2': None,
 'score3': None,
 'sub_name': 'test_v2',
 'sub_uid': '8ff2732f-f618-4572-912d-bfd4d0799d1d',
 'submission_time': '2021/05/18, 20:58:56'}