In [10]:
import shutil
import apiquery
import pandas as pd
import sys
import seaborn as sns
import os
import torch
import numpy as np
import random
DATA_PATH = '../01.Data'
shutil.copy("apiquery_pyc.py", "apiquery.pyc")
module_path = "../src"
if module_path not in sys.path:
    sys.path.append(module_path)
    
from utils.training import *
from utils.encoding import *
from utils.utils import *
from models.models import Roberta_Model
from dataset.dataset import BNPParibasText
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
import math
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import math
import time
import matplotlib.pyplot as plt
import transformers
pd.set_option('display.max_rows',100 )


In [11]:
def get_embedding(data_loader, model, device):
    from tqdm.notebook import tqdm
    # Put the model in eval mode
    model.to(device)
    model.eval()
    # List for store final predictions
    final_predictions = []
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for b_idx, data in enumerate(tk0):
            for key,value in data.items():
                data[key] = value.to(device)
            predictions = model._embeddings(data['ids'],data['mask'])
            predictions = predictions.cpu()
            final_predictions.append(predictions)
    return np.vstack(final_predictions)

In [12]:
MAX_LENGTH   = 16
PRETRAINED   = 'roberta-base'
SEED         = 42

In [13]:
%%time
df_train     = pd.read_csv(os.path.join("../01.Data",'fold.csv'))
y_submission = pd.read_csv(os.path.join(DATA_PATH,'y_test_submission_example.tsv'), index_col='Index', encoding='utf-8', sep='\t')

CPU times: user 2.42 s, sys: 237 ms, total: 2.65 s
Wall time: 2.65 s


In [14]:
seed_everything(SEED)

In [15]:
%%time
COLUMN_NAME  = 'product_name' 
tokenizer     = transformers.RobertaTokenizer.from_pretrained(PRETRAINED)
train_dataset = BNPParibasText(df_train,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_train = get_embedding(train_loader, model, 'cuda')
df_train[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_train.shape[1])]] = emb_sentence_train

HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))


CPU times: user 1min 29s, sys: 14.1 s, total: 1min 43s
Wall time: 1min 34s


In [16]:
COLUMN_NAME  = 'ingredients_text' 
tokenizer     = transformers.RobertaTokenizer.from_pretrained(PRETRAINED)
train_dataset = BNPParibasText(df_train,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_train = get_embedding(train_loader, model, 'cuda')
df_train[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_train.shape[1])]] = emb_sentence_train

HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))




In [17]:
COLUMN_NAME  = 'brands_tags' 
tokenizer     = transformers.RobertaTokenizer.from_pretrained(PRETRAINED)
train_dataset = BNPParibasText(df_train,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_train = get_embedding(train_loader, model, 'cuda')
df_train[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_train.shape[1])]] = emb_sentence_train

HBox(children=(FloatProgress(value=0.0, max=3189.0), HTML(value='')))




In [19]:
columns_modeling = ['additives_n','ingredients_from_palm_oil_n',
                    'ingredients_that_may_be_from_palm_oil_n','target',
                    'states_en_brands','states_en_categories','states_en_characteristics','states_en_expiration date',
                    'states_en_general_complete','states_en_ingredients','pnns_groups_1','pnns_groups_2',
                    'states_en_packaging','states_en_packaging-code-','states_en_photo_upload',
                    'states_en_photo_validate','states_en_product name','states_en_quantity','diff_t'] + [f'emb_product_name_{i}' for i in range(emb_sentence_train.shape[1])] + [f'emb_ingredients_text_{i}' for i in range(emb_sentence_train.shape[1])]+[f'emb_brands_tags_{i}' for i in range(emb_sentence_train.shape[1])]
columns_label = df_train[columns_modeling].select_dtypes(include=['object']).columns.to_list()
print(columns_label)

['states_en_brands', 'states_en_categories', 'states_en_characteristics', 'states_en_expiration date', 'states_en_general_complete', 'states_en_ingredients', 'pnns_groups_1', 'pnns_groups_2', 'states_en_packaging', 'states_en_packaging-code-', 'states_en_photo_upload', 'states_en_photo_validate', 'states_en_product name', 'states_en_quantity']


In [20]:
df_train,dict_le = label_encoding(df_train,label_cols = columns_label, drop_original = True, missing_new_cat = True)

Mode: Missing as new category
Label Encoding:  label_states_en_brands
Label Encoding:  label_states_en_categories
Label Encoding:  label_states_en_characteristics
Label Encoding:  label_states_en_expiration date
Label Encoding:  label_states_en_general_complete
Label Encoding:  label_states_en_ingredients
Label Encoding:  label_pnns_groups_1
Label Encoding:  label_pnns_groups_2
Label Encoding:  label_states_en_packaging
Label Encoding:  label_states_en_packaging-code-
Label Encoding:  label_states_en_photo_upload
Label Encoding:  label_states_en_photo_validate
Label Encoding:  label_states_en_product name
Label Encoding:  label_states_en_quantity


In [25]:
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'rmse'},
        'num_leaves':12,
        'learning_rate': 0.08,
        "min_child_samples": 150,
        "max_depth" : 5,
        'feature_fraction':  0.5,
        "bagging_freq": 1,
        'bagging_fraction': 0.75,
        "is_unbalance" : False,
        'force_col_wise':True,
        'num_threads':18,
        #"scale_pos_weight":5 -> Generally  is the ratio of number of negative class to the positive class.
        'bagging_seed':42,
        'lambda_l1':1.5,
        'lambda_l2':1,
        'verbose': 1

}
cat_columns = [i for i in df_train.columns.to_list() if i.startswith('label_')]
columns_modeling_last = list(set(columns_modeling)-set(columns_label)) + ['fold'] + cat_columns 

In [26]:
results,models,importances,oof,feature_list = Training_Lightgbm(df_train[columns_modeling_last],params,fold_column = 'fold',target_column = 'target',cat_vars = cat_columns ,metric = 'RMSE',early_stopping = 200,max_boost_round = 8000)

Columns: ['emb_product_name_516', 'emb_ingredients_text_444', 'emb_ingredients_text_648', 'emb_ingredients_text_157', 'emb_product_name_295', 'emb_ingredients_text_126', 'emb_ingredients_text_634', 'emb_ingredients_text_740', 'emb_brands_tags_27', 'emb_brands_tags_292', 'emb_brands_tags_764', 'emb_product_name_40', 'emb_ingredients_text_621', 'emb_product_name_616', 'emb_product_name_721', 'emb_ingredients_text_22', 'emb_brands_tags_738', 'emb_brands_tags_452', 'emb_ingredients_text_490', 'emb_ingredients_text_223', 'emb_product_name_369', 'emb_ingredients_text_5', 'emb_brands_tags_295', 'emb_ingredients_text_677', 'emb_product_name_493', 'emb_product_name_525', 'emb_product_name_655', 'emb_product_name_500', 'emb_product_name_704', 'emb_product_name_731', 'emb_brands_tags_264', 'emb_brands_tags_468', 'emb_product_name_558', 'emb_ingredients_text_315', 'emb_product_name_735', 'emb_brands_tags_178', 'emb_ingredients_text_512', 'emb_ingredients_text_556', 'emb_brands_tags_347', 'emb_prod

[LightGBM] [Info] Total Bins 587900
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 2322




[LightGBM] [Info] Start training from score 9.171473
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.7291	valid_1's rmse: 6.775
[100]	training's rmse: 6.31221	valid_1's rmse: 6.39453
[150]	training's rmse: 6.08458	valid_1's rmse: 6.20699
[200]	training's rmse: 5.91025	valid_1's rmse: 6.0778
[250]	training's rmse: 5.77522	valid_1's rmse: 5.98731
[300]	training's rmse: 5.66109	valid_1's rmse: 5.91398
[350]	training's rmse: 5.55914	valid_1's rmse: 5.85182
[400]	training's rmse: 5.46682	valid_1's rmse: 5.79701
[450]	training's rmse: 5.38092	valid_1's rmse: 5.75234
[500]	training's rmse: 5.30334	valid_1's rmse: 5.71681
[550]	training's rmse: 5.23249	valid_1's rmse: 5.68058
[600]	training's rmse: 5.16433	valid_1's rmse: 5.64823
[650]	training's rmse: 5.09986	valid_1's rmse: 5.61827
[700]	training's rmse: 5.03835	valid_1's rmse: 5.5915
[750]	training's rmse: 4.98063	valid_1's rmse: 5.56835
[800]	training's rmse: 4.9239	valid_1's rmse: 5.54487
[850]	traini

[4900]	training's rmse: 2.63506	valid_1's rmse: 5.11553
[4950]	training's rmse: 2.61863	valid_1's rmse: 5.11401
[5000]	training's rmse: 2.60177	valid_1's rmse: 5.11201
[5050]	training's rmse: 2.58548	valid_1's rmse: 5.11108
[5100]	training's rmse: 2.56936	valid_1's rmse: 5.10957
[5150]	training's rmse: 2.55416	valid_1's rmse: 5.10768
[5200]	training's rmse: 2.5382	valid_1's rmse: 5.10566
[5250]	training's rmse: 2.52265	valid_1's rmse: 5.1049
[5300]	training's rmse: 2.50675	valid_1's rmse: 5.10397
[5350]	training's rmse: 2.49125	valid_1's rmse: 5.10271
[5400]	training's rmse: 2.47593	valid_1's rmse: 5.10139
[5450]	training's rmse: 2.46077	valid_1's rmse: 5.09982
[5500]	training's rmse: 2.44581	valid_1's rmse: 5.09868
[5550]	training's rmse: 2.43082	valid_1's rmse: 5.09815
[5600]	training's rmse: 2.41599	valid_1's rmse: 5.09707
[5650]	training's rmse: 2.401	valid_1's rmse: 5.09596
[5700]	training's rmse: 2.38624	valid_1's rmse: 5.0952
[5750]	training's rmse: 2.3716	valid_1's rmse: 5.0930

[LightGBM] [Info] Total Bins 587901
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 2322




[LightGBM] [Info] Start training from score 9.169930
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.72702	valid_1's rmse: 6.79657
[100]	training's rmse: 6.30906	valid_1's rmse: 6.43716
[150]	training's rmse: 6.07297	valid_1's rmse: 6.24592
[200]	training's rmse: 5.90535	valid_1's rmse: 6.12289
[250]	training's rmse: 5.76952	valid_1's rmse: 6.02479
[300]	training's rmse: 5.64995	valid_1's rmse: 5.94357
[350]	training's rmse: 5.54855	valid_1's rmse: 5.88061
[400]	training's rmse: 5.45708	valid_1's rmse: 5.82661
[450]	training's rmse: 5.37225	valid_1's rmse: 5.77986
[500]	training's rmse: 5.2975	valid_1's rmse: 5.74139
[550]	training's rmse: 5.22522	valid_1's rmse: 5.70423
[600]	training's rmse: 5.15611	valid_1's rmse: 5.67158
[650]	training's rmse: 5.09178	valid_1's rmse: 5.63917
[700]	training's rmse: 5.02925	valid_1's rmse: 5.6134
[750]	training's rmse: 4.97144	valid_1's rmse: 5.59148
[800]	training's rmse: 4.91682	valid_1's rmse: 5.57179
[850]	tr

[4900]	training's rmse: 2.62872	valid_1's rmse: 5.12064
[4950]	training's rmse: 2.61273	valid_1's rmse: 5.11875
[5000]	training's rmse: 2.59634	valid_1's rmse: 5.11736
[5050]	training's rmse: 2.58043	valid_1's rmse: 5.11547
[5100]	training's rmse: 2.56451	valid_1's rmse: 5.11415
[5150]	training's rmse: 2.54874	valid_1's rmse: 5.11298
[5200]	training's rmse: 2.53261	valid_1's rmse: 5.11233
[5250]	training's rmse: 2.5166	valid_1's rmse: 5.11108
[5300]	training's rmse: 2.50075	valid_1's rmse: 5.1092
[5350]	training's rmse: 2.4854	valid_1's rmse: 5.10839
[5400]	training's rmse: 2.47005	valid_1's rmse: 5.10779
[5450]	training's rmse: 2.45485	valid_1's rmse: 5.10636
[5500]	training's rmse: 2.44021	valid_1's rmse: 5.10471
[5550]	training's rmse: 2.42473	valid_1's rmse: 5.1033
[5600]	training's rmse: 2.40994	valid_1's rmse: 5.10211
[5650]	training's rmse: 2.39553	valid_1's rmse: 5.10098
[5700]	training's rmse: 2.38076	valid_1's rmse: 5.09925
[5750]	training's rmse: 2.36607	valid_1's rmse: 5.09

[LightGBM] [Info] Total Bins 587900
[LightGBM] [Info] Number of data points in the train set: 81622, number of used features: 2322




[LightGBM] [Info] Start training from score 9.171253
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.73842	valid_1's rmse: 6.78689
[100]	training's rmse: 6.31725	valid_1's rmse: 6.40454
[150]	training's rmse: 6.08207	valid_1's rmse: 6.20498
[200]	training's rmse: 5.91475	valid_1's rmse: 6.07667
[250]	training's rmse: 5.78007	valid_1's rmse: 5.97895
[300]	training's rmse: 5.66232	valid_1's rmse: 5.90099
[350]	training's rmse: 5.55818	valid_1's rmse: 5.8349
[400]	training's rmse: 5.46537	valid_1's rmse: 5.78118
[450]	training's rmse: 5.38343	valid_1's rmse: 5.73701
[500]	training's rmse: 5.30389	valid_1's rmse: 5.69577
[550]	training's rmse: 5.23032	valid_1's rmse: 5.6595
[600]	training's rmse: 5.16482	valid_1's rmse: 5.62944
[650]	training's rmse: 5.10282	valid_1's rmse: 5.60563
[700]	training's rmse: 5.04123	valid_1's rmse: 5.58123
[750]	training's rmse: 4.98214	valid_1's rmse: 5.55687
[800]	training's rmse: 4.926	valid_1's rmse: 5.53771
[850]	trai

[4850]	training's rmse: 2.64556	valid_1's rmse: 5.10029
[4900]	training's rmse: 2.62882	valid_1's rmse: 5.09872
[4950]	training's rmse: 2.61213	valid_1's rmse: 5.09696
[5000]	training's rmse: 2.59532	valid_1's rmse: 5.09532
[5050]	training's rmse: 2.57955	valid_1's rmse: 5.09398
[5100]	training's rmse: 2.56322	valid_1's rmse: 5.09331
[5150]	training's rmse: 2.54765	valid_1's rmse: 5.09201
[5200]	training's rmse: 2.53181	valid_1's rmse: 5.09092
[5250]	training's rmse: 2.51625	valid_1's rmse: 5.08929
[5300]	training's rmse: 2.50066	valid_1's rmse: 5.08888
[5350]	training's rmse: 2.48479	valid_1's rmse: 5.08772
[5400]	training's rmse: 2.47009	valid_1's rmse: 5.08648
[5450]	training's rmse: 2.45543	valid_1's rmse: 5.08494
[5500]	training's rmse: 2.44057	valid_1's rmse: 5.08327
[5550]	training's rmse: 2.42539	valid_1's rmse: 5.08217
[5600]	training's rmse: 2.41035	valid_1's rmse: 5.08099
[5650]	training's rmse: 2.39567	valid_1's rmse: 5.08055
[5700]	training's rmse: 2.38147	valid_1's rmse: 

[LightGBM] [Info] Total Bins 587899
[LightGBM] [Info] Number of data points in the train set: 81623, number of used features: 2322




[LightGBM] [Info] Start training from score 9.170344
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.71866	valid_1's rmse: 6.78325
[100]	training's rmse: 6.29479	valid_1's rmse: 6.41511
[150]	training's rmse: 6.0606	valid_1's rmse: 6.2331
[200]	training's rmse: 5.89222	valid_1's rmse: 6.11175
[250]	training's rmse: 5.75481	valid_1's rmse: 6.01869
[300]	training's rmse: 5.63647	valid_1's rmse: 5.9438
[350]	training's rmse: 5.53383	valid_1's rmse: 5.88029
[400]	training's rmse: 5.44248	valid_1's rmse: 5.82577
[450]	training's rmse: 5.35835	valid_1's rmse: 5.78135
[500]	training's rmse: 5.28117	valid_1's rmse: 5.74131
[550]	training's rmse: 5.20661	valid_1's rmse: 5.70215
[600]	training's rmse: 5.14039	valid_1's rmse: 5.67305
[650]	training's rmse: 5.07574	valid_1's rmse: 5.6455
[700]	training's rmse: 5.01586	valid_1's rmse: 5.62162
[750]	training's rmse: 4.9583	valid_1's rmse: 5.59935
[800]	training's rmse: 4.90313	valid_1's rmse: 5.57979
[850]	train

[4450]	training's rmse: 2.7807	valid_1's rmse: 5.17307
[4500]	training's rmse: 2.76313	valid_1's rmse: 5.17182
[4550]	training's rmse: 2.74552	valid_1's rmse: 5.16943
[4600]	training's rmse: 2.72749	valid_1's rmse: 5.16816
[4650]	training's rmse: 2.70984	valid_1's rmse: 5.16662
[4700]	training's rmse: 2.69228	valid_1's rmse: 5.16504
[4750]	training's rmse: 2.67484	valid_1's rmse: 5.16357
[4800]	training's rmse: 2.65741	valid_1's rmse: 5.16146
[4850]	training's rmse: 2.64016	valid_1's rmse: 5.15938
[4900]	training's rmse: 2.62355	valid_1's rmse: 5.15798
[4950]	training's rmse: 2.60787	valid_1's rmse: 5.15616
[5000]	training's rmse: 2.59155	valid_1's rmse: 5.15488
[5050]	training's rmse: 2.57526	valid_1's rmse: 5.15364
[5100]	training's rmse: 2.55934	valid_1's rmse: 5.15233
[5150]	training's rmse: 2.54351	valid_1's rmse: 5.15238
[5200]	training's rmse: 2.52741	valid_1's rmse: 5.15103
[5250]	training's rmse: 2.51174	valid_1's rmse: 5.14916
[5300]	training's rmse: 2.49661	valid_1's rmse: 5

[LightGBM] [Info] Total Bins 587900
[LightGBM] [Info] Number of data points in the train set: 81623, number of used features: 2322




[LightGBM] [Info] Start training from score 9.170246
Training until validation scores don't improve for 200 rounds
[50]	training's rmse: 6.72462	valid_1's rmse: 6.79045
[100]	training's rmse: 6.30185	valid_1's rmse: 6.4218
[150]	training's rmse: 6.07236	valid_1's rmse: 6.24132
[200]	training's rmse: 5.90084	valid_1's rmse: 6.11684
[250]	training's rmse: 5.76869	valid_1's rmse: 6.02659
[300]	training's rmse: 5.65007	valid_1's rmse: 5.94831
[350]	training's rmse: 5.54811	valid_1's rmse: 5.88641
[400]	training's rmse: 5.45708	valid_1's rmse: 5.83218
[450]	training's rmse: 5.374	valid_1's rmse: 5.78432
[500]	training's rmse: 5.29536	valid_1's rmse: 5.74357
[550]	training's rmse: 5.22584	valid_1's rmse: 5.71142
[600]	training's rmse: 5.15648	valid_1's rmse: 5.67745
[650]	training's rmse: 5.09267	valid_1's rmse: 5.64971
[700]	training's rmse: 5.0311	valid_1's rmse: 5.62134
[750]	training's rmse: 4.97422	valid_1's rmse: 5.59729
[800]	training's rmse: 4.91944	valid_1's rmse: 5.57486
[850]	trai

[4850]	training's rmse: 2.64514	valid_1's rmse: 5.14325
[4900]	training's rmse: 2.62871	valid_1's rmse: 5.14139
[4950]	training's rmse: 2.61176	valid_1's rmse: 5.13956
[5000]	training's rmse: 2.59531	valid_1's rmse: 5.13804
[5050]	training's rmse: 2.57951	valid_1's rmse: 5.13628
[5100]	training's rmse: 2.56345	valid_1's rmse: 5.13557
[5150]	training's rmse: 2.54731	valid_1's rmse: 5.13331
[5200]	training's rmse: 2.53161	valid_1's rmse: 5.13183
[5250]	training's rmse: 2.51559	valid_1's rmse: 5.12888
[5300]	training's rmse: 2.49997	valid_1's rmse: 5.12689
[5350]	training's rmse: 2.48495	valid_1's rmse: 5.12495
[5400]	training's rmse: 2.46943	valid_1's rmse: 5.12367
[5450]	training's rmse: 2.45447	valid_1's rmse: 5.12328
[5500]	training's rmse: 2.43937	valid_1's rmse: 5.1224
[5550]	training's rmse: 2.42451	valid_1's rmse: 5.12173
[5600]	training's rmse: 2.4101	valid_1's rmse: 5.12019
[5650]	training's rmse: 2.39527	valid_1's rmse: 5.11845
[5700]	training's rmse: 2.3804	valid_1's rmse: 5.1

## Evaluando

In [27]:
df_test      = pd.read_csv(os.path.join(DATA_PATH,'test_preprocessed.csv'))
df_test['target'] = -1

COLUMN_NAME  = 'product_name' 

test_dataset = BNPParibasText(df_test,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_test = get_embedding(test_loader, model, 'cuda')
df_test[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_test.shape[1])]] = emb_sentence_test


COLUMN_NAME  = 'ingredients_text' 

test_dataset = BNPParibasText(df_test,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_test = get_embedding(test_loader, model, 'cuda')
df_test[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_test.shape[1])]] = emb_sentence_test

COLUMN_NAME  = 'brands_tags' 

test_dataset = BNPParibasText(df_test,MAX_LENGTH,tokenizer,COLUMN_NAME)
model         = Roberta_Model(pretrained_model=PRETRAINED)
test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size  = 32,
        pin_memory  = True,
        num_workers = 72
    )
emb_sentence_test = get_embedding(test_loader, model, 'cuda')
df_test[[f'emb_{COLUMN_NAME}_{i}' for i in range(emb_sentence_test.shape[1])]] = emb_sentence_test

df_test = apply_label_encoder(df_test,dict_le,drop_original = True, missing_new_cat = True)

HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=798.0), HTML(value='')))


Mode: Missing as new category
Applying Label Encoding:  label_states_en_brands
Applying Label Encoding:  label_states_en_categories
Applying Label Encoding:  label_states_en_characteristics
Applying Label Encoding:  label_states_en_expiration date
Applying Label Encoding:  label_states_en_general_complete
Applying Label Encoding:  label_states_en_ingredients
Applying Label Encoding:  label_pnns_groups_1
Applying Label Encoding:  label_pnns_groups_2
Applying Label Encoding:  label_states_en_packaging
Applying Label Encoding:  label_states_en_packaging-code-
Applying Label Encoding:  label_states_en_photo_upload
Applying Label Encoding:  label_states_en_photo_validate
Applying Label Encoding:  label_states_en_product name
Applying Label Encoding:  label_states_en_quantity


In [33]:
probs = 0
for i in models:
    probs = probs + (i.predict(df_test[feature_list]))
    print('fin_predict')
y_test_pred = probs/5.0
print(f'Real: ',math.sqrt(mean_squared_error(y_test_pred+1.5,df_test['Target'].values)))

fin_predict
fin_predict
fin_predict
fin_predict
fin_predict
Real:  5.128065432170818


In [34]:
y_submission['target'] = y_test_pred +1.5
y_submission.head()

Unnamed: 0_level_0,target
Index,Unnamed: 1_level_1
37320,17.005029
3913,23.107371
112180,7.631445
128820,12.213423
16037,16.837885


In [35]:
#Enviar los resultados
apiquery.submit_api(y_submission,
       competition_name='food',
        subname='test_v2', # Pueden cambiar esto sin problemas, poner el nombre que quieran.
        holdout_key='None',
        update_ldb=True,
        username="Insight ML - DD" # Poner el nombre de su equipo como un string. 
                                  # El mejor de los resultados dentro de sus envios es el que aparecera en la tabla de posiciones.
)

requests number 1
200
{'Date': 'Wed, 19 May 2021 22:00:13 GMT', 'Content-Type': 'application/json', 'Content-Length': '494', 'Connection': 'keep-alive', 'X-Request-ID': 'IDBT87NQAGLW4CJZ', 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Methods': 'POST', 'Access-Control-Allow-Headers': 'authorization,content-type'}


{'competition_name': 'food',
 'file_path': 'none',
 'message': 'Submission validated.',
 'name': 'Insight ML - DD',
 'result_csv_file': 'test_v2',
 'score': 5.128393060997395,
 'score2': None,
 'score3': None,
 'sub_name': 'test_v2',
 'sub_uid': '42e680e8-dd05-42a0-aef0-125b417d4977',
 'submission_time': '2021/05/19, 22:00:12'}