In [28]:
import shutil

# The copy below is what creates the importable `apiquery.pyc`, so it has to
# run BEFORE `import apiquery`; otherwise the import only succeeds when the
# file is left over from a previous run.
# NOTE(review): assumes no `apiquery.py` source sits next to the notebook - confirm.
shutil.copy("apiquery_pyc.py", "apiquery.pyc")
import apiquery
import pandas as pd
import sys
import seaborn as sns
import os
import numpy as np
import random
import torch
import pickle
import gc

# Root folder of the input data, relative to the notebook location.
DATA_PATH = '../01.Data'

# Make the project sources importable (guarded so re-running the cell does
# not append the same entry twice).
module_path = "../src"
if module_path not in sys.path:
    sys.path.append(module_path)

# Project-local helpers. The star imports are kept in their original order
# because later ones can shadow names exported by earlier ones.
from utils.training import *
from utils.encoding import *
from utils.utils import *
from utils.fetch import *
from dataset.dataset import BNPParibasText
from models.models import Roberta_Model
from utils.EarlyStopping import EarlyStopping
from utils.LoopFunctions import train_fn,valid_fn
from utils.prediction import get_prediction,get_embedding

# Show wide / long frames without truncation.
pd.options.display.max_rows = 999
pd.options.display.max_columns = 100

# These come after the star imports on purpose, so the explicit names win
# over anything the star imports may have exported under the same name.
import math
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import time
import lightgbm as lgbm
import matplotlib.pyplot as plt
import torch.nn as nn
import config_ad
import transformers

In [29]:
def calculate_test(test, config, n_folds=5):
    """Score `test` with the fold-averaged RoBERTa models and print the RMSE.

    One checkpoint per fold is loaded from
    ``../03.Models/BNP_PARIBAS_ROBERTA_FOLD_{fold}``, its predictions on
    `test` are accumulated, and the mean over folds is stored in a new
    ``preds`` column of `test` (the frame is modified in place).

    Args:
        test: DataFrame to score. Must contain a ``Target`` column, which is
            used as ground truth for the printed RMSE.
        config: Object providing ``COLUMNS_ENCODE``, ``PRETRAINED``,
            ``MAX_LENGTH``, ``BATCH_SIZE``, ``NUM_WORKERS``, ``DROPOUT`` and
            ``DEVICE``.
        n_folds: Number of fold checkpoints to average (default 5, the value
            previously hard-coded).

    Returns:
        None. The result is written to ``test['preds']`` and the RMSE is
        printed.
    """
    col_unique = generate_col_unique(test, config.COLUMNS_ENCODE)
    tokenizer = transformers.RobertaTokenizer.from_pretrained(config.PRETRAINED)
    test_dataset = BNPParibasText(test, config.MAX_LENGTH, tokenizer, col_unique)
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size  = config.BATCH_SIZE,
        pin_memory  = True,
        num_workers = config.NUM_WORKERS
    )

    preds = 0
    for fold in range(n_folds):
        model = Roberta_Model(pretrained_model=config.PRETRAINED, dropout=config.DROPOUT)
        # map_location lets checkpoints saved on a GPU load on a CPU-only host.
        state_dict = torch.load(
            f'../03.Models/BNP_PARIBAS_ROBERTA_FOLD_{fold}',
            map_location=config.DEVICE,
        )
        model.load_state_dict(state_dict)
        model.to(config.DEVICE)
        # Inference only: make sure dropout is disabled regardless of what
        # get_prediction does internally.
        model.eval()
        preds = preds + get_prediction(test_loader, model, config.DEVICE)
        # Release this fold's weights before the next checkpoint is loaded.
        del model, state_dict

    test['preds'] = preds / n_folds
    # mean_squared_error expects (y_true, y_pred); the value is symmetric but
    # the conventional order is used for clarity.
    print('Real RMSE: ', math.sqrt(mean_squared_error(test['Target'].values, test['preds'].values)))

In [30]:
# Read the test set that already carries the embedding columns.
test_csv_path = os.path.join(DATA_PATH, 'test_embeddings.csv')
df_test = pd.read_csv(test_csv_path)

# Hand-picked modeling columns ...
columns_modeling = [
    'additives_n',
    'ingredients_from_palm_oil_n',
    'ingredients_that_may_be_from_palm_oil_n',
    'target',
    'states_en_brands',
    'states_en_categories',
    'states_en_characteristics',
    'states_en_expiration date',
    'states_en_general_complete',
    'states_en_ingredients',
    'pnns_groups_1',
    'pnns_groups_2',
    'states_en_packaging',
    'states_en_packaging-code-',
    'states_en_photo_upload',
    'states_en_photo_validate',
    'states_en_product name',
    'states_en_quantity',
    'diff_t',
]
# ... plus every column whose name contains 'emb'.
columns_modeling.extend(col for col in df_test.columns.to_list() if 'emb' in col)

# Columns with object dtype among the modeling columns.
object_view = df_test[columns_modeling].select_dtypes(include=['object'])
columns_label = object_view.columns.to_list()
print(columns_label)

['states_en_brands', 'states_en_categories', 'states_en_characteristics', 'states_en_expiration date', 'states_en_general_complete', 'states_en_ingredients', 'pnns_groups_1', 'pnns_groups_2', 'states_en_packaging', 'states_en_packaging-code-', 'states_en_photo_upload', 'states_en_photo_validate', 'states_en_product name', 'states_en_quantity', 'emb_codes', 'emb_codes_tags']


In [31]:
# Load the pickled label-encoder dictionary and apply it to the test frame.
# The context manager guarantees the file handle is closed even if unpickling
# fails (the original left it open).
# NOTE: pickle.load can execute arbitrary code - only load trusted artifacts.
with open("../03.Models/General/label_encoding.pkl", "rb") as a_file:
    dict_le = pickle.load(a_file)
df_test = apply_label_encoder(df_test,dict_le,drop_original = True, missing_new_cat = True)
# Drop the temporaries so they do not linger in the kernel namespace.
del dict_le,a_file

Mode: Missing as new category
Applying Label Encoding:  label_states_en_brands
Applying Label Encoding:  label_states_en_categories
Applying Label Encoding:  label_states_en_characteristics
Applying Label Encoding:  label_states_en_expiration date
Applying Label Encoding:  label_states_en_general_complete
Applying Label Encoding:  label_states_en_ingredients
Applying Label Encoding:  label_pnns_groups_1
Applying Label Encoding:  label_pnns_groups_2
Applying Label Encoding:  label_states_en_packaging
Applying Label Encoding:  label_states_en_packaging-code-
Applying Label Encoding:  label_states_en_photo_upload
Applying Label Encoding:  label_states_en_photo_validate
Applying Label Encoding:  label_states_en_product name
Applying Label Encoding:  label_states_en_quantity


In [32]:
# Load the pickled LightGBM model of each fold (one file per fold).
# NOTE: pickle.load can execute arbitrary code - only load trusted artifacts.
models = []
for fold in range(0,5):
    filename = f'../03.Models/Lightgbm/lgbm_fold_{fold}.pkl'
    print(filename)
    # Context manager closes each file handle; the original leaked one per fold.
    with open(filename, 'rb') as model_file:
        model = pickle.load(model_file)
    models.append(model)

../03.Models/Lightgbm/lgbm_fold_0.pkl
../03.Models/Lightgbm/lgbm_fold_1.pkl
../03.Models/Lightgbm/lgbm_fold_2.pkl
../03.Models/Lightgbm/lgbm_fold_3.pkl
../03.Models/Lightgbm/lgbm_fold_4.pkl


In [45]:
# Read the feature names, one per line, stripping surrounding whitespace
# (including the trailing newline). The original comprehension iterated over
# `content`, a name this cell never defines, so it raised NameError on a
# fresh kernel; iterate over the file that was just opened instead.
with open('../03.Models/General/feature_list.txt') as f:
    features = [line.strip() for line in f]

In [52]:
# Constant added to the averaged predictions before scoring.
# NOTE(review): the origin of this offset is not shown in this notebook - confirm.
TARGET_OFFSET = 1.8

# Slice the feature matrix once instead of once per model.
X_test = df_test[features]

# Accumulate the predictions of every fold model, then average them.
probs = 0
for fold_model in models:
    probs = probs + fold_model.predict(X_test)
    print('fin_predict')
# Divide by the actual number of models rather than a hard-coded 5.0, so the
# average stays correct if the number of loaded folds changes.
y_test_pred = probs / len(models)
print('Real: ', math.sqrt(mean_squared_error(df_test['Target'].values, y_test_pred + TARGET_OFFSET)))

fin_predict
fin_predict
fin_predict
fin_predict
fin_predict
Real:  4.761788288052376


In [53]:
# Start from the example submission file (tab-separated, indexed by 'Index')
# and fill its 'target' column with the offset predictions.
submission_template_path = os.path.join(DATA_PATH, 'y_test_submission_example.tsv')
y_submission = pd.read_csv(
    submission_template_path,
    sep='\t',
    encoding='utf-8',
    index_col='Index',
)
y_submission['target'] = y_test_pred + 1.8

In [54]:
# Submit the results
submission_options = dict(
    competition_name='food',
    # This can be changed freely; use any name you like.
    subname='test_v12',
    holdout_key='None',
    update_ldb=True,
    # Team name as a string. The best result among the team's submissions is
    # the one that appears on the leaderboard.
    username="Insight ML - DD",
)
apiquery.submit_api(y_submission, **submission_options)

requests number 1
200
{'Date': 'Thu, 20 May 2021 19:57:38 GMT', 'Content-Type': 'application/json', 'Content-Length': '497', 'Connection': 'keep-alive', 'X-Request-ID': 'UDMY5SBQA4F63ZJE', 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Methods': 'POST', 'Access-Control-Allow-Headers': 'authorization,content-type'}


{'competition_name': 'food',
 'file_path': 'none',
 'message': 'Submission validated.',
 'name': 'Insight ML - DD',
 'result_csv_file': 'test_v12',
 'score': 4.761923558356993,
 'score2': None,
 'score3': None,
 'sub_name': 'test_v12',
 'sub_uid': '4ad55c78-1618-49fa-b63c-23e45de6b70f',
 'submission_time': '2021/05/20, 19:57:38'}