## 1. Importing Libraries

In [2]:
import shutil
import apiquery
import pandas as pd
import sys
from tqdm.notebook import tqdm
import os
import numpy as np
import random
# Project locations: the data directory and the directory holding local helper modules.
DATA_PATH = '../01.Data'
module_path = "../src"

# Make modules under module_path importable; the guard avoids a duplicate entry on re-run.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Let pandas display up to 100 rows and 100 columns before truncating.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import math
import time

In [3]:
def feature_engineering(df):
    """Add a ``diff_t`` column to ``df`` in place and drop its two source columns.

    ``diff_t`` is the element-wise difference ``last_modified_t - created_t``.
    After it is stored, both source columns are removed from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``last_modified_t`` and ``created_t`` columns.
        It is modified in place.

    Returns
    -------
    None

    Notes
    -----
    Not idempotent: the source columns are gone after the first call, so a
    second call on the same frame fails when looking them up.
    """
    source_cols = ['last_modified_t', 'created_t']
    newer, older = (df[name] for name in source_cols)
    df['diff_t'] = newer - older
    df.drop(columns=source_cols, inplace=True)

## 2. Preprocessing

In [4]:
%%time
# Load the raw TSV files. All three are read with 'Index' as the index column; the train
# frame then discards it (drop=True), the test frame turns it back into a regular column,
# and the submission example keeps it as its index.
read_options = dict(index_col='Index', encoding='utf-8', sep='\t')
df_train = pd.read_csv(os.path.join(DATA_PATH, 'food_train.tsv'), **read_options).reset_index(drop=True)
df_test = pd.read_csv(os.path.join(DATA_PATH, 'food_X_test.tsv'), **read_options).reset_index()
y_submission = pd.read_csv(os.path.join(DATA_PATH, 'y_test_submission_example.tsv'), **read_options)

# NOTE(review): multiple_extract_time, split_double_dot, func_formatting, func_geolocation,
# func_sort_split_comma and func_states are not defined or imported in the cells visible
# here; presumably they live in a module under ../src. Confirm they are in scope after a
# kernel restart.

# Every preprocessing step below is applied to the train frame first, then the test frame.
frames = (df_train, df_test)

# Step 2: hand the datetime columns to multiple_extract_time (return value is not used).
cols_extract_date = ['created_datetime', 'last_modified_datetime']
for frame in frames:
    multiple_extract_time(frame, cols_extract_date)

# Step 3: store the two values produced by split_double_dot as '<col>_lang' / '<col>_cat'.
for col in ('main_category', 'main_category_en'):
    split_targets = [f'{col}_lang', f'{col}_cat']
    for frame in frames:
        frame[split_targets] = frame.apply(
            lambda row: split_double_dot(row, col), axis=1, result_type='expand'
        )

# Formatting columns: run func_formatting over every value of the listed columns.
format_cols = [
    'pnns_groups_1', 'pnns_groups_2', 'product_name', 'generic_name', 'packaging',
    'packaging_tags', 'brands', 'brands_tags', 'categories', 'categories_tags', 'origins',
    'origins_tags', 'manufacturing_places', 'manufacturing_places_tags', 'labels',
    'labels_tags', 'labels_en', 'emb_codes', 'emb_codes_tags', 'cities_tags',
    'purchase_places', 'stores',
]
for column_name in format_cols:
    for frame in frames:
        frame[column_name] = frame[column_name].apply(lambda value: func_formatting(value))

# Geo location: func_geolocation yields two values per row, stored as the _x / _y columns;
# the original 'first_packaging_code_geo' column is dropped afterwards.
geo_targets = ['first_packaging_code_geo_x', 'first_packaging_code_geo_y']
for frame in frames:
    frame[geo_targets] = frame.apply(
        lambda row: func_geolocation(row), axis=1, result_type='expand'
    )
for frame in frames:
    frame.drop(columns=['first_packaging_code_geo'], inplace=True)

# Comma-separated columns: run func_sort_split_comma over every value.
for column_name in ['countries_en', 'allergens']:
    for frame in frames:
        frame[column_name] = frame[column_name].apply(lambda value: func_sort_split_comma(value))

# Expand 'states_en' into one 'states_en_<name>' column per entry of cols_states_en,
# filled from the values func_states returns for each row.
cols_states_en = [
    'brands', 'categories', 'characteristics', 'expiration date', 'general_check',
    'general_complete', 'ingredients', 'nutrition facts', 'packaging', 'packaging-code-',
    'photo_upload', 'photo_validate', 'product name', 'quantity',
]
col = 'states_en'
state_targets = [f'{col}_{idx}' for idx in cols_states_en]
for frame in frames:
    frame[state_targets] = frame.apply(
        lambda row: func_states(row, col), axis=1, result_type='expand'
    )

# Candidate columns to drop. The list is only defined here: the drop calls below are
# commented out, so no column in it is actually removed by this cell.
cols_drop = [
    'ingredients_that_may_be_from_palm_oil', 'ingredients_from_palm_oil', 'no_nutriments',
    'allergens_en', 'cities',
    'generic_name', 'categories', 'categories_en', 'origins', 'manufacturing_places', 'labels',
    'emb_codes', 'emb_codes_tags', 'cities_tags', 'purchase_places', 'stores', 'countries',
    'countries_en', 'traces', 'traces_en', 'additives_en', 'ingredients_from_palm_oil_tags',
    'ingredients_that_may_be_from_palm_oil_tags', 'ingredients_that_may_be_from_palm_oil_tags',
    'states', 'states_tags', 'main_category', 'main_category_en',
    'first_packaging_code_geo_x', 'first_packaging_code_geo_y',
]

#df_train.drop(columns = cols_drop,inplace = True)
#df_test.drop(columns = cols_drop,inplace = True)



CPU times: user 38.2 s, sys: 1.33 s, total: 39.5 s
Wall time: 39.5 s


In [5]:
# Add diff_t to both frames in place (train first, then test).
for frame in (df_train, df_test):
    feature_engineering(frame)

## 3. Model Split

In [6]:
# Number of bins from Sturges' rule: floor(1 + log2(n)) for n training rows.
n_train_rows = len(df_train)
num_bins = int(np.floor(np.log2(n_train_rows) + 1))
print(f'Num bins: {num_bins}')

# Quantile-bin the target; labels=False stores the integer bin index instead of intervals.
df_train['target_bin'] = pd.qcut(df_train['target'], q=num_bins, labels=False)

# Share of rows per bin, in percent (displayed as the cell output).
df_train['target_bin'].value_counts(normalize=True) * 100

Num bins: 17


2     9.675775
10    9.075940
7     7.143137
8     7.093151
0     6.863802
5     6.561924
14    5.904262
16    5.847414
12    5.708237
13    5.387737
3     5.083899
1     5.083899
4     4.741836
6     4.449759
15    3.983220
9     3.939115
11    3.456894
Name: target_bin, dtype: float64

In [7]:
FOLDS = 5
SEED  = 42

# Work on a copy so df_train itself does not gain the 'fold' column.
df_fold = df_train.copy()
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

# split() yields positional row indices. Collect the fold ids in a positional array
# instead of writing through label-based .loc, which is only equivalent while the
# frame's index is exactly 0..n-1 and would silently mis-assign (or add) rows otherwise.
fold_ids = np.zeros(len(df_fold), dtype='int64')
for fold, (train_index, test_index) in enumerate(skf.split(df_fold, df_fold['target_bin'])):
    # Each row appears in exactly one test split; that split's number is its fold id.
    fold_ids[test_index] = fold
df_fold['fold'] = fold_ids

# Save the CSV with folds for training, plus the preprocessed test set.
# Both files go to DATA_PATH (previously fold.csv used a hard-coded copy of the same path).
df_fold.to_csv(os.path.join(DATA_PATH, 'fold.csv'), index=False)
df_test.to_csv(os.path.join(DATA_PATH, 'test_preprocessed.csv'), index=False)

Unnamed: 0,creator,product_name,generic_name,quantity,packaging,packaging_tags,brands,brands_tags,categories,categories_tags,categories_en,origins,origins_tags,manufacturing_places,manufacturing_places_tags,labels,labels_tags,labels_en,emb_codes,emb_codes_tags,cities,cities_tags,purchase_places,stores,countries,countries_tags,countries_en,ingredients_text,allergens,allergens_en,traces,traces_tags,traces_en,serving_size,no_nutriments,additives_n,additives,additives_tags,additives_en,ingredients_from_palm_oil_n,ingredients_from_palm_oil,ingredients_from_palm_oil_tags,ingredients_that_may_be_from_palm_oil_n,ingredients_that_may_be_from_palm_oil,ingredients_that_may_be_from_palm_oil_tags,pnns_groups_1,pnns_groups_2,states,states_tags,states_en,main_category,main_category_en,target,created_datetime_year,created_datetime_month,created_datetime_day,created_datetime_hour,created_datetime_minute,created_datetime_second,last_modified_datetime_year,last_modified_datetime_month,last_modified_datetime_day,last_modified_datetime_hour,last_modified_datetime_minute,last_modified_datetime_second,main_category_lang,main_category_cat,main_category_en_lang,main_category_en_cat,first_packaging_code_geo_x,first_packaging_code_geo_y,states_en_brands,states_en_categories,states_en_characteristics,states_en_expiration date,states_en_general_check,states_en_general_complete,states_en_ingredients,states_en_nutrition facts,states_en_packaging,states_en_packaging-code-,states_en_photo_upload,states_en_photo_validate,states_en_product name,states_en_quantity,diff_t,target_bin,fold
0,usda-ndb-import,caramel creams,,,,,gold emblem,gold emblem,,,,,,,,,,,,,,,,,US,en:united-states,united states,"Wheat flour, corn syrup, dextrose, sugar, part...",,,,,,34 g (3 PIECES),,1.0,[ wheat-flour -> en:wheat-flour ] [ flour -...,"en:e322,en:e322i","E322 - Lecithins,E322i - Lecithin",0.0,,,0.0,,,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,11.0,2017,3,10,8,46,35,2017,3,10,8,46,35,,,,,,,completed,to be completed,to be completed,to be completed,,to be completed,completed,completed,to be completed,to be completed,photos to be uploaded,,completed,to be completed,0,8,2


In [8]:
# Sanity check on the stratification: mean target of each fold.
df_fold['target'].groupby(df_fold['fold']).mean()

fold
0    9.167353
1    9.173527
2    9.168235
3    9.171870
4    9.172262
Name: target, dtype: float64