In [1]:
#
# The MIT License (MIT)

# Copyright (c) 2021, NVIDIA CORPORATION

# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#

In [2]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import cudf
import cupy
import nvtabular as nvt
import json

  from pandas import Panel


In [3]:
nvt.__version__

'0.6.0'

In [4]:
#DATA_FOLDER = "/workspace/"
#DATA_FOLDER = "/home/sumitra/recsys_folder/competitions/SIGIR_eCommerce_Challenge_2021/data/"

DATA_FOLDER = "/recsys/data/"
FILENAME_PATTERN_BROWSING = 'browsing_train.csv'
FILENAME_PATTERN_SEARCH = 'search_train.csv'
DATA_PATH_BROWSING = os.path.join(DATA_FOLDER, FILENAME_PATTERN_BROWSING)
DATA_PATH_SEARCH = os.path.join(DATA_FOLDER, FILENAME_PATTERN_SEARCH)
#OUTPUT_DIR = "/workspace/coveo_task1_v4"
#OUTPUT_DIR = "/home/sumitra/recsys_folder/competitions/SIGIR_eCommerce_Challenge_2021"
OUTPUT_DIR = "/recsys/coveo_task1_v42"
!ls $DATA_PATH_BROWSING

/recsys/data/browsing_train.csv


In [5]:
MINIMUM_SESSION_LENGTH = 2

**5 steps: each section is independant and create different parquet files:**

   - Pandas processing: <a href ='#pandas_proc'> Section 1 </a> 
       - <a href='#search_table'>Section 1.1</a>:  Process search table and create session_search table that groups query vectors and impressions seen/clicked within the same sessions. 
       - <a href='#event_table'>Section 1.2 </a>: Create event table by merging browsing, test and search data 
       
   
   - Create preprocessed interactions table:  <a href='#row_workflow'> Section 2 </a>
           - Filling missing values 
           - Convert timestamps 
           - Categorify and Normalize variables
   
   - Create preprocess session table:  <a href='#session_workflow'> Section 3 </a>
   
   - Merge session_browsing and session_search tables 

# <a id='pandas_proc'>Processing DATA with pandas </a>

## <a id='event_table'> Preprocessing of search tables: merging Train and Test data </a>

In [None]:
#search = pd.read_csv(DATA_PATH_SEARCH, sep=',')
#search.head()

In [None]:
# load search data
search = pd.read_csv(DATA_PATH_SEARCH, sep=',')
#search = pd.read_csv('search_train.csv', sep=',')
# Add column event_type 
search['event_type'] = 'search'
# Add column 'is_search'
search['is_search'] = 1
#search['is_test'] = 0
# drop 123 rows where: (clicked_skus_hash != NaN) and (product_skus_hash == NaN)
condition = (search['product_skus_hash'].isnull()) & (~search['clicked_skus_hash'].isnull())
search = search.loc[~condition]
# convert strings to list object 
import ast
def convert_str_to_list(x): 
    if pd.isnull(x): 
        return x
    return ast.literal_eval(x)
for col in ['product_skus_hash', 'clicked_skus_hash', 'query_vector']: 
    search[col] = search[col].progress_apply(convert_str_to_list)

# Add search events from test data
# load test data 
#with open('/workspace/rec_test_phase_1.json') as json_file:
    # read the test cases from the provided file
    #test_queries = json.load(json_file)
#test_df = pd.json_normalize(test_queries, 'query')
#test_df['is_test'] = 1
#test_search = test_df[['session_id_hash', 'query_vector', 'clicked_skus_hash',
       #'product_skus_hash', 'server_timestamp_epoch_ms', 'event_type',
       #'is_search', 'is_test']]
#test_search = test_search[test_search.is_search==True]

# concat test and train search data
#search = pd.concat([search, test_search])
#search.reset_index(inplace=True)

In [None]:
#del test_search
search.head()

- **Compute the number of search queries per session** 

In [None]:
tmp = search.groupby('session_id_hash').size().reset_index()
tmp.head()
#tmp.columns = ['session_id_hash', 'nb_queries']
#search = search.merge(tmp, on='session_id_hash', how='left')

In [None]:
tmp.columns = ['session_id_hash', 'nb_queries']
search = search.merge(tmp, on='session_id_hash', how='left')

In [None]:
#tmp.head()

In [None]:
search.head()

* **Include unseen clicked product to impression list** : 

    2% of the search events have a clicked item that does not appear in the impression list. 
        ==> We add the missing elements to the impression list. 

In [None]:
def add_clicked(x): 
    if isinstance(x.clicked_skus_hash, list) and isinstance(x.product_skus_hash, list):
        return list(set(x.product_skus_hash).union(set(x.clicked_skus_hash)))
    return x.product_skus_hash
search['updated_product_skus_hash'] = search.progress_apply(add_clicked, axis=1)

In [None]:
search.head()

* **Compute the number of visualized products and clicked items per search** 

In [None]:
search['impression_size'] = search.product_skus_hash.str.len().fillna(0)
search['clicks_size'] = search.clicked_skus_hash.str.len().fillna(0)

In [None]:
search.head(2)

- **Aggregate search information per session** 

The resulting variables are: 

    - "flat_query_vector": List of search query vector (reshaped to 1-dim so that it works with NVT Data loader)
    - "flat_product_skus_hash": List of product skus returned by the search query
    - "clicked-flag": To indicate if the product was clicked or not in the search results
    - "nb_queries" : number of search queries within the same session 
    - "impression_size": number of returned product skus for each search query
    - "clicks_size": number of clicked items for each search query



In [None]:
def all_products(x): 
    t =[]
    for products in x.dropna(): 
        t += products
    if len(t)==0:
        return ['missing']
    return t

session_search = search.sort_values(['session_id_hash', 'server_timestamp_epoch_ms']).groupby('session_id_hash').agg({'query_vector': lambda x: list(np.concatenate(x.values)),
                                                                                                                    'updated_product_skus_hash': all_products,
                                                                                                                    'clicked_skus_hash': all_products,
                                                                                                                    'impression_size': list,
                                                                                                                    'clicks_size': list,
                                                                                                                    'nb_queries': 'last'
                                                                                                                  })
session_search.columns = ['flat_query_vector', 'flat_product_skus_hash', 'flat_clicked_skus_hash', 'impressions_size', 'clicks_size', 'nb_queries']
session_search['clicked-flag'] = session_search.progress_apply(lambda x: [int(e in x['flat_clicked_skus_hash']) for e in x['flat_product_skus_hash']], axis=1)
session_search = session_search.reset_index()

In [None]:
session_search.head(3)

- **Save search tables** 

In [None]:
session_search.to_parquet(os.path.join(OUTPUT_DIR, "session_search.parquet"))
search.to_parquet(os.path.join(OUTPUT_DIR, "search.parquet"))

#session_search.to_parquet("session_search.parquet")
#search.to_parquet("search.parquet")


In [None]:
#session_search.to_parquet(path= '/home/sumitra/recsys_folder/competitions/SIGIR_eCommerce_Challenge_2021')
#search.to_parquet(path= '/home/sumitra/recsys_folder/competitions/SIGIR_eCommerce_Challenge_2021')

## <a id='event_table'> Create event table: merging browsing, test and search data </a>

In [None]:
session_search = pd.read_parquet(os.path.join(OUTPUT_DIR, "session_search.parquet"))
search = pd.read_parquet(os.path.join(OUTPUT_DIR, "search.parquet"))

#session_search = pd.read_parquet("session_search.parquet")
#search = pd.read_parquet("search.parquet")


In [None]:
browsing = pd.read_csv(DATA_PATH_BROWSING, sep=',')

#browsing = pd.read_csv('browsing_train.csv', sep=',')

# Add columns 'is_search' ans 'is_test'
browsing['is_search'] = 0 
#browsing['is_test'] = 0 

- **Create event table**: mergin browsing_train and test data

In [None]:
#with open('/workspace/rec_test_phase_1.json') as json_file:
    # read the test cases from the provided file
    #test_queries = json.load(json_file)
#test_df = pd.json_normalize(test_queries, 'query')
#test_df['is_test'] = 1

In [None]:
# concat browsing and search data to create event table 
#event_df = pd.concat([browsing, test_df])
#event_df.reset_index(drop=True, inplace=True)

In [None]:
#event_df.head(3)

In [None]:
#event_df.tail(3)

In [None]:
#del browsing

In [None]:
event_df = browsing

In [None]:
event_df.head()

- **Process duplicated events:** which are defined as interactions that occur in the same session and at the same time 

In [None]:
# drop duplicated rows (total of 1800) 
event_df.drop_duplicates(['session_id_hash' ,'server_timestamp_epoch_ms', 'event_type', 'product_action', 'product_sku_hash', 'hashed_url'],inplace=True, keep='last')
# Remove indices of 'pageview' interactions from duplicated events defined above 
tmp = event_df[(event_df.event_type == 'pageview') & (event_df.duplicated(['session_id_hash' , 'server_timestamp_epoch_ms']))]
event_df.drop(tmp.index, inplace=True)

In [None]:
event_df.shape

* **Add search events**: 

In [None]:
# select search events with clicks
use_cols = ['session_id_hash', 'clicked_skus_hash',
            'server_timestamp_epoch_ms', 'event_type',
            'is_search']
search_clicks = search[search.clicks_size>0][use_cols]

# specify event type and product action
search_clicks['event_type'] = 'search'
search_clicks['product_action'] = 'click'

#unstack list of clicked items to multiple rows
lst_col = 'clicked_skus_hash'
search_clicks = pd.DataFrame({
    col:np.repeat(search_clicks[col].values, search_clicks[lst_col].str.len()) for col in search_clicks.columns.difference([lst_col])}).assign(
    **{lst_col:np.concatenate(search_clicks[lst_col].values)})[search_clicks.columns.tolist()]

search_clicks.columns = ['session_id_hash', 'product_sku_hash', 'server_timestamp_epoch_ms',
                         'event_type', 'is_search', 'product_action']

In [None]:
search_clicks.head(3)

In [None]:
event_df = pd.concat([event_df, search_clicks])
event_df.event_type.value_counts()

* **Add product information**

In [None]:
product_info = pd.read_csv('/recsys/data/sku_to_content.csv')
#product_info = pd.read_csv('sku_to_content.csv')
product_info.tail()

- Extract product main category 

In [None]:
def product_main_category(x):
    if pd.isna(x):
        return x
    return x.split('/')[0]
product_info['main_category'] = product_info['category_hash'].progress_apply(product_main_category)

- Compute average price of main and hierarchy category

In [None]:
main_price = product_info.groupby('main_category')['price_bucket'].mean().reset_index()
main_price.columns = ['main_category', 'mean_price_main']

hierarchy_price = product_info.groupby('category_hash')['price_bucket'].mean().reset_index()
hierarchy_price.columns = ['category_hash', 'mean_price_hierarchy']

- Merge with product table 

In [None]:
product_info = product_info.merge(main_price, on=['main_category'], how='left')
product_info = product_info.merge(hierarchy_price, on=['category_hash'], how='left')

In [None]:
product_info.head(3)

* Merge product table and event_df 

In [None]:
event_df = event_df.merge(product_info[['product_sku_hash', 'main_category', 'category_hash',
                                        'price_bucket', 'mean_price_hierarchy', 'mean_price_main' ]], on='product_sku_hash', how='left')

In [None]:
event_df.head(2)

* **Remove duplicated interactions with the same product**: encode original information as binary flags

In [None]:
product_interactions = event_df.groupby(['session_id_hash', 'product_sku_hash']).agg({
    'product_sku_hash': len,
    'product_action': [lambda x: 'add' in list(x), 
                       lambda x: 'detail' in list(x),
                       lambda x: 'purchase' in list(x),
                       lambda x: 'remove' in list(x),
                       lambda x: 'click' in list(x),
                      ]
}).droplevel(0, axis=1)

In [None]:
product_interactions = product_interactions.reset_index()

In [None]:
product_interactions.columns = ['session_id_hash', 'product_sku_hash', 'nb_interactions',
                                'has_been_added_to_cart', 'has_been_detailed', 
                                'has_been_purchased', 'has_been_removed_from_cart', 'has_been_clicked' ]

In [None]:
product_interactions.head(2)

-  **Remove repeated interactions of event-product to align with private test set** 

In [None]:
event_df = event_df.sort_values(['session_id_hash', 'server_timestamp_epoch_ms'])

In [None]:
event_df = event_df.drop_duplicates(['session_id_hash','event_type','product_sku_hash','hashed_url'], keep='first')

In [None]:
event_df.shape

*  **Merge product interactions info and event df** 

In [None]:
event_df = event_df.merge(product_interactions, on=['session_id_hash', 'product_sku_hash'], how='left')

In [None]:
event_df.shape

In [None]:
event_df.head(2)

In [None]:
del product_interactions

*  **Add column product_url_hash**: SKUs are filled with hashed_url for pageview events

In [None]:
event_df['product_url_hash'] = event_df['product_sku_hash'].fillna(event_df['hashed_url'])

In [None]:
event_df.head()

In [None]:
event_df.shape

### Save into parquet files 

In [None]:
event_df.event_type.value_counts()

In [None]:
# save to parquet file with 40 partitions 
event_df['parquet_split'] =  np.random.randint(0,40,size=event_df.shape[0])

event_df.to_parquet(os.path.join(OUTPUT_DIR, "event_train_v4"), partition_cols=['parquet_split'])

#event_df.to_parquet("event_train_v4", partition_cols=['parquet_split'])

#event_df.to_parquet(os.path.join(OUTPUT_DIR, "event_train_v4"))


In [None]:
#event_df.to_csv(index=False)
#event_df.to_csv('event_df_csv.csv', sep=',', index=False)

In [None]:
#event_cudf = cudf.read_csv('event_df_csv.csv')
#event_cudf.head()

In [None]:
#event_df.to_parquet("event_train_v4")
#event_cudf = cudf.DataFrame.from_pandas(event_df)

In [None]:
#event_df

In [None]:
#!nvidia-smi

In [None]:
import gc
del event_df
gc.collect()

# <a id='row_workflow'> Define the preprocessed row interactions table </a>

In [None]:
import glob
files = glob.glob(OUTPUT_DIR + '/event_train_v4/parquet_split*/*.parquet')

#files = glob.glob('event_train_v4/parquet_split*/*.parquet')


- **1st workflow: fill missing values, encode categorical variables and normalize numericals**

In [None]:
#  load data 
#df_event = nvt.Dataset("event_train_v4", part_size="1GB") 

In [None]:
#  load data 
#df_event = nvt.Dataset(event_cudf) 

In [None]:
# convert timestamp to datetime object
#to_datetime = ["server_timestamp_epoch_ms"] >> nvt.ops.LambdaOp(lambda col: cudf.to_datetime(col, unit='ms')) >> nvt.ops.Rename( f = lambda x: 'timestamp')

In [None]:
#categorical features
# fill missing product_actions
#missing_action = ['product_action'] >> nvt.ops.FillMissing(fill_val='view')  >> nvt.ops.Rename(postfix = '_filled')
#categ_feats = ['session_id_hash',  'event_type', 'price_bucket', 'main_category', 'category_hash']
#cat_feats = missing_action + categ_feats >> nvt.ops.Categorify()



In [None]:
# fill missing product ids before categorify to keep id '0' for padding 
#missing_ids = ['product_sku_hash','hashed_url', 'product_url_hash' ] >> nvt.ops.FillMissing(fill_val='missing')
#cat_product_ids = missing_ids >> nvt.ops.Categorify()



In [None]:
# numerical features 
#cont_feats = ['mean_price_hierarchy', 'mean_price_main', 'nb_interactions'] >> nvt.ops.FillMedian()
#continuous_feats = cont_feats >> nvt.ops.Normalize()

In [None]:
#bool features 
#bool_feats = ['has_been_added_to_cart', 'has_been_detailed', 'has_been_purchased', 
              #'has_been_removed_from_cart', 'has_been_clicked', 'is_search', 'is_test']
#boolean_feats = bool_feats >> nvt.ops.FillMissing(fill_val=0)

In [None]:
#  load data 
df_event = nvt.Dataset(files, part_size="1GB") 

# convert timestamp to datetime object
to_datetime = ["server_timestamp_epoch_ms"] >> nvt.ops.LambdaOp(lambda col: cudf.to_datetime(col, unit='ms')) >> nvt.ops.Rename( f = lambda x: 'timestamp')

#categorical features
# fill missing product_actions
missing_action = ['product_action'] >> nvt.ops.FillMissing(fill_val='view')  >> nvt.ops.Rename(postfix = '_filled')
categ_feats = ['session_id_hash',  'event_type', 'price_bucket', 'main_category', 'category_hash']
cat_feats = missing_action + categ_feats >> nvt.ops.Categorify()

# fill missing product ids before categorify to keep id '0' for padding 
missing_ids = ['product_sku_hash','hashed_url', 'product_url_hash' ] >> nvt.ops.FillMissing(fill_val='missing')
cat_product_ids = missing_ids >> nvt.ops.Categorify()

# numerical features 
cont_feats = ['mean_price_hierarchy', 'mean_price_main', 'nb_interactions'] >> nvt.ops.FillMedian()
continuous_feats = cont_feats >> nvt.ops.Normalize()

#bool features 
bool_feats = ['has_been_added_to_cart', 'has_been_detailed', 'has_been_purchased', 
              'has_been_removed_from_cart', 'has_been_clicked', 'is_search']
boolean_feats = bool_feats >> nvt.ops.FillMissing(fill_val=0)

In [None]:
workflow = nvt.Workflow(to_datetime + cat_feats + cat_product_ids + continuous_feats + boolean_feats)
workflow.fit(df_event)
new_gdf = workflow.transform(df_event).to_ddf().compute()
len(new_gdf)

In [None]:
new_gdf.columns

In [None]:
# save the workflow : 
workflow.save(os.path.join(OUTPUT_DIR, "categorify_workflow"))

- **Item recency:** include the item first time seen feature using product_url_hash column 

In [None]:
items_first_ts_df = new_gdf.groupby('product_url_hash').agg({'timestamp': 'min'}).reset_index().rename(columns={'timestamp': 'itemid_ts_first'})
interactions_merged_df = new_gdf.merge(items_first_ts_df, on=['product_url_hash'], how='left')

In [None]:
interactions_merged_df.shape

In [None]:
interactions_merged_df.shape

- **Save encoded interaction table to parquet file** 

In [None]:
use_cols = ['session_id_hash', 'timestamp',  'event_type', 'product_action_filled',
            
            'product_sku_hash','hashed_url', 'product_url_hash',
            
            'main_category', 'category_hash', 'price_bucket', 'mean_price_hierarchy', 'mean_price_main', 'itemid_ts_first', 
            
            'nb_interactions', 'has_been_added_to_cart', 'has_been_detailed', 'has_been_purchased', 'has_been_removed_from_cart', 'has_been_clicked',
            
            'is_search']

interactions_merged_df[use_cols].head(5)

In [None]:
interactions_merged_df[use_cols].to_parquet(os.path.join(OUTPUT_DIR, 'row_interactions_preproc_v4.parquet'))

# <a id='session_workflow'>Preprocessing of session table - V4 </a>: 

In [6]:
interactions_merged_df = cudf.read_parquet(os.path.join(OUTPUT_DIR, 'row_interactions_preproc_v4.parquet'))

In [7]:
interactions_merged_df.head()

Unnamed: 0,session_id_hash,timestamp,event_type,product_action_filled,product_sku_hash,hashed_url,product_url_hash,main_category,category_hash,price_bucket,mean_price_hierarchy,mean_price_main,itemid_ts_first,nb_interactions,has_been_added_to_cart,has_been_detailed,has_been_purchased,has_been_removed_from_cart,has_been_clicked,is_search
0,40544,2019-01-17 23:56:39.839,2,2,937,1336,1978,1,4,1,0.245127,0.04516,2019-01-15 07:13:42.732,-0.170005,False,True,False,False,False,0
1,122961,2019-04-14 20:08:54.849,1,1,1,18,16,0,0,0,0.048303,0.04516,2019-01-15 05:29:47.682,-0.170005,False,False,False,False,False,0
2,214563,2019-02-23 14:52:33.576,1,1,1,1,1,0,0,0,0.048303,0.04516,2019-01-15 05:03:02.542,-0.170005,False,False,False,False,False,0
3,90886,2019-02-28 19:34:20.658,2,2,1731,2487,3593,3,1,5,-0.633211,2.114748,2019-01-15 15:18:29.627,-0.170005,False,True,False,False,False,0
4,1781551,2019-02-06 09:50:18.828,1,1,1,38,32,0,0,0,0.048303,0.04516,2019-01-15 05:08:54.698,-0.170005,False,False,False,False,False,0


In [8]:
#OUTPUT_FOLDER = "/workspace/coveo_dataset/"
#OUTPUT_FOLDER = "/recsys/coveo_dataset/"
#!mkdir -p $OUTPUT_FOLDER


In [9]:
#!ls $OUTPUT_FOLDER

In [10]:
#cont features
cont_feats = ['mean_price_hierarchy', 'mean_price_main', 'nb_interactions'] >> nvt.ops.FillMedian()
#bool features 
bool_feats = ['has_been_added_to_cart', 'has_been_detailed', 'has_been_purchased', 
              'has_been_removed_from_cart', 'has_been_clicked', 'is_search']
interactions_merged_df[bool_feats] = interactions_merged_df[bool_feats].astype('int32')

- **Define temporal features**

In [11]:
# create time features
sessionTime = ['timestamp']

sessionTime_hour = (
    sessionTime >> 
    #nvt.ops.LambdaOp(lambda col: cudf.to_datetime(col, unit='ms').dt.hour) >> 
    nvt.ops.LambdaOp(lambda col: col.dt.hour) >> 
    nvt.ops.Rename(postfix = '_hour')
)
sessionTime_weekday = (
    sessionTime >> 
    #nvt.ops.LambdaOp(lambda col: cudf.to_datetime(col, unit='ms').dt.weekday) >> 
    nvt.ops.LambdaOp(lambda col: col.dt.weekday) >> 
    nvt.ops.Rename(postfix = '_wd')
)
sessionTime_day = (
    sessionTime >> 
    nvt.ops.LambdaOp(lambda col: col.dt.day) >> 
    nvt.ops.Rename(postfix="_day")
)

sessionTime_timestamp = (
    sessionTime >> 
    nvt.ops.LambdaOp(lambda col: (col.astype(int) / 1e6).astype(int)) >> 
    nvt.ops.Rename(f = lambda col: "ts")
)

In [12]:
def get_cycled_feature_value_sin(col, max_value):
    value_scaled = (col + 0.000001) / max_value
    value_sin = np.sin(2*np.pi*value_scaled)
    return value_sin

def get_cycled_feature_value_cos(col, max_value):
    value_scaled = (col + 0.000001) / max_value
    value_cos = np.cos(2*np.pi*value_scaled)
    return value_cos

In [13]:
hour_sin = sessionTime_hour >> (lambda col: get_cycled_feature_value_sin(col, 24)) >> nvt.ops.Rename(postfix = '_sin')
hour_cos = sessionTime_hour >> (lambda col: get_cycled_feature_value_cos(col, 24)) >> nvt.ops.Rename(postfix = '_cos')
weekday_sin = sessionTime_weekday >> (lambda col: get_cycled_feature_value_sin(col+1, 7)) >> nvt.ops.Rename(postfix = '_sin')
weekday_cos= sessionTime_weekday >> (lambda col: get_cycled_feature_value_cos(col+1, 7)) >> nvt.ops.Rename(postfix = '_cos')

In [14]:
cycled_features = hour_sin + hour_cos + weekday_sin + weekday_cos

In [15]:
cycled_features.columns

['timestamp_hour_sin',
 'timestamp_hour_cos',
 'timestamp_wd_sin',
 'timestamp_wd_cos']

In [16]:
# calculate item recency 
# create custom op
from nvtabular.ops import Operator

class ItemRecency(Operator):
    def transform(self, columns, gdf):
        for column in columns:
            col = gdf[column]
            #col.loc[col == ""] = None
            item_first_timestamp = gdf['itemid_ts_first']
            delta_days = (col - item_first_timestamp).dt.days
            gdf[column + "_age_days"] = delta_days * (delta_days >=0)
        return gdf
            
    def output_column_names(self, columns):
        return [column + "_age_days" for column in columns]
            
    def dependencies(self):
        return ["itemid_ts_first"]

In [17]:
recency_features = ["timestamp"] >> ItemRecency() 
recency_features_norm = recency_features >> nvt.ops.LogOp() >> nvt.ops.Normalize() >> nvt.ops.Rename(postfix = '_norm')

In [18]:
recency_features.columns, recency_features_norm.columns
time_features = (
    sessionTime_timestamp +
    sessionTime + 
    sessionTime_hour +
    sessionTime_day + 
    sessionTime_weekday +
    recency_features +
    recency_features_norm + 
    cycled_features)

In [19]:
time_features.columns

['ts',
 'timestamp',
 'timestamp_hour',
 'timestamp_day',
 'timestamp_wd',
 'timestamp_age_days',
 'timestamp_age_days_norm',
 'timestamp_hour_sin',
 'timestamp_hour_cos',
 'timestamp_wd_sin',
 'timestamp_wd_cos']

- **Grouping interactions into sessions**

In [20]:
# Define Groupby Workflow: search columns are not used
# N.B: Add the op ListSlice when upgrading nvt 0.5.1 to 0.6 
filter_nan_products = (interactions_merged_df.columns >> nvt.ops.Filter(f=lambda df: df['product_sku_hash'] != 0))


groupby_only_product = filter_nan_products - ['timestamp']  + time_features  >> nvt.ops.Groupby(
    groupby_cols=["session_id_hash"], 
    sort_cols=["ts"],
    aggs={
       "product_sku_hash": ["list", "count"], 
    }
)
    
groupby_product_url = ['session_id_hash', 'product_url_hash']  + time_features >> nvt.ops.Groupby(
    groupby_cols=["session_id_hash"], 
    sort_cols=["ts"],
    aggs={
       "product_url_hash": ["list", "count"]
    }
)
groupby_other_features =  bool_feats + cont_feats + ['product_sku_hash', 'product_action_filled' ,'session_id_hash', 'event_type',
                                                                             'price_bucket', 'main_category', 'category_hash'] + time_features >> \
    nvt.ops.Groupby(
    groupby_cols=["session_id_hash"], 
    sort_cols=["ts"],
    aggs={
        "product_action_filled": ["list"],     
        "event_type": ["list"],    
        "price_bucket": ["list"],
        "category_hash": ["list"],
        'has_been_added_to_cart': ["list"],
        'has_been_detailed': ["list"],
        'has_been_purchased': ["list"],
        'has_been_removed_from_cart': ["list"],
        'has_been_clicked': ["list"],
        'mean_price_hierarchy':["list"],
        'mean_price_main':["list"],
        'nb_interactions':["list"],
        'main_category': ["list"],
        'category_hash': ["list"],
        "ts": ["list", "first", "last"],
        #"is_test": ["last"],
        "is_search": ["last"],
        "timestamp": ["first"],
        'timestamp_day': ["list"],
        'timestamp_hour': ["list"],
        'timestamp_month': ["list"],
        'timestamp_wd': ["list"],
        'timestamp_age_days': ["list"],
        'timestamp_age_days_norm': ["list"],
        'timestamp_hour_sin': ["list"],
        'timestamp_hour_sin_norm': ["list"],
        'timestamp_hour_cos': ["list"],
        'timestamp_hour_cos_norm': ["list"],
        'timestamp_wd_sin': ["list"],
        'timestamp_wd_sin_norm': ["list"],
        'timestamp_wd_cos': ["list"],
        'timestamp_wd_cos_norm': ["list"],   
        },
    name_sep="-")

In [21]:
print(groupby_other_features)

<ColumnGroup Groupby output>


In [22]:
groupby_other_features.columns

['has_been_detailed-list',
 'timestamp_day-list',
 'product_action_filled-list',
 'ts-last',
 'event_type-list',
 'timestamp_age_days-list',
 'has_been_clicked-list',
 'has_been_purchased-list',
 'timestamp_hour_sin-list',
 'has_been_added_to_cart-list',
 'timestamp_age_days_norm-list',
 'timestamp_wd_cos-list',
 'category_hash-list',
 'ts-first',
 'timestamp-first',
 'timestamp_hour-list',
 'timestamp_wd_sin-list',
 'session_id_hash',
 'is_search-last',
 'mean_price_main-list',
 'main_category-list',
 'price_bucket-list',
 'timestamp_hour_cos-list',
 'nb_interactions-list',
 'timestamp_wd-list',
 'has_been_removed_from_cart-list',
 'mean_price_hierarchy-list',
 'ts-list']

In [23]:
groupby_only_product.columns

['product_sku_hash_count', 'session_id_hash', 'product_sku_hash_list']

In [24]:
groupby_product_url.columns

['product_url_hash_count', 'session_id_hash', 'product_url_hash_list']

In [25]:
#groupby_other_features_list = groupby_other_features['product_action_filled-list',
        #'event_type-list',
        #'price_bucket-list',
        #'category_hash-list',
        #'has_been_added_to_cart-list',
        #'has_been_detailed-list',
        #'has_been_purchased-list',
        #'has_been_removed_from_cart-list',
        #'has_been_clicked-list',
        #'mean_price_hierarchy-list',
        #'mean_price_main-list',
        #'nb_interactions-list',
        #'main_category-list',
        #'category_hash-list',
        #'ts-list',
        #'timestamp_day-list',
        #'timestamp_hour-list',
        #'timestamp_month-list',
        #'timestamp_wd-list',
        #'timestamp_age_days-list',
        #'timestamp_age_days_norm-list',
        #'timestamp_hour_sin-list',
        #'timestamp_hour_sin_norm-list',
        #'timestamp_hour_cos-list',
        #'timestamp_hour_cos_norm-list',
        #'timestamp_wd_sin-list',
        #'timestamp_wd_sin_norm-list',
        #'timestamp_wd_cos-list',
        #'timestamp_wd_cos_norm-list']

In [26]:
groupby_other_features_list = groupby_other_features['timestamp_hour-list',
 'price_bucket-list',
 'main_category-list',
 'timestamp_day-list',
 'has_been_detailed-list',
 'timestamp_hour_sin-list',
 'timestamp_wd_cos-list',
 'product_action_filled-list',
 'timestamp_age_days_norm-list',
 'timestamp_hour_cos-list',
 'has_been_removed_from_cart-list',
 'mean_price_main-list',
 'timestamp_age_days-list',
 'has_been_clicked-list',
 'nb_interactions-list',
 'mean_price_hierarchy-list',
 'timestamp_wd_sin-list',
 'timestamp_wd-list',
 'event_type-list',
 'category_hash-list',
 'ts-list',
 'has_been_purchased-list',
 'has_been_added_to_cart-list']

In [27]:
groupby_only_product_list = groupby_only_product['product_sku_hash_list']

In [28]:
groupby_product_url_list = groupby_product_url['product_url_hash_list']

In [29]:
SESSIONS_MAX_LENGTH = 20

In [30]:
groupby_other_features_list_trim = groupby_other_features_list >> nvt.ops.ListSlice(0,SESSIONS_MAX_LENGTH) >> nvt.ops.Rename(postfix = '_seq') 
groupby_only_product_list_trim = groupby_only_product_list >> nvt.ops.ListSlice(0,SESSIONS_MAX_LENGTH) >> nvt.ops.Rename(postfix = '_seq')
groupby_product_url_list_trim = groupby_product_url_list >> nvt.ops.ListSlice(0,SESSIONS_MAX_LENGTH) >> nvt.ops.Rename(postfix = '_seq')


In [31]:
groupby_other_features_list_trim

<ColumnGroup Rename output>

### Three workflows 

- workflow 1 : group other features that user interactions 

In [32]:
remaining_columns = [x for x in groupby_other_features.columns if x!= 'timestamp-first']
day_index = ((groupby_other_features - remaining_columns)  >> 
    nvt.ops.LambdaOp(lambda col: (col.max() - col).dt.days + 1) >> 
    nvt.ops.Rename(f = lambda col: "day_index")
)              

In [33]:
selected_features = groupby_other_features['ts-first', 'ts-last', 'is_search-last', 'session_id_hash', 'timestamp-first'] + groupby_other_features_list_trim + day_index

In [34]:
workflow = nvt.Workflow(selected_features)
dataset = nvt.Dataset(interactions_merged_df, cpu=False)
workflow.fit(dataset)
new_gdf_other = workflow.transform(dataset).to_ddf().compute()
#new_gdf_other = dataset
len(new_gdf_other)



4934725

In [35]:
#workflow = nvt.Workflow(selected_features)

In [36]:
#dataset = nvt.Dataset(interactions_merged_df, cpu=False)

In [37]:
#workflow.fit(dataset)

In [38]:
#workflow = nvt.Workflow(groupby_other_features + day_index)
#dataset = nvt.Dataset(interactions_merged_df, cpu=False)
#workflow.fit(dataset)
#new_gdf_other = workflow.transform(dataset).to_ddf().compute()
#new_gdf_other = dataset
#len(new_gdf_other)

In [39]:
#workflow = nvt.Workflow(selected_features)
#dataset = nvt.Dataset(interactions_merged_df, cpu=False)
#workflow.fit(dataset)

In [40]:
#workflow = nvt.Workflow(groupby_other_features + day_index)
#dataset = nvt.Dataset(interactions_merged_df, cpu=False)
#workflow.fit(dataset)

In [41]:
!nvidia-smi

Mon Dec  6 14:04:32 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 6000     On   | 00000000:AF:00.0 Off |                  Off |
| 33%   47C    P8    20W / 260W |   6334MiB / 24220MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [42]:
new_gdf_other.head(2)

Unnamed: 0,ts-first,ts-last,is_search-last,session_id_hash,timestamp-first,timestamp_hour-list_seq,price_bucket-list_seq,main_category-list_seq,timestamp_day-list_seq,has_been_detailed-list_seq,...,nb_interactions-list_seq,mean_price_hierarchy-list_seq,timestamp_wd_sin-list_seq,timestamp_wd-list_seq,event_type-list_seq,category_hash-list_seq,ts-list_seq,has_been_purchased-list_seq,has_been_added_to_cart-list_seq,day_index
0,1549828,1549829,0,1,2019-02-10 20:01:39.324,"[20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[-0.17000507, -0.17000507, -0.17000507, -0.170...","[0.0483035, 0.0483035, 0.0483035, 0.0483035, 0...","[1.1285199e-06, 1.1285199e-06, 1.1285199e-06, ...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1549828, 1549828, 1549828, 1549828, 1549828, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",64
1,1547824,1547827,0,2,2019-01-18 15:14:11.071,"[15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1...","[0, 6, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",...,"[-0.17000507, -0.17000507, -0.17000507, -0.170...","[0.0483035, 1.7211839, 0.0483035, 0.0483035, 0...","[-0.9749281, -0.9749281, -0.9749281, -0.974928...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, ...","[0, 28, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0...","[1547824, 1547824, 1547824, 1547824, 1547824, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",87


In [43]:
# save the workflow : 
workflow.save(os.path.join(OUTPUT_DIR, "workflow1"))

- workflow 2 : create the sequence of product interactions and pageviews

In [44]:
workflow = nvt.Workflow(groupby_product_url['product_url_hash_count', 'session_id_hash'] + groupby_product_url_list_trim)
dataset = nvt.Dataset(interactions_merged_df, cpu=False)
workflow.fit(dataset)
new_gdf_sku_url = workflow.transform(dataset).to_ddf().compute()
len(new_gdf_sku_url)



4934725

In [45]:
new_gdf_sku_url.head()

Unnamed: 0,product_url_hash_count,session_id_hash,product_url_hash_list_seq
0,199,1,"[117900, 251004, 267913, 457334, 192812, 28885..."
1,194,2,"[1238, 6082, 107904, 1031, 1698, 62086, 14822,..."
2,194,3,"[1961, 13130, 9931, 10762, 4366, 593, 149, 53,..."
3,193,4,"[148601, 58745, 13629, 4558, 179, 14633, 14634..."
4,191,5,"[23363, 5212, 1048, 7611, 136350, 53591, 10485..."


In [46]:
# save the workflow : 
workflow.save(os.path.join(OUTPUT_DIR, "workflow2"))

- workflow 3 : create sequence with only product interactions

In [47]:
workflow = nvt.Workflow(groupby_only_product['product_sku_hash_count', 'session_id_hash'] + groupby_only_product_list_trim)
dataset = nvt.Dataset(interactions_merged_df, cpu=False)
workflow.fit(dataset)
new_gdf_prod_only = workflow.transform(dataset).to_ddf().compute()
len(new_gdf_prod_only)



4934725

In [48]:
new_gdf_prod_only.head()

Unnamed: 0,product_sku_hash_count,session_id_hash,product_sku_hash_list_seq
0,199,1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,194,2,"[1, 2996, 1, 1, 1, 1, 1, 1, 1, 4148, 1, 1, 1, ..."
2,194,3,"[1, 1, 1, 1, 2153, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,193,4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,191,5,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [49]:
# save the workflow : 
workflow.save(os.path.join(OUTPUT_DIR, "workflow3"))

### Merge the three resulting frames 

In [50]:
sessions_gdf = new_gdf_sku_url.merge(new_gdf_other, on='session_id_hash',  how='inner')

In [51]:
sessions_gdf = sessions_gdf.merge(new_gdf_prod_only,  on='session_id_hash',  how='left' )

In [52]:
sessions_gdf.columns

Index(['product_url_hash_count', 'session_id_hash',
       'product_url_hash_list_seq', 'ts-first', 'ts-last', 'is_search-last',
       'timestamp-first', 'timestamp_hour-list_seq', 'price_bucket-list_seq',
       'main_category-list_seq', 'timestamp_day-list_seq',
       'has_been_detailed-list_seq', 'timestamp_hour_sin-list_seq',
       'timestamp_wd_cos-list_seq', 'product_action_filled-list_seq',
       'timestamp_age_days_norm-list_seq', 'timestamp_hour_cos-list_seq',
       'has_been_removed_from_cart-list_seq', 'mean_price_main-list_seq',
       'timestamp_age_days-list_seq', 'has_been_clicked-list_seq',
       'nb_interactions-list_seq', 'mean_price_hierarchy-list_seq',
       'timestamp_wd_sin-list_seq', 'timestamp_wd-list_seq',
       'event_type-list_seq', 'category_hash-list_seq', 'ts-list_seq',
       'has_been_purchased-list_seq', 'has_been_added_to_cart-list_seq',
       'day_index', 'product_sku_hash_count', 'product_sku_hash_list_seq'],
      dtype='object')

In [53]:
SELECTED_COLS =[ 'session_id_hash', 'day_index', 'product_sku_hash_count', 'product_sku_hash_list_seq',
                 'product_url_hash_count', 'product_url_hash_list_seq',
                 'has_been_removed_from_cart-list_seq', 'has_been_added_to_cart-list_seq',
                 'has_been_purchased-list_seq', 'has_been_detailed-list_seq', 'has_been_clicked-list_seq',
                 'event_type-list_seq', 'product_action_filled-list_seq',   'nb_interactions-list_seq',
                 'category_hash-list_seq', 'main_category-list_seq',
                 'price_bucket-list_seq', 'mean_price_hierarchy-list_seq',  'mean_price_main-list_seq',
                 'ts-first', 'ts-last',  'ts-list_seq', 'day_index',
                 'timestamp_hour_cos-list_seq', 'timestamp_hour_sin-list_seq', 'timestamp_wd_sin-list_seq', 'timestamp_wd_cos-list_seq',
                 'timestamp_age_days-list_seq', 'timestamp_age_days_norm-list_seq']
sessions_gdf = sessions_gdf[SELECTED_COLS]
sessions_gdf.head()

Unnamed: 0,session_id_hash,day_index,product_sku_hash_count,product_sku_hash_list_seq,product_url_hash_count,product_url_hash_list_seq,has_been_removed_from_cart-list_seq,has_been_added_to_cart-list_seq,has_been_purchased-list_seq,has_been_detailed-list_seq,...,mean_price_main-list_seq,ts-first,ts-last,ts-list_seq,timestamp_hour_cos-list_seq,timestamp_hour_sin-list_seq,timestamp_wd_sin-list_seq,timestamp_wd_cos-list_seq,timestamp_age_days-list_seq,timestamp_age_days_norm-list_seq
0,1697,63,83,"[1, 1, 2257, 1307, 3506, 589, 1, 1, 1934, 348,...",83,"[12238, 5616, 4571, 2751, 7099, 1298, 1086, 38...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, ...",...,"[0.04515953, 0.04515953, -2.4533634, -2.453363...",1549893,1549895,"[1549893, 1549893, 1549893, 1549893, 1549893, ...","[-0.8660252, -0.96592575, -0.96592575, -0.8660...","[-0.50000036, -0.2588193, -0.2588193, -0.50000...","[0.7818321, 0.7818321, 0.7818321, 0.7818321, 0...","[0.6234891, 0.6234891, 0.6234891, 0.6234891, 0...","[27, 26, 26, 12, 27, 27, 27, 27, 26, 27, 27, 2...","[0.21455763, 0.1829394, 0.1829394, -0.45249838..."
1,1698,86,83,"[17927, 5913, 1, 17927, 13412, 9803, 13412, 1,...",83,"[40865, 12114, 19703, 40865, 29120, 20705, 291...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, ...",...,"[0.04515953, 0.04515953, 0.04515953, 0.0451595...",1547917,1547921,"[1547917, 1547917, 1547917, 1547917, 1547917, ...","[-0.25881854, -0.25881854, -0.25881854, -0.258...","[-0.965926, -0.965926, -0.965926, -0.965926, -...","[-0.781831, -0.781831, -0.781831, -0.781831, -...","[0.6234904, 0.6234904, 0.6234904, 0.6234904, 0...","[1, 3, 3, 1, 2, 3, 2, 3, 4, 4, 3, 4, 4, 4, 2, ...","[-2.0798535, -1.4772274, -1.4772274, -2.079853..."
2,1699,72,83,"[1, 5520, 1, 6939, 1, 1, 1, 1548, 4420, 1, 942...",83,"[23, 11282, 537, 14259, 1, 4558, 14349, 3222, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ...",...,"[0.04515953, 2.1147482, 0.04515953, 0.04515953...",1549158,1549160,"[1549158, 1549158, 1549158, 1549158, 1549158, ...","[0.96592575, 0.96592575, 0.96592575, 0.9659257...","[0.25881928, 0.25881928, 0.25881928, 0.2588192...","[1.1285199e-06, 1.1285199e-06, 1.1285199e-06, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 1...","[-0.12256797, -0.12256797, -0.12256797, -0.122..."
3,1700,86,83,"[1, 10584, 7447, 1, 1, 7788, 1, 1, 7000, 8098,...",83,"[3, 22468, 15377, 453058, 1, 16149, 450590, 14...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, ...",...,"[0.04515953, -2.4533634, -2.4533634, 0.0451595...",1547931,1547936,"[1547931, 1547932, 1547932, 1547932, 1547932, ...","[0.707107, 0.707107, 0.707107, 0.707107, 0.707...","[-0.70710653, -0.70710653, -0.70710653, -0.707...","[-0.781831, -0.781831, -0.781831, -0.781831, -...","[0.6234904, 0.6234904, 0.6234904, 0.6234904, 0...","[4, 2, 4, 0, 4, 3, 0, 4, 4, 4, 4, 0, 0, 0, 3, ...","[-1.2832252, -1.7273399, -1.2832252, -2.682479..."
4,1701,10,83,"[1, 436, 1, 1, 1, 107, 1, 1, 1, 1, 1, 359, 433...",83,"[10, 1021, 139143, 87499, 66322, 356, 112548, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, ...",...,"[0.04515953, 0.04515953, 0.04515953, 0.0451595...",1554484,1554487,"[1554484, 1554484, 1554484, 1554484, 1554484, ...","[-0.25881854, -0.25881854, -0.25881854, -0.258...","[-0.965926, -0.965926, -0.965926, -0.965926, -...","[-0.9749281, -0.9749281, -0.9749281, -0.974928...","[-0.22252008, -0.22252008, -0.22252008, -0.222...","[80, 79, 17, 17, 17, 80, 10, 17, 80, 0, 80, 80...","[1.138079, 1.1272788, -0.16957423, -0.16957423..."


In [54]:
#SELECTED_COLS =[ 'session_id_hash', 'day_index', 'product_sku_hash_count', 'product_sku_hash_list',
                 #'product_url_hash_count', 'product_url_hash_list',
                 #'has_been_removed_from_cart-list', 'has_been_added_to_cart-list',
                 #'has_been_purchased-list', 'has_been_detailed-list', 'has_been_clicked-list',
                 #'event_type-list', 'product_action_filled-list',  'nb_interactions-list',
                 #'category_hash-list', 'main_category-list',
                 #'price_bucket-list', 'mean_price_hierarchy-list', 'mean_price_main-list',
                 #'ts-first', 'ts-last',  'ts-list', 'day_index',
                 #'timestamp_hour_cos-list', 'timestamp_hour_sin-list', 'timestamp_wd_sin-list', 'timestamp_wd_cos-list',
                 #'timestamp_age_days-list', 'timestamp_age_days_norm-list']
#sessions_gdf = sessions_gdf[SELECTED_COLS]
#sessions_gdf.head()

- check day data 

In [55]:
sessions_gdf['day_index'].min(), sessions_gdf['day_index'].max()

(1, 90)

In [56]:
sessions_gdf.groupby('day_index').size().sort_index()

day_index
1      43959
2      44370
3      43009
4      48863
5      43091
       ...  
86    104141
87     93046
88    103324
89     84445
90     56356
Length: 90, dtype: int32

### Remove sessions with one interaction in product_sku_url from train data 

In [57]:
filtered_sessions = sessions_gdf.columns >> \
                     nvt.ops.Filter(f=lambda df: ((df["product_url_hash_count"] >= MINIMUM_SESSION_LENGTH))) 
workflow = nvt.Workflow(filtered_sessions)
dataset = nvt.Dataset(sessions_gdf, cpu=False)
workflow.fit(dataset)
sessions_gdf = workflow.transform(dataset).to_ddf().compute()
len(sessions_gdf)

3291455

In [58]:
# save the workflow : 
workflow.save(os.path.join(OUTPUT_DIR, "workflow_etl"))

In [59]:
#save sessions_gdf
#sessions_gdf.to_parquet(os.path.join(OUTPUT_DIR, "sessions_gdf.parquet"))


In [60]:
#OUTPUT_FOLDER = "/workspace/coveo_dataset/"
#OUTPUT_FOLDER = "/recsys/sessions_by_day/"
#!mkdir -p $OUTPUT_FOLDER

In [61]:
#PARTITION_COL = 'day_index'

In [62]:
#from transformers4rec.data.preprocessing import save_time_based_splits

In [63]:
#from nvtabular import ColumnSelector

In [64]:
#save_time_based_splits(data=nvt.Dataset(sessions_gdf),
                       #output_dir= OUTPUT_FOLDER,
                       #partition_col=PARTITION_COL,
                       #timestamp_col='session_id_hash',
                      #)

### Create cross-validation folds 

- Define random 5 folds column 

In [65]:
#from sklearn.model_selection import GroupKFold
#sessions_gdf['fold'] = np.random.randint(1,6, sessions_gdf.shape[0]) 

In [66]:
#sessions_gdf.fold.value_counts()

- Reserve the 3 last weeks for validation 

In [67]:
#sessions_gdf['is_valid'] = 0 
#sessions_gdf.loc[((sessions_gdf['is_test-last']==0) & (sessions_gdf['day_index']<=50)), 'is_valid'] = 1

In [68]:
#sessions_gdf.is_valid.value_counts()

In [69]:
#sessions_gdf['is_test-last'].value_counts()

In [70]:
#sessions_gdf.head(3)

### Un-hash session id 

In [71]:
#!OUTPUT_DIR

In [72]:
#session_map = cudf.read_parquet('./categories/unique.session_id_hash.parquet').reset_index()
session_map = cudf.read_parquet('/recsys/coveo_task1_v42/categorify_workflow/categories/unique.session_id_hash.parquet').reset_index()
session_map

Unnamed: 0,index,session_id_hash,session_id_hash_count
0,0,,199
1,1,9161e328d6e73f38c7685628a376a5e881f359bf572f21...,194
2,2,5976de395f32af07016a5ec071b58eb1cb11a68ba3dba1...,194
3,3,de6ab39acc588afc3e81e9d118b070914510c12eb89cd4...,193
4,4,282e0834212dfb37ce70aa3ff1be2f9a0cef9abb62e7b2...,191
...,...,...,...
4934721,4934721,ffffd1c3b292923b8537ba92d8b14fcfab70d3ff0b5e9b...,1
4934722,4934722,ffffe890d31f09004d88e6857086e585625e0fdae97266...,1
4934723,4934723,ffffeb414c630bfb0607557bc8162611d9c356aba999dc...,1
4934724,4934724,ffffed918e1086333206d96f1fae684de5774866848bb3...,1


In [73]:
session_map = session_map.drop(['session_id_hash_count'], axis=1)

In [74]:
session_map

Unnamed: 0,index,session_id_hash
0,0,
1,1,9161e328d6e73f38c7685628a376a5e881f359bf572f21...
2,2,5976de395f32af07016a5ec071b58eb1cb11a68ba3dba1...
3,3,de6ab39acc588afc3e81e9d118b070914510c12eb89cd4...
4,4,282e0834212dfb37ce70aa3ff1be2f9a0cef9abb62e7b2...
...,...,...
4934721,4934721,ffffd1c3b292923b8537ba92d8b14fcfab70d3ff0b5e9b...
4934722,4934722,ffffe890d31f09004d88e6857086e585625e0fdae97266...
4934723,4934723,ffffeb414c630bfb0607557bc8162611d9c356aba999dc...
4934724,4934724,ffffed918e1086333206d96f1fae684de5774866848bb3...


In [75]:
#session_map = cudf.read_parquet('./categories/unique.session_id_hash.parquet').reset_index()
#session_map = cudf.read_parquet('/recsys/coveo_task1_v4/categorify_workflow/categories/unique.session_id_hash.parquet').reset_index()
#session_map

## Merge browsing session and search sessions 

In [76]:
session_map.columns = ['session_id_hash', 'original_session_id_hash']
sessions_gdf = sessions_gdf.merge(session_map, on=['session_id_hash'], how='left')

In [77]:
sessions_gdf.head(3)

Unnamed: 0,session_id_hash,day_index,product_sku_hash_count,product_sku_hash_list_seq,product_url_hash_count,product_url_hash_list_seq,has_been_removed_from_cart-list_seq,has_been_added_to_cart-list_seq,has_been_purchased-list_seq,has_been_detailed-list_seq,...,ts-first,ts-last,ts-list_seq,timestamp_hour_cos-list_seq,timestamp_hour_sin-list_seq,timestamp_wd_sin-list_seq,timestamp_wd_cos-list_seq,timestamp_age_days-list_seq,timestamp_age_days_norm-list_seq,original_session_id_hash
0,27425,72,38,"[7820, 1, 1777, 3327, 1, 1, 2672, 1, 1, 3592, ...",38,"[16222, 380, 3691, 6737, 10047, 11236, 5442, 4...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, ...",...,1549157,1549159,"[1549157, 1549157, 1549157, 1549157, 1549157, ...","[0.96592575, 0.96592575, 0.96592575, 0.9659257...","[0.25881928, 0.25881928, 0.25881928, 0.2588192...","[1.1285199e-06, 1.1285199e-06, 1.1285199e-06, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[17, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[-0.16957423, -0.12256797, -0.12256797, -0.122...",2094bca39a1c76f54af040202954fdd6a9b4fb49d44c87...
1,27426,64,38,"[1, 1674, 1, 123, 1, 1, 1, 340, 881, 1, 85, 18...",38,"[32, 3486, 879, 395, 7045, 3425, 8174, 843, 18...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, ...",...,1549838,1549839,"[1549838, 1549838, 1549838, 1549838, 1549838, ...","[0.8660258, 0.8660258, 0.8660258, 0.8660258, 0...","[-0.49999934, -0.49999934, -0.49999934, -0.499...","[1.1285199e-06, 1.1285199e-06, 1.1285199e-06, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[26, 15, 1, 1, 10, 10, 8, 26, 25, 10, 19, 26, ...","[0.1829394, -0.2719755, -2.0798535, -2.0798535...",2095ede31d5bea6078b98b38f399653b42b5053c16c953...
2,27427,82,38,"[1, 3473, 1, 1, 1, 1, 606, 11, 1, 1, 1, 178, 1...",38,"[1, 7024, 1085, 26, 1881, 10543, 1326, 95, 117...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, ...",...,1548265,1548267,"[1548265, 1548266, 1548266, 1548266, 1548266, ...","[-0.25881854, -0.25881854, -0.25881854, -0.258...","[-0.965926, -0.965926, -0.965926, -0.965926, -...","[0.43388295, 0.43388295, 0.43388295, 0.4338829...","[-0.90096927, -0.90096927, -0.90096927, -0.900...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 0, 8, 8, 8, 8, ...","[-0.7722002, -0.7722002, -0.7722002, -0.772200...",20bf3fb9ed342eac74781a9a65afb2da0c11adf135e4de...


### Truncate train and test sessions to last 30 elements

In [78]:
#[x for x in sessions_gdf.columns if '_list' in x or '-list' in x ]

In [79]:
#features_list = [x for x in sessions_gdf.columns if '_list' in x or '-list' in x ]
#features_list

In [80]:
#other_cols = list(set(sessions_gdf.columns).difference(set(features_list)))
#other_cols

In [81]:
#features_list = [x for x in sessions_gdf.columns if '_list' in x or '-list' in x ]
#features_trim =  features_list >> nvt.ops.ListSlice(-30)
#other_cols = list(set(sessions_gdf.columns).difference(set(features_list)))
#workflow = nvt.Workflow(other_cols + features_trim)
#dataset = nvt.Dataset(sessions_gdf[sessions_gdf.is_valid == 0], cpu=False)
#workflow.fit(dataset)
#trimed_cols = workflow.transform(dataset).to_ddf().compute()
#len(trimed_cols)

In [82]:
#trimed_cols.head(2)

### Merge back with validation session

In [83]:
#final_session_gdf = cudf.concat([sessions_gdf[sessions_gdf.is_valid !=0], trimed_cols])

In [84]:
#final_session_gdf.head(3)

In [85]:
#final_session_gdf.shape

In [86]:
final_session_gdf = sessions_gdf

In [87]:
final_session_gdf.shape

(3291455, 29)

In [88]:
final_session_gdf.head()

Unnamed: 0,session_id_hash,day_index,product_sku_hash_count,product_sku_hash_list_seq,product_url_hash_count,product_url_hash_list_seq,has_been_removed_from_cart-list_seq,has_been_added_to_cart-list_seq,has_been_purchased-list_seq,has_been_detailed-list_seq,...,ts-first,ts-last,ts-list_seq,timestamp_hour_cos-list_seq,timestamp_hour_sin-list_seq,timestamp_wd_sin-list_seq,timestamp_wd_cos-list_seq,timestamp_age_days-list_seq,timestamp_age_days_norm-list_seq,original_session_id_hash
0,27425,72,38,"[7820, 1, 1777, 3327, 1, 1, 2672, 1, 1, 3592, ...",38,"[16222, 380, 3691, 6737, 10047, 11236, 5442, 4...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, ...",...,1549157,1549159,"[1549157, 1549157, 1549157, 1549157, 1549157, ...","[0.96592575, 0.96592575, 0.96592575, 0.9659257...","[0.25881928, 0.25881928, 0.25881928, 0.2588192...","[1.1285199e-06, 1.1285199e-06, 1.1285199e-06, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[17, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[-0.16957423, -0.12256797, -0.12256797, -0.122...",2094bca39a1c76f54af040202954fdd6a9b4fb49d44c87...
1,27426,64,38,"[1, 1674, 1, 123, 1, 1, 1, 340, 881, 1, 85, 18...",38,"[32, 3486, 879, 395, 7045, 3425, 8174, 843, 18...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, ...",...,1549838,1549839,"[1549838, 1549838, 1549838, 1549838, 1549838, ...","[0.8660258, 0.8660258, 0.8660258, 0.8660258, 0...","[-0.49999934, -0.49999934, -0.49999934, -0.499...","[1.1285199e-06, 1.1285199e-06, 1.1285199e-06, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[26, 15, 1, 1, 10, 10, 8, 26, 25, 10, 19, 26, ...","[0.1829394, -0.2719755, -2.0798535, -2.0798535...",2095ede31d5bea6078b98b38f399653b42b5053c16c953...
2,27427,82,38,"[1, 3473, 1, 1, 1, 1, 606, 11, 1, 1, 1, 178, 1...",38,"[1, 7024, 1085, 26, 1881, 10543, 1326, 95, 117...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, ...",...,1548265,1548267,"[1548265, 1548266, 1548266, 1548266, 1548266, ...","[-0.25881854, -0.25881854, -0.25881854, -0.258...","[-0.965926, -0.965926, -0.965926, -0.965926, -...","[0.43388295, 0.43388295, 0.43388295, 0.4338829...","[-0.90096927, -0.90096927, -0.90096927, -0.900...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 0, 8, 8, 8, 8, ...","[-0.7722002, -0.7722002, -0.7722002, -0.772200...",20bf3fb9ed342eac74781a9a65afb2da0c11adf135e4de...
3,27428,86,38,"[1, 6994, 22678, 6994, 1, 1, 1, 1, 1, 1, 1, 1,...",38,"[17, 14381, 57807, 14381, 7, 23, 1, 98, 10766,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",...,1547917,1547919,"[1547917, 1547917, 1547917, 1547917, 1547917, ...","[-0.25881854, -0.25881854, -0.25881854, -0.258...","[-0.965926, -0.965926, -0.965926, -0.965926, -...","[-0.781831, -0.781831, -0.781831, -0.781831, -...","[0.6234904, 0.6234904, 0.6234904, 0.6234904, 0...","[4, 3, 1, 3, 4, 4, 4, 4, 3, 4, 3, 3, 3, 3, 3, ...","[-1.2832252, -1.4772274, -2.0798535, -1.477227...",20eb743371ef33ca3a5fbb2c5c82b524b7bd96eff52f74...
4,27429,51,38,"[1, 1, 1, 1, 1, 1, 958, 7475, 4177, 11717, 225...",38,"[59, 14, 3, 553, 28006, 2481, 2022, 15440, 846...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, ...",...,1550966,1550969,"[1550966, 1550966, 1550966, 1550966, 1550966, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[2.6179939e-07, 2.6179939e-07, 2.6179939e-07, ...","[1.1285199e-06, 1.1285199e-06, 1.1285199e-06, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[39, 39, 39, 39, 39, 39, 39, 39, 38, 39, 39, 3...","[0.5246528, 0.5246528, 0.5246528, 0.5246528, 0...",216a57d50eeacb6a798605524ee9159c3bda40a331df0c...


In [89]:
#save sessions_gdf
final_session_gdf.to_parquet(os.path.join(OUTPUT_DIR, "final_session_gdf.parquet"))

### Add search context 

In [90]:
search_table = cudf.read_parquet(OUTPUT_DIR+"/session_search.parquet")


In [91]:
search_table.columns  = ['original_session_id_hash', 'flat_query_vector', 'flat_product_skus_hash',
       'flat_clicked_skus_hash', 'impressions_size', 'clicks_size','nb_queries',
       'clicked-flag']

In [92]:
merged_session_table = final_session_gdf.merge(search_table, on=['original_session_id_hash'], how='left')

In [93]:
del final_session_gdf, search_table

# Process missing values 

In [94]:
merged_session_table = merged_session_table.to_pandas()

In [95]:
merged_session_table.head()

Unnamed: 0,session_id_hash,day_index,product_sku_hash_count,product_sku_hash_list_seq,product_url_hash_count,product_url_hash_list_seq,has_been_removed_from_cart-list_seq,has_been_added_to_cart-list_seq,has_been_purchased-list_seq,has_been_detailed-list_seq,...,timestamp_age_days-list_seq,timestamp_age_days_norm-list_seq,original_session_id_hash,flat_query_vector,flat_product_skus_hash,flat_clicked_skus_hash,impressions_size,clicks_size,nb_queries,clicked-flag
0,64548,42,28,"[2278, 1, 1, 1, 352, 1, 1, 2911, 1, 3184, 1, 1...",28,"[4621, 2010, 10385, 68, 864, 1, 9252, 5913, 36...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",...,"[40, 40, 46, 47, 47, 47, 47, 47, 47, 47, 40, 4...","[0.5461206, 0.5461206, 0.66486025, 0.6831641, ...",7322346ca76333a3f57beb488486e3a7103b9b8ae5b81f...,"[0.00867827795445919, -0.3432416021823883, 0.1...",[bfced2942872e370277a90303b87623d3f1ae629be8cf...,[c660da5063e4ea50ba2cbf1d266c9dccddddb5d85f4c2...,"[0.0, 25.0, 12.0]","[0.0, 0.0, 1.0]",3.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,64552,32,28,"[2439, 1, 1, 144, 1, 1393, 1, 1, 1, 5544, 1, 1...",28,"[4982, 1, 12, 446, 1071, 2921, 14, 5386, 2, 11...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",...,"[31, 58, 58, 33, 12, 33, 58, 8, 58, 37, 24, 57...","[0.3306505, 0.86255425, 0.86255425, 0.38335788...",7350b2ee3456c1ce3b438fa1c0a14f79d229a3644988ec...,"[-0.002791411941871047, 0.05408860743045807, -...",[7d310ca5f5c940b78058f8c47a765db2c558258c923c7...,[31540ffce1e058f4acf13d7994c9988abbf61dc91039f...,"[25.0, 2.0, 5.0]","[0.0, 1.0, 1.0]",3.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,64560,59,28,"[1, 1496, 1, 380, 1, 1, 1, 158, 1, 158, 1, 1, ...",28,"[11164, 3120, 5, 912, 13, 982, 357, 494, 1, 49...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...","[0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...",...,"[29, 30, 31, 30, 9, 31, 31, 31, 31, 31, 31, 31...","[0.2745404, 0.30304796, 0.3306505, 0.30304796,...",73971f9c28a976320ba32f08cf4557b8b58e2b9fa1de6f...,"[0.1476699411869049, -0.35439008474349976, 0.0...",[8a6176eac1d792fa81d02bc0d76970f967f7b3040fcce...,[667479857189eee1338e962ca01e66f1e0de83f9a6ea9...,"[25.0, 10.0]","[0.0, 2.0]",2.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,64563,56,28,"[1, 1, 21286, 1, 1, 1, 1, 1, 1, 13378, 15451, ...",28,"[2, 64, 52225, 312, 549, 77116, 1178, 1, 16939...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, ...",...,"[34, 34, 34, 34, 34, 33, 34, 34, 34, 33, 32, 1...","[0.4085599, 0.4085599, 0.4085599, 0.4085599, 0...",73bb059b6e3048f9775cfd147bf49c1655f057e356d7be...,"[0.08299337327480316, -0.2532674968242645, 0.0...",[7b28d98f05355a7e3a930e64648375e4603a144e20ea2...,[missing],"[25.0, 5.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0]",4.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,64568,87,28,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9138, 1933, ...",28,"[2165, 2, 84, 14027, 5778, 8660, 26650, 137, 3...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ...",...,"[2, 3, 2, 2, 2, 2, 2, 3, 2, 3, 1, 2, 2, 2, 2, ...","[-1.7273399, -1.4772274, -1.7273399, -1.727339...",73eb8785b21d6395b74723845cbf6f5b31a5d3fce99bac...,"[-0.04980561137199402, -0.1424727439880371, -0...",[cb683bec5780e7cef18967a884c195f3805673abd6e64...,[missing],[13.0],[0.0],1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [96]:
merged_session_table.loc[merged_session_table.flat_product_skus_hash.isna(), 
                         'flat_product_skus_hash'] = pd.Series([['missing']] * merged_session_table.flat_product_skus_hash.isna().sum()).values

In [97]:
merged_session_table.loc[merged_session_table.flat_clicked_skus_hash.isnull(),
                         'flat_clicked_skus_hash'] = pd.Series([['missing']] * merged_session_table.flat_clicked_skus_hash.isnull().sum()).values

In [98]:
merged_session_table.loc[merged_session_table.flat_query_vector.isnull(),
                         'flat_query_vector'] = pd.Series([np.zeros(50)] * merged_session_table.flat_query_vector.isnull().sum()).values

In [99]:
merged_session_table.loc[merged_session_table.impressions_size.isnull(),
                         'impressions_size'] = pd.Series([[0]] * merged_session_table.impressions_size.isnull().sum()).values

In [100]:
merged_session_table.nb_queries.fillna(0, inplace=True)

In [101]:
merged_session_table.head(5)

Unnamed: 0,session_id_hash,day_index,product_sku_hash_count,product_sku_hash_list_seq,product_url_hash_count,product_url_hash_list_seq,has_been_removed_from_cart-list_seq,has_been_added_to_cart-list_seq,has_been_purchased-list_seq,has_been_detailed-list_seq,...,timestamp_age_days-list_seq,timestamp_age_days_norm-list_seq,original_session_id_hash,flat_query_vector,flat_product_skus_hash,flat_clicked_skus_hash,impressions_size,clicks_size,nb_queries,clicked-flag
0,64548,42,28,"[2278, 1, 1, 1, 352, 1, 1, 2911, 1, 3184, 1, 1...",28,"[4621, 2010, 10385, 68, 864, 1, 9252, 5913, 36...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",...,"[40, 40, 46, 47, 47, 47, 47, 47, 47, 47, 40, 4...","[0.5461206, 0.5461206, 0.66486025, 0.6831641, ...",7322346ca76333a3f57beb488486e3a7103b9b8ae5b81f...,"[0.00867827795445919, -0.3432416021823883, 0.1...",[bfced2942872e370277a90303b87623d3f1ae629be8cf...,[c660da5063e4ea50ba2cbf1d266c9dccddddb5d85f4c2...,"[0.0, 25.0, 12.0]","[0.0, 0.0, 1.0]",3.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,64552,32,28,"[2439, 1, 1, 144, 1, 1393, 1, 1, 1, 5544, 1, 1...",28,"[4982, 1, 12, 446, 1071, 2921, 14, 5386, 2, 11...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",...,"[31, 58, 58, 33, 12, 33, 58, 8, 58, 37, 24, 57...","[0.3306505, 0.86255425, 0.86255425, 0.38335788...",7350b2ee3456c1ce3b438fa1c0a14f79d229a3644988ec...,"[-0.002791411941871047, 0.05408860743045807, -...",[7d310ca5f5c940b78058f8c47a765db2c558258c923c7...,[31540ffce1e058f4acf13d7994c9988abbf61dc91039f...,"[25.0, 2.0, 5.0]","[0.0, 1.0, 1.0]",3.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,64560,59,28,"[1, 1496, 1, 380, 1, 1, 1, 158, 1, 158, 1, 1, ...",28,"[11164, 3120, 5, 912, 13, 982, 357, 494, 1, 49...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...","[0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...",...,"[29, 30, 31, 30, 9, 31, 31, 31, 31, 31, 31, 31...","[0.2745404, 0.30304796, 0.3306505, 0.30304796,...",73971f9c28a976320ba32f08cf4557b8b58e2b9fa1de6f...,"[0.1476699411869049, -0.35439008474349976, 0.0...",[8a6176eac1d792fa81d02bc0d76970f967f7b3040fcce...,[667479857189eee1338e962ca01e66f1e0de83f9a6ea9...,"[25.0, 10.0]","[0.0, 2.0]",2.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,64563,56,28,"[1, 1, 21286, 1, 1, 1, 1, 1, 1, 13378, 15451, ...",28,"[2, 64, 52225, 312, 549, 77116, 1178, 1, 16939...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, ...",...,"[34, 34, 34, 34, 34, 33, 34, 34, 34, 33, 32, 1...","[0.4085599, 0.4085599, 0.4085599, 0.4085599, 0...",73bb059b6e3048f9775cfd147bf49c1655f057e356d7be...,"[0.08299337327480316, -0.2532674968242645, 0.0...",[7b28d98f05355a7e3a930e64648375e4603a144e20ea2...,[missing],"[25.0, 5.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0]",4.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,64568,87,28,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9138, 1933, ...",28,"[2165, 2, 84, 14027, 5778, 8660, 26650, 137, 3...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ...",...,"[2, 3, 2, 2, 2, 2, 2, 3, 2, 3, 1, 2, 2, 2, 2, ...","[-1.7273399, -1.4772274, -1.7273399, -1.727339...",73eb8785b21d6395b74723845cbf6f5b31a5d3fce99bac...,"[-0.04980561137199402, -0.1424727439880371, -0...",[cb683bec5780e7cef18967a884c195f3805673abd6e64...,[missing],[13.0],[0.0],1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


### Encode flat_product_skus_hash and flat_clicked_skus_hash using product_url_hash mapping

- load mapping 

In [102]:
mapping = pd.read_parquet(OUTPUT_DIR+"/categorify_workflow/categories/unique.product_url_hash.parquet")

In [103]:
mapping_dict = dict(zip(mapping.product_url_hash,mapping.index))

- Update mapping with unseen browsing products ids present in `flat_product_skus_hash` and `flat_clicked_skus_hash`

In [104]:
prods = [e for impression in merged_session_table.flat_product_skus_hash.values for e in impression]
clicked_prod = [e for impression in merged_session_table.flat_clicked_skus_hash.values for e in impression]
all_prods = set(prods + clicked_prod)
new_prods = all_prods.difference(set(mapping.product_url_hash))
print('Number of products present in search and not in browsing is: %s' %len(new_prods))

Number of products present in search and not in browsing is: 949


In [105]:
cardinality = len(mapping_dict)
new_ids = list(range(cardinality, cardinality+len(new_prods)))
new_dict = dict(zip(new_prods, new_ids))
mapping_dict.update(new_dict)

- Encode list columns :  `flat_product_skus_hash` and `flat_clicked_skus_hash`

In [106]:
def encode_list(x): 
    return [mapping_dict[e] for e in x]

In [107]:
merged_session_table.flat_product_skus_hash = merged_session_table.flat_product_skus_hash.progress_apply(encode_list)
merged_session_table.flat_clicked_skus_hash = merged_session_table.flat_clicked_skus_hash.progress_apply(encode_list)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 3291455/3291455 [00:11<00:00, 290746.18it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 3291455/3291455 [00:06<00:00, 485967.61it/s]


In [108]:
merged_session_table.head()

Unnamed: 0,session_id_hash,day_index,product_sku_hash_count,product_sku_hash_list_seq,product_url_hash_count,product_url_hash_list_seq,has_been_removed_from_cart-list_seq,has_been_added_to_cart-list_seq,has_been_purchased-list_seq,has_been_detailed-list_seq,...,timestamp_age_days-list_seq,timestamp_age_days_norm-list_seq,original_session_id_hash,flat_query_vector,flat_product_skus_hash,flat_clicked_skus_hash,impressions_size,clicks_size,nb_queries,clicked-flag
0,64548,42,28,"[2278, 1, 1, 1, 352, 1, 1, 2911, 1, 3184, 1, 1...",28,"[4621, 2010, 10385, 68, 864, 1, 9252, 5913, 36...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, ...",...,"[40, 40, 46, 47, 47, 47, 47, 47, 47, 47, 40, 4...","[0.5461206, 0.5461206, 0.66486025, 0.6831641, ...",7322346ca76333a3f57beb488486e3a7103b9b8ae5b81f...,"[0.00867827795445919, -0.3432416021823883, 0.1...","[6056, 3598, 10943, 7062, 7926, 9107, 4452, 12...",[6983],"[0.0, 25.0, 12.0]","[0.0, 0.0, 1.0]",3.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,64552,32,28,"[2439, 1, 1, 144, 1, 1393, 1, 1, 1, 5544, 1, 1...",28,"[4982, 1, 12, 446, 1071, 2921, 14, 5386, 2, 11...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",...,"[31, 58, 58, 33, 12, 33, 58, 8, 58, 37, 24, 57...","[0.3306505, 0.86255425, 0.86255425, 0.38335788...",7350b2ee3456c1ce3b438fa1c0a14f79d229a3644988ec...,"[-0.002791411941871047, 0.05408860743045807, -...","[126942, 37103, 43807, 64264, 53843, 61638, 77...","[11340, 15677]","[25.0, 2.0, 5.0]","[0.0, 1.0, 1.0]",3.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,64560,59,28,"[1, 1496, 1, 380, 1, 1, 1, 158, 1, 158, 1, 1, ...",28,"[11164, 3120, 5, 912, 13, 982, 357, 494, 1, 49...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...","[0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...",...,"[29, 30, 31, 30, 9, 31, 31, 31, 31, 31, 31, 31...","[0.2745404, 0.30304796, 0.3306505, 0.30304796,...",73971f9c28a976320ba32f08cf4557b8b58e2b9fa1de6f...,"[0.1476699411869049, -0.35439008474349976, 0.0...","[8650, 1342, 223, 3311, 654, 624, 9486, 6577, ...","[494, 494]","[25.0, 10.0]","[0.0, 2.0]",2.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,64563,56,28,"[1, 1, 21286, 1, 1, 1, 1, 1, 1, 13378, 15451, ...",28,"[2, 64, 52225, 312, 549, 77116, 1178, 1, 16939...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, ...",...,"[34, 34, 34, 34, 34, 33, 34, 34, 34, 33, 32, 1...","[0.4085599, 0.4085599, 0.4085599, 0.4085599, 0...",73bb059b6e3048f9775cfd147bf49c1655f057e356d7be...,"[0.08299337327480316, -0.2532674968242645, 0.0...","[33001, 50314, 34478, 50973, 40885, 36747, 384...",[488659],"[25.0, 5.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0]",4.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,64568,87,28,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9138, 1933, ...",28,"[2165, 2, 84, 14027, 5778, 8660, 26650, 137, 3...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ...",...,"[2, 3, 2, 2, 2, 2, 2, 3, 2, 3, 1, 2, 2, 2, 2, ...","[-1.7273399, -1.4772274, -1.7273399, -1.727339...",73eb8785b21d6395b74723845cbf6f5b31a5d3fce99bac...,"[-0.04980561137199402, -0.1424727439880371, -0...","[6664, 2979, 1535, 200, 3167, 1985, 9076, 2895...",[488659],[13.0],[0.0],1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


- Save updated mapping 

In [109]:
pd.DataFrame(mapping_dict.keys(), columns=['product_url_hash']).to_parquet(os.path.join(OUTPUT_DIR,
                                                                                        'unique.updated_product_url_hash.parquet'))

### Create clicked-flag : 

- check if element of `flat_product_skus_hash` was clicked or not.

P.S: We ignore the click if interacted item is in the list of clicks (True in 1% of search interactions). 

In [110]:
merged_session_table['clicked-flag'] = merged_session_table.progress_apply(lambda x: [int(e in  list(set(x['flat_clicked_skus_hash']).difference(set(x['product_sku_hash_list_seq'])))) \
                                                                          for e in x['flat_product_skus_hash']], axis=1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3291455/3291455 [02:41<00:00, 20384.25it/s]


In [111]:
merged_session_table[1020:1030]

Unnamed: 0,session_id_hash,day_index,product_sku_hash_count,product_sku_hash_list_seq,product_url_hash_count,product_url_hash_list_seq,has_been_removed_from_cart-list_seq,has_been_added_to_cart-list_seq,has_been_purchased-list_seq,has_been_detailed-list_seq,...,timestamp_age_days-list_seq,timestamp_age_days_norm-list_seq,original_session_id_hash,flat_query_vector,flat_product_skus_hash,flat_clicked_skus_hash,impressions_size,clicks_size,nb_queries,clicked-flag
1020,31933,48,36,"[1, 24114, 9213, 9857, 11042, 1, 8988, 15137, ...",36,"[5330, 64936, 19365, 20831, 23500, 1, 18848, 3...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, ...",...,"[41, 36, 40, 40, 40, 42, 41, 41, 42, 4, 0, 41,...","[0.56707126, 0.45687267, 0.5461206, 0.5461206,...",20475ac677758125e6e3f7e6eec49c9c986e3cae0acd5a...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[488659],[488659],[0],,0.0,[1]
1021,31934,87,36,"[1, 1, 1, 1, 1, 1, 936, 1, 1, 1, 1, 1341, 1, 1...",36,"[1588, 29308, 29601, 1, 10313, 247963, 1974, 5...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, ...",...,"[3, 2, 3, 3, 3, 0, 3, 1, 3, 3, 3, 3, 0, 1, 3, ...","[-1.4772274, -1.7273399, -1.4772274, -1.477227...",20533b32b935b77385e081f0255d543ff57a0ee3761a58...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[488659],[488659],[0],,0.0,[1]
1022,31935,71,36,"[2, 3755, 2, 7, 1, 1, 1, 1, 7, 1, 2, 1, 7, 2, ...",36,"[8, 7609, 8, 66, 33, 3, 1856, 35, 66, 25, 8, 1...","[1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 1...","[-0.077973224, -0.077973224, -0.077973224, -0....",2056b037da7667967ea6b8aade069473cf514e88adf032...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[488659],[488659],[0],,0.0,[1]
1023,31936,88,36,"[7402, 1, 1, 1, 7890, 1, 8302, 7719, 10174, 1,...",36,"[15279, 24442, 1, 15, 16379, 109, 17355, 16004...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, ...",...,"[2, 0, 2, 2, 0, 2, 1, 0, 1, 0, 2, 0, 1, 1, 1, ...","[-1.7273399, -2.6824794, -1.7273399, -1.727339...",205be498f2ce779cdaafab6b11862de815f92594464dbf...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[488659],[488659],[0],,0.0,[1]
1024,42913,19,33,"[1, 1, 1, 1, 1, 1, 1, 1, 9890, 1, 5193, 1, 477...",33,"[1420, 14905, 3, 907, 28643, 7, 9, 9929, 20891...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, ...",...,"[71, 70, 71, 70, 70, 71, 71, 70, 70, 70, 70, 7...","[1.0356778, 1.0235182, 1.0356778, 1.0235182, 1...",c900d891216704d06edd5081f2c0531d8871aaaac71c87...,"[-0.06847687065601349, -0.15405967831611633, 0...",[488659],[488659],[0.0],[0.0],1.0,[1]
1025,42915,19,33,"[1, 1, 300, 223, 1, 1, 1, 1, 1, 1435, 300, 1, ...",33,"[2, 1779, 776, 637, 1, 1850, 39990, 39089, 585...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, ...",...,"[71, 69, 70, 71, 71, 71, 8, 8, 8, 71, 70, 71, ...","[1.0356778, 1.0111856, 1.0235182, 1.0356778, 1...",c922840659415c1b7d4bbc6ead2f13ea73ec9db09f5cf3...,"[-0.03456311300396919, -0.37039944529533386, 0...","[37153, 637, 7594, 1804, 184, 23901, 776, 5566...","[776, 776]",[20.0],[2.0],1.0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1026,42916,62,33,"[4382, 1, 1, 7977, 1, 2664, 5522, 10560, 14398...",33,"[8907, 267, 24, 16576, 1975, 5427, 11285, 2239...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, ...",...,"[28, 28, 28, 28, 28, 27, 27, 26, 26, 28, 28, 2...","[0.24506608, 0.24506608, 0.24506608, 0.2450660...",c94b0f64ef8289b518be7df27189986c7acf5efdd1a45b...,"[-0.2519949972629547, 0.06259388476610184, -0....","[5894, 19090, 5364, 9140, 10332, 5427, 4755, 1...",[488659],[25.0],[0.0],1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1027,42917,36,33,"[1, 1, 1, 1435, 1, 1994, 1, 4590, 1, 1, 4510, ...",33,"[489, 2762, 5479, 2994, 24, 4079, 1, 9313, 141...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, ...",...,"[54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 53, 5...","[0.80151826, 0.80151826, 0.80151826, 0.8015182...",c9675e535ea1a74af342efed7db1f28e14d81603dded2c...,"[0.027160272002220154, -0.2507864236831665, 0....","[15233, 12142, 13315, 32262, 13534, 29773, 209...",[9102],"[0.0, 9.0, 11.0]","[0.0, 0.0, 1.0]",3.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1028,42922,12,33,"[1, 1, 1, 1, 12193, 1, 1, 11570, 1, 1, 1, 1745...",33,"[3081, 2214, 9, 3, 26217, 7, 58330, 24750, 15,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ...",...,"[77, 78, 78, 78, 74, 78, 35, 78, 78, 78, 78, 4...","[1.1052674, 1.1163425, 1.1163425, 1.1163425, 1...",c9ca05cbfaf2a2d1921a5067e3422eb8292e32552e78a1...,"[-0.11631450802087784, -0.10871367156505585, -...","[1024, 207, 336, 477, 2375, 786, 6467, 650, 62...",[488659],[25.0],[0.0],1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1029,42924,42,33,"[1, 1, 3436, 1, 1, 95, 1, 1, 1, 1, 1, 1, 1, 95...",33,"[12, 1216, 6961, 11, 4582, 332, 2, 82, 20342, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",...,"[48, 48, 48, 48, 48, 48, 48, 48, 21, 48, 38, 4...","[0.70109075, 0.70109075, 0.70109075, 0.7010907...",ca0a15cee6b0c93f0533406561508e30934636060540b0...,"[-0.23069609701633453, 0.23751962184906006, 0....","[384, 332, 400, 281, 176, 575, 187, 89, 645, 1...",[488659],"[25.0, 25.0, 25.0]","[0.0, 0.0, 0.0]",3.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [112]:
merged_session_table.shape

(3291455, 36)

In [113]:
#save sessions_gdf
merged_session_table.to_parquet(os.path.join(OUTPUT_DIR, "merged_session_table.parquet"))

In [114]:
#merged_session_table['is_test-last'].sum()

In [115]:
# save to parquet file with 40 partitions 
#merged_session_table['parquet_split'] =  np.random.randint(0,40,size=merged_session_table.shape[0])

#merged_session_table.to_parquet(os.path.join(OUTPUT_DIR, "merged_session_table_parquet"), partition_cols=['parquet_split'])

In [116]:
#pip install transformers4rec[torch,nvtabular]

In [117]:
# define partition column
#PARTITION_COL = 'day_index'

# define output_folder to store the partitioned parquet files
#OUTPUT_FOLDER = os.environ.get("OUTPUT_FOLDER", INPUT_DATA_DIR + "sessions_by_day")
#!mkdir -p $OUTPUT_FOLDER

In [118]:
#from transformers4rec.data.preprocessing import save_time_based_splits

### Exporting session table by fold and train-test-valid splits 

In [119]:
#OUTPUT_FOLDER = "/result/coveo/"
#for fold in range(1, 6): 
    #merged_session_table.loc[(merged_session_table['is_test-last']==1) & (merged_session_table.fold==fold)].to_parquet(os.path.join(OUTPUT_DIR, 'test-%s.parquet'%fold),
                                                                                               #partition_file_name= 'test-%s.parquet'%fold,
                                                                                               #engine='pyarrow',
                                                                                               #row_group_size=1000
                                                                                                           #)
    
    
    
    #merged_session_table.loc[(merged_session_table['is_valid']==1) & (merged_session_table.fold==fold)].to_parquet(os.path.join(OUTPUT_DIR, 'valid-%s.parquet'%fold),
                                                                                           #partition_file_name = 'valid-%s.parquet'%fold,
                                                                                           #engine='pyarrow',
                                                                                           #row_group_size=1000
                                                                                                       #)
    
    
    #merged_session_table.loc[(merged_session_table['is_valid']==0) & (merged_session_table['is_test-last']==0) & (merged_session_table.fold==fold)].to_parquet(os.path.join(OUTPUT_DIR, 'train-%s.parquet'%fold),
                                                                                           #partition_file_name = 'train-%s.parquet'%fold,
                                                                                           #engine='pyarrow',
                                                                                           #row_group_size=1000
                                                                                                                                           #)

In [120]:
#test = pd.read_parquet(os.path.join(OUTPUT_DIR, "test-1.parquet"))

In [121]:
#test.head(2)

### Save unique product sku mapping from updated product_url_hash encoded column

In [122]:
urls_ids = interactions_merged_df[interactions_merged_df.event_type==2]['product_url_hash'].unique()

In [123]:
mapping = pd.read_parquet(os.path.join(OUTPUT_DIR, 'unique.updated_product_url_hash.parquet'))

In [124]:
mask = mapping.reset_index()['index'].isin(urls_ids.values.tolist())

In [125]:
mapping_prod = mapping[~mask].reset_index()

In [126]:
mapping_prod.columns =  ['encoded_product_sku', 'original_product_sku']

In [127]:
mapping_prod.head()

Unnamed: 0,encoded_product_sku,original_product_sku
0,0,
1,1,433b0e71df1fe9a8d1f45647545701f6108414c40eef76...
2,2,41ae0f916406434064aeb03d859a7141b53d2900394dca...
3,3,38f5bd3c9a1cc5b39e6b965f1aa6c565737f58e19a560a...
4,4,10cf8acdd90cc03246e1fd01630637d89d861237b14cbe...


In [128]:
mapping_prod.shape[0]

432062

In [129]:
mapping_prod.to_parquet(os.path.join(OUTPUT_DIR, 'mapping_product_sku_without_urls.parquet'))

### Create pickle files with product embedding vectors 

* Pickle file with a tuple composed by:

    - Numpy matrix with the description vectors of the products
    - Numpy matrix with the image vectors of the products
    - Dict to map the encoded product_sku to the position in the embedding matrices

- Load product table 

In [130]:
product_info = pd.read_csv('/recsys/data/sku_to_content.csv', usecols=['product_sku_hash', 
                                                                     'description_vector', 
                                                                     'image_vector'])
product_info.tail() 

Unnamed: 0,product_sku_hash,description_vector,image_vector
66381,c7cc673ca3baa5fa222fffdc16379892b3a62583a48143...,,
66382,6641c7d2053ce48ce1e81a9653dffe56dbb79ab0704fbd...,"[-0.19150441884994507, -0.06235162168741226, -...","[129.55668732976045, 43.27996741934932, -36.70..."
66383,526a6a51717d5bb40ef2b0c47394d08c54385375633bab...,"[-0.19760936498641968, 0.4446450471878052, -0....","[-158.25984189321855, 74.19255741438077, 199.6..."
66384,21ca4ab0e2fbd3b401fbeadeb4439dcab9998fb52159ec...,,
66385,0eaa39fb645749da181c0637d3b420f3f103b5f6b50286...,,


In [131]:
# convert strings to list object 
import ast
def convert_str_to_list(x): 
    if pd.isnull(x): 
        return x
    return ast.literal_eval(x)
for col in ['description_vector', 'image_vector']: 
    product_info[col] = product_info[col].progress_apply(convert_str_to_list)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 66386/66386 [00:04<00:00, 14095.37it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 66386/66386 [00:04<00:00, 15971.92it/s]


In [132]:
product_info.columns = ['original_product_sku', 'description_vector', 'image_vector']

### Merge product embeddings and mapping_prod

In [133]:
embeddings_table = mapping_prod.merge(product_info, on=['original_product_sku'], how='left')

- Fill missing embeddings with vector of zeros 

In [134]:
embeddings_table.loc[embeddings_table.description_vector.isnull(),
                         'description_vector'] = pd.Series([np.zeros(50)] * embeddings_table.description_vector.isnull().sum()).values

embeddings_table.loc[embeddings_table.image_vector.isnull(),
                         'image_vector'] = pd.Series([np.zeros(50)] * embeddings_table.image_vector.isnull().sum()).values

In [135]:
embeddings_table

Unnamed: 0,encoded_product_sku,original_product_sku,description_vector,image_vector
0,0,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,433b0e71df1fe9a8d1f45647545701f6108414c40eef76...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,41ae0f916406434064aeb03d859a7141b53d2900394dca...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3,38f5bd3c9a1cc5b39e6b965f1aa6c565737f58e19a560a...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4,10cf8acdd90cc03246e1fd01630637d89d861237b14cbe...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
432057,489540,edfd3532fc0d137a8c0f5821d1dd54e4b2b979d2583d10...,"[-0.21970106661319733, 0.18805548548698425, 0....","[-135.82327267249292, 23.51688379541042, -70.5..."
432058,489541,b970ee708863a7086b4df69f90a046db2c09ad2504193f...,"[-0.19333353638648987, 0.31274884939193726, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
432059,489542,61775fd991f00fcaae8d4baad45dfd4d3b52fcdb212113...,"[-0.14206060767173767, 0.15506064891815186, 0....","[92.0906556030472, -100.89914367963044, -40.13..."
432060,489543,be82cef455a749da1228433081568b3a0374c7b5f6d169...,"[-0.31547439098358154, -0.20620247721672058, 0...","[-22.43402558953332, 264.6133131677118, 227.14..."


- Numpy matrix with the image vectors of the products

In [136]:
image_matrix = np.concatenate(embeddings_table.image_vector.values).reshape(-1, 50)

- Numpy matrix with the description vectors of the products

In [137]:
desc_matrix = np.concatenate(embeddings_table.description_vector.values).reshape(-1, 50)

- Dict to map the encoded product_sku to the position in the embedding matrices

In [138]:
mapping_id_sku_emb_position = dict(zip(embeddings_table.encoded_product_sku, embeddings_table.index))

### Save to pickle file 

In [139]:
# Saving the objects:
import pickle
with open(os.path.join(OUTPUT_DIR, 'embedding_data.pkl'), 'wb')as f:  
    pickle.dump([desc_matrix, image_matrix, mapping_id_sku_emb_position], f)