## Things to do
1. Embeddings for segment >= 20 for uuid and brand
2. Model inference for the 2 models viz. model_segGE20 and model_segLE20

### User and Brand Embeddings for SegGE20

In [1]:
import pandas as pd
import numpy as np
import torch
import os, sys, json, joblib

In [37]:
sys.path.append('../../offline/src/')
from constants import *
from network import ProductRecommendationModel
from baseline_feats_utils import feat_type_feats_dct

In [13]:
# GLOBALS
SEGMENT = 'GE20'
N_USERS = 1444170
N_ITEMS = 1175648
N_ONTOLOGIES = 801
N_BRANDS = 1686
MODEL_FN = os.path.join(MODEL_DIR, 'Class_model_SegGE20_E1_ckpt.pt')

In [4]:
def choose_embedding_size(cat_cols, cat_num_values, min_emb_dim=100):
    """
    cat_cols: list of categorical columns
    cat_num_values: list of number of unique values for each categorical column
    """

    embedded_cols = dict(zip(cat_cols, cat_num_values))
    embedding_sizes = [(n_categories, min(min_emb_dim, (n_categories+1)//2))
                       for _, n_categories in embedded_cols.items()]
    return embedding_sizes

In [5]:
# choose embedding size

if SEGMENT != 'GE20':
    cat_cols = [ITEM_COL, ONTOLOGY_COL, BRAND_COL]
    cat_num_values = [N_ITEMS, N_ONTOLOGIES, N_BRANDS]
else:
    cat_cols = [USER_COL, ITEM_COL, ONTOLOGY_COL, BRAND_COL]
    cat_num_values = [N_USERS, N_ITEMS, N_ONTOLOGIES, N_BRANDS]

embedding_sizes = choose_embedding_size(cat_cols, cat_num_values, 150)

In [6]:
embedding_sizes

[(1444170, 150), (1175648, 150), (801, 150), (1686, 150)]

In [8]:
model = ProductRecommendationModel(embedding_sizes, 18, 3)

In [9]:
model

ProductRecommendationModel(
  (embeddings): ModuleList(
    (0): Embedding(1444170, 150)
    (1): Embedding(1175648, 150)
    (2): Embedding(801, 150)
    (3): Embedding(1686, 150)
  )
  (lin1): Linear(in_features=618, out_features=300, bias=True)
  (lin2): Linear(in_features=300, out_features=100, bias=True)
  (lin3): Linear(in_features=100, out_features=3, bias=True)
  (bn1): BatchNorm1d(18, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.6, inplace=False)
  (drops): Dropout(p=0.3, inplace=False)
)

In [14]:
ckpt = torch.load(MODEL_FN, map_location=torch.device('cpu'))
model.load_state_dict(ckpt['model_state_dict'])

<All keys matched successfully>

In [25]:
model.embeddings

ModuleList(
  (0): Embedding(1444170, 150)
  (1): Embedding(1175648, 150)
  (2): Embedding(801, 150)
  (3): Embedding(1686, 150)
)

In [32]:
model.embeddings[0].weight.data[0].numpy().tolist()

[-2.342577267482619e-39,
 1.213080641630508e-30,
 4.4801276403994006e-12,
 -1.0745834560718648e-12,
 -1.2736684536912568e-12,
 6.0091482803018485e-31,
 2.27411683100803e-39,
 -3.6126340930431473e-31,
 8.189247845003542e-31,
 -4.468207216983683e-13,
 -5.60428921796904e-32,
 1.2939744761797378e-12,
 -1.2274464051116813e-30,
 -1.1692410147495006e-32,
 -5.756624822276282e-31,
 -1.294582171497416e-12,
 5.167707353617018e-14,
 8.789154163014901e-40,
 1.9223996042439304e-33,
 -1.3788569423846007e-30,
 -4.247238354069029e-31,
 1.7108385021169514e-12,
 -8.790560732155114e-34,
 2.0065267271361113e-30,
 7.22623729517563e-13,
 5.943382857262827e-14,
 6.27796900294457e-14,
 -1.5818236477616043e-12,
 3.3994831147125815e-13,
 -4.610817478691021e-13,
 7.891153008809937e-13,
 2.3336201676986546e-39,
 -1.1290121597218619e-39,
 -7.991583740249442e-13,
 1.6740942387189768e-32,
 1.8846783211338293e-31,
 -3.4870087766718993e-32,
 -2.192589286353612e-39,
 6.515122642769544e-13,
 1.9089695102447735e-12,
 -2.8

### Model Inference

In [21]:
inp_fn = os.path.join(INTERIM_DATA_DIR, '0005_part_07.gz')
df = pd.read_csv(inp_fn, sep='|', compression='gzip')

In [22]:
print(df.shape)
df.head()

(1371989, 24)


Unnamed: 0,uuid,userevent,sourceprodid,clicked_epoch,ontology,brand,price,uuid_num_interactions,uuid_mean_price_interactions,uuid_days_since_earliest_interaction,...,uuid_max_num_interactions_per_ont,uuid_mean_num_interactions_per_ont,uuid_min_num_interactions_per_brand,uuid_max_num_interactions_per_brand,uuid_mean_num_interactions_per_brand,sourceprodid_num_interactions,sourceprodid_days_since_earliest_interaction,sourceprodid_min_num_interactions_per_user,sourceprodid_max_num_interactions_per_user,sourceprodid_mean_num_interactions_per_user
0,4852310,1,911340,1551714898,431,1480,1099.0,776.0,2694.77895,8.865451,...,26.0,1.519018,1.0,23.0,1.388859,188.0,38.233414,1.0,4.0,1.027066
1,6013644,1,876085,1550334397,217,1327,5399.0,1015.0,2653.918513,36.472095,...,17.0,1.265118,1.0,19.0,1.154053,1210.0,46.896551,1.0,4.0,1.046217
2,551584,1,693320,1550337611,696,1507,1999.0,93.0,2466.662973,0.044421,...,8.0,2.334516,1.0,5.0,2.301251,474.0,15.834213,1.0,3.0,1.009087
3,2954929,1,630337,1550341067,329,1222,6999.0,142.0,9241.447513,7.639884,...,20.0,3.707738,1.0,14.0,2.38984,367.0,46.536771,1.0,4.0,1.048521
4,2936231,1,537273,1550336735,285,708,499.0,163.0,820.488392,46.41912,...,16.0,2.269754,1.0,15.0,2.29338,357.0,9.044988,1.0,3.0,1.016065


In [33]:
inp_fn = os.path.join(RAW_DATA_DIR, '0005_part_07.gz')
raw_df = pd.read_csv(inp_fn, sep='|', compression='gzip')

print(raw_df.shape)
raw_df.head()

(1371989, 7)


Unnamed: 0,uuid,userevent,sourceprodid,clicked_epoch,ontology,brand,price
0,cc1b580857481534abb2204b167915d7,pageView,3a9feb4237f4203b3118d2071b93c96c,1551714898,644e3342d3fb99e0b4d03b610dd4827d,a6b68a1deb25ba3f4b5a4c4f780094e4,1099.0
1,dae70b91a4c3707956e7dd17a5b03e5c,pageView,611f8d943412d7260bedac2f493f2c77,1550334397,f9e13a341127b189f97e6ee05923340c,b081d61f98a982edd345e81a9d70102a,5399.0
2,300665c14ec978de7eeb31466eb27712,pageView,cd2d2d61897732d4aed73db4af897010,1550337611,e3bb7a2fc0e60206b5b12b95c0c25b07,72c035606a07faa83f56eeb7a1be1beb,1999.0
3,94c68896d5a983923c5acfc62c2303a0,pageView,55eb55799a57b2cc969eb5025e655025,1550341067,87400c7f16b66890a0e0e97305291c92,787923f3de426787a37e7c024f96418d,6999.0
4,eb42828d343f5e56bbf969ac7b7a0a36,pageView,5c2c92c8a1a442024b23016f145d3fda,1550336735,ed414ff376ba74be64279ba9b31a94f3,49b42c44eb0bf64a6a33b4df5ce3b7e9,499.0


In [38]:
pd.to_datetime(1551714898, unit='s')

Timestamp('2019-03-04 15:54:58')

In [23]:
user2idx = json.load(open(USER2IDX_SEGGE20_FN))

#### Inputs
1. uuid
2. sourceprodid

#### Approach
0. map sourceprodid to brand, ontology and price
1. user and item baseline features
2. index mapping for uuid, sourceprodid, ontology, brand
3. prepare input tensors
4. model.forward

In [39]:
user = raw_df.loc[0, 'uuid']
item = raw_df.loc[0, 'sourceprodid']
clicked_epoch = 1551714898

print('User: ', user)
print('Item: ', item)
print('Clicked Epoch: ', clicked_epoch)

User:  cc1b580857481534abb2204b167915d7
Item:  3a9feb4237f4203b3118d2071b93c96c
Clicked Epoch:  1551714898


In [40]:
# Step 0
pdt_mapping = json.load(open(PDT_MAPPING_FN))
ont, brand, price = pdt_mapping[item]
del pdt_mapping

print('Ontology: ', ont)
print('Brand: ', brand)
print('Price: ', price)

Ontology:  644e3342d3fb99e0b4d03b610dd4827d
Brand:  a6b68a1deb25ba3f4b5a4c4f780094e4
Price:  1099.0


In [43]:
# step 1

def get_baseline_feats(user_col, item_col, user, item, clicked_epoch,
                       user_feats, item_feats,
                       feat_type_dct=feat_type_feats_dct):
    print('User Features')
    feats = []
    for feat_pos, feat_name in enumerate(feat_type_dct['user']):
        user_col+'_'+feat_name = user_feats[user][feat_pos]
        if feat_name == 'earliest_interaction_date':
            user_col+'_days_since_earliest_interaction' = (float(
                clicked_epoch)-float(user_col+'_'+feat_name))/(60*60*24)
            if user_col+'_days_since_earliest_interaction' < 0:
                user_col+'_days_since_earliest_interaction' = -1
            feats.append(user_col+'_days_since_earliest_interaction')
        else:
            feats.append(user_col+'_'+feat_name)

    print('Item Features')
    for feat_pos, feat_name in enumerate(feat_type_dct['item']):
        item_col+'_'+feat_name = user_feats[item][feat_pos]
        if feat_name == 'earliest_interaction_date':
            item_col+'_days_since_earliest_interaction' = (float(
                clicked_epoch)-float(item_col+'_'+feat_name))/(60*60*24)
            if item_col+'_days_since_earliest_interaction' < 0:
                item_col+'_days_since_earliest_interaction' = -1
            feats.append(item_col+'_days_since_earliest_interaction')
        else:
            feats.append(item_col+'_'+feat_name)
    
    return feats


print('read baseline feats dct')
user_feats = json.load(open(USER_BASELINE_FEATS_FN))
item_feats = json.load(open(ITEM_BASELINE_FEATS_FN))

print('get baseline feats')
baseline_feats = get_baseline_feats(USER_COL, ITEM_COL, user, item,
                                    clicked_epoch, user_feats,
                                    item_feats)

del user_feats, item_feats

print(baseline_feats)

SyntaxError: can't assign to operator (<ipython-input-43-a992926e1e06>, line 10)