In [1]:
# %pip install tensorflow_addons

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.14.0-cp36-cp36m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |################################| 1.1 MB 4.0 MB/s eta 0:00:01
[?25hCollecting typeguard>=2.7
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.14.0 typeguard-2.13.3
You should consider upgrading via the '/home/tarique/myvenv/bin/python3.6 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import sys
import warnings

warnings.filterwarnings("ignore")
sys.path.append("../")

from pandas.api.types import CategoricalDtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import pickle
from tqdm import tqdm
import gc
from pathlib import Path

In [4]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Input, Dense, Dropout, BatchNormalization, Concatenate, Activation
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.utils import plot_model
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import gc

In [5]:
import tensorflow_addons as tfa

In [6]:
from src.data import DataHelper
from src.data.metrics import map_at_k, recall_at_k

In [7]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [8]:
from pathlib import Path
from tqdm import tqdm

In [9]:
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [10]:
RANK_EMBEDDING_DIM = 64
BATCH_SIZE = 2**12
NEPOCH = 20

In [11]:
TRAIN_WEEK_NUM = 6
WEEK_NUM = TRAIN_WEEK_NUM + 2

VERSION_NAME = "pivot"
TEST = False  # * Set as `False` when do local experiments to save time

In [12]:
data_dir = Path("../src/data/")
model_dir = Path("../src/data/interim/")

In [15]:
dh = DataHelper(data_dir)

In [16]:
data = dh.load_data(name="encoded_full")

In [17]:
inter = pd.read_parquet(data_dir / "interim/processed_inter_v3.pqt")
inter = inter.loc[(inter.t_dat <= "2020-08-05")]
data["inter"] = inter

article_cluster = pd.read_parquet(data_dir/'articles_new.parquet')
# https://www.kaggle.com/code/beezus666/k-means-and-feature-importance-for-articles/notebook?scriptVersionId=94269787

itemid2idx = pickle.load(open(data_dir/"index_id_map/item_id2index.pkl", "rb"))
article_cluster['article_id'] = article_cluster['article_id'].map(itemid2idx)
article_cluster = article_cluster.rename(columns={'department_no':'department_no_cluster', 'ct':'cluster'})

## Calculate & Load Embeddings

In [29]:
# * Load pre-trained embeddings
w2v_user_embd = np.load(data_dir/'external'/'w2v_user_embd.npy', allow_pickle=True)
w2v_item_embd = np.load(data_dir/'external'/'w2v_item_embd.npy', allow_pickle=True)
# w2v_product_embd = np.load(data_dir/'external'/'w2v_product_embd.npy', allow_pickle=True)
# image_item_embd = np.load(data_dir/'external'/'image_embd.npy', allow_pickle=True)
# w2v_sg_user_embd = np.load(data_dir/'external'/'w2v_skipgram_user_embd.npy', allow_pickle=True)
# w2v_sg_item_embd = np.load(data_dir/'external'/'w2v_skipgram_item_embd.npy', allow_pickle=True)
# w2v_sg_product_embd = np.load(data_dir/'external'/'w2v_skipgram_product_embd.npy', allow_pickle=True)

dssm_user_embd = np.load(data_dir/'external'/'dssm_user_embd.npy', allow_pickle=True)
dssm_item_embd = np.load(data_dir/'external'/'dssm_item_embd.npy', allow_pickle=True)
yt_user_embd = np.load(data_dir/'external'/'yt_user_embd.npy', allow_pickle=True)
yt_item_embd = np.load(data_dir/'external'/'yt_item_embd.npy', allow_pickle=True)

In [18]:
just_pred = False

candidates = {}
labels = {}

if just_pred:
    for i in tqdm(range(1, 2)):
        if i==0 and not TEST:
            continue
        candidates[i] = pd.read_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_candidate_full.pqt")    
        if i != 0:
            tmp_label = pd.read_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_label.pqt")
            labels[i] = tmp_label
        else:
            labels[i] = None
    
else:
    for i in tqdm(range(1, WEEK_NUM)):
        if i==0 and not TEST:
            continue
        candidates[i] = pd.read_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_candidate_full.pqt")

        if i != 0:
            tmp_label = pd.read_parquet(data_dir/"interim"/VERSION_NAME/f"week{i}_label.pqt")
            labels[i] = tmp_label
        else:
            labels[i] = None

100%|██████████| 7/7 [00:16<00:00,  2.29s/it]


In [19]:
feats = [
    x
    for x in candidates[1].columns
    if x
    not in [
        "label",
        "sales_channel_id",
        "t_dat",
        "week",
    ]
]

ids = ["customer_id", "article_id", "product_code"]
dense_feats = [x for x in feats if x not in ids]
# feats = ids + cat_features + dense_feats

In [20]:
for f in tqdm(dense_feats):
    for i in range(1, WEEK_NUM):
        if f in candidates[i].columns:
            candidates[i][f] = candidates[i][f].astype('float16')

100%|██████████| 100/100 [01:03<00:00,  1.57it/s]


In [21]:
full_data = pd.concat([candidates[i] for i in range(1,WEEK_NUM)], ignore_index=True)
full_data = full_data[feats+['week','label']]
gc.collect()

0

In [22]:
train = full_data[full_data['week']>1]
valid = full_data[full_data['week']==1]

In [23]:
del candidates
gc.collect()

0

In [24]:
# Standardize
# for feat in dense_feats:
    # mask = train[feat].notnull()
    # value = train.loc[mask, feat].mean()
    # train[feat] = train[feat].fillna(value)
    # valid[feat] = valid[feat].fillna(value)
    # scaler = MinMaxScaler().fit(train[feat].values.reshape(-1,1))
    # train[feat] = scaler.transform(train[feat].values.reshape(-1,1))
    # valid[feat] = scaler.transform(valid[feat].values.reshape(-1,1))

In [49]:
# feat_dim = {}
# for feat in ids:
#     if feat in data['user'].columns:
#         feat_dim[feat] = int(data['user'][feat].max()) + 1
#     elif feat in data['item'].columns:
#         feat_dim[feat] = int(data['item'][feat].max()) + 1
#     elif feat in article_cluster.columns:
#         feat_dim[feat] = int(article_cluster[feat].max()) + 1
#     else:
#         feat_dim[feat] = int(full_data[feat].max()) + 1
        
feat_dim = {}
for feat in ids:
    if feat in data['user'].columns:
        feat_dim[feat] = int(data['user'][feat].max())
    elif feat in data['item'].columns:
        feat_dim[feat] = int(data['item'][feat].max())
    elif feat in article_cluster.columns:
        feat_dim[feat] = int(article_cluster[feat].max())
    else:
        feat_dim[feat] = int(full_data[feat].max())

In [26]:
del full_data
gc.collect()

0

In [27]:
X_train1 = train[['customer_id', 'article_id', 'product_code']].values.astype('int32')
X_train2 = np.zeros((X_train1.shape[0], len(dense_feats)), dtype='float32')

for i,f in tqdm(enumerate(dense_feats)):
    X_train2[:, i] = np.nan_to_num(train[f].values).astype('float32')
    del train[f]
y_train = train['label'].values

100it [02:28,  1.49s/it]


In [28]:
X_test1 = valid[['customer_id', 'article_id', 'product_code']].values.astype('int32')
X_test2 = np.zeros((X_test1.shape[0], len(dense_feats)), dtype='float32')

for i,f in tqdm(enumerate(dense_feats)):
    X_test2[:, i] = np.nan_to_num(valid[f].values).astype('float32')
    del valid[f]
y_test = valid['label'].values

100it [00:31,  3.17it/s]


## Train Model

In [65]:
customer_embd_layer_1 = Embedding(
    feat_dim["customer_id"], 128, weights=[dssm_user_embd], trainable=False
)

customer_embd_layer_2 = Embedding(
    feat_dim["customer_id"], 128, weights=[w2v_user_embd], trainable=False
)

customer_embd_layer_3 = Embedding(
    feat_dim["customer_id"], 128, weights=[yt_user_embd], trainable=False
)

In [66]:
article_embd_layer_1 = Embedding(
    feat_dim["article_id"], 128, weights=[dssm_item_embd], trainable=False
)

article_embd_layer_2 = Embedding(
    feat_dim["article_id"], 128, weights=[w2v_item_embd], trainable=False
)

article_embd_layer_3 = Embedding(
    feat_dim["article_id"], 128, weights=[yt_item_embd], trainable=False
)

# article_embd_layer_4 = Embedding(
#     feat_dim["article_id"], 128, weights=[tfidf_item2], trainable=False
# )

# article_embd_layer_5 = Embedding(
#     feat_dim["article_id"], 512, weights=[image_item_embd], trainable=False
# )

In [67]:
# product_embd_layer_1 = Embedding(
#     feat_dim["product_code"], 64, weights=[w2v_sg_product_embd], trainable=False
# )
# product_embd_layer_2 = Embedding(
#     feat_dim["product_code"], 64, weights=[w2v_product_embd], trainable=False
# )

In [68]:
class FM(tf.keras.layers.Layer):
    """Factorization Machine"""

    def __init__(self, **kwargs):
        self.linear = None
        self.w_0 = None

        super().__init__(**kwargs)

    def build(self, input_shape):
        super().build(input_shape)
        self.linear = Dense(1, use_bias=False)
        self.w_0 = self.add_weight(
            shape=(1,),
            initializer=tf.keras.initializers.Zeros,
            dtype=tf.float32,
            trainable=True,
            name="W_0",
        )

    def call(self, inputs, mask=None, *args, **kwargs):
        # * inputs: (batch_size, num_of_fields, embedding_dim)
        # * part2: (batch_size, 1)
        part2 = tf.reduce_sum(self.linear(inputs), axis=1, keepdims=False)

        # * square_sum: (batch_size, embedding_dim)
        # * sum_square: (batch_size, embedding_dim)
        square_sum = tf.square(tf.reduce_sum(inputs, axis=1, keepdims=False))
        sum_square = tf.reduce_sum(inputs * inputs, axis=1, keepdims=False)
        
        # * part3: (batch_size, 1)
        part3 = square_sum - sum_square
        part3 = 0.5 * tf.reduce_sum(part3, axis=1, keepdims=True)
        return tf.nn.bias_add(part2 + part3, self.w_0)

    def compute_output_shape(self, input_shape):
        return (None, 1)

In [71]:
X_train1.shape

(18967385, 3)

In [69]:
inputs1 = Input(shape=X_train1.shape[1:], dtype=tf.int64)
inputs2 = Input(shape=X_train2.shape[1:], dtype=tf.float32)
input1 = tf.cast(inputs1, dtype=tf.int64)

x_c_id1 = customer_embd_layer_1(input1[:,0])
x_c_id2 = customer_embd_layer_2(input1[:,0])
x_c_id3 = customer_embd_layer_3(input1[:,0])

x_a_id1 = article_embd_layer_1(input1[:,1])
x_a_id2 = article_embd_layer_2(input1[:,1])
x_a_id3 = article_embd_layer_3(input1[:,1])
x_a_id3 = Dense(128)(x_a_id3)
# x_a_id4 = article_embd_layer_4(input1[:,1])
# x_a_id5 = article_embd_layer_5(input1[:,1])
# x_a_id5 = Dense(128)(x_a_id5)

# x_p_id1 = product_embd_layer_1(input1[:,2])
# x_p_id2 = product_embd_layer_2(input1[:,2])


x_id = Concatenate(axis=-1)([
    x_c_id1, x_c_id2,
    x_a_id1, x_a_id2, x_a_id3, 
#     x_a_id4, x_a_id5,
#     x_p_id1, x_p_id2,
])

x0 = Concatenate(axis=-1)([x_id, BatchNormalization()(inputs2)])
# x = Dropout(0.2)(x0)
# x = Dense(1024, activation='swish')(x)
x = Dropout(0.2)(x0)
x = Dense(512, activation='swish')(x)
x = Dropout(0.2)(x)
x = Dense(256, activation='swish')(x)

x = Concatenate(axis=-1)([x, x0])
x = Dropout(0.2)(x)

output = Dense(1, activation='sigmoid')(x)

# x_c_id2_expand = tf.expand_dims(x_c_id2, axis=1)
# x_a_id2_expand = tf.expand_dims(x_a_id2, axis=1)
# x_p_id2_expand = tf.expand_dims(x_p_id2, axis=1)
# fm_output = FM()(Concatenate(axis=1)([x_c_id2_expand, x_a_id2_expand, x_p_id2_expand]))
# output = output + fm_output
# output = Activation('sigmoid')(output)

model = tf.keras.Model(inputs=[inputs1, inputs2], outputs=[output])
model.summary()
    
model.compile(
    tfa.optimizers.AdamW(learning_rate=0.001, weight_decay=1e-4),
    loss = 'binary_crossentropy',
    metrics=['AUC']
)

ValueError: Layer weight shape (1371980, 128) not compatible with provided weight shape (1371981, 128)

In [None]:
# early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_auc', patience=10, mode='max')
# checkpoint = tf.keras.callbacks.ModelCheckpoint(
#     filepath=model_dir/'model_nn.h5',
#     save_weights_only=True,
#     monitor='val_auc',
#     mode='max',
#     save_best_only=True)

# history = model.fit(
#     [X_train1, X_train2], y_train.astype(int), 
#     shuffle=True,
#     batch_size=2048,
#     validation_data=([X_test1, X_test2], y_test.astype(int)),
#     epochs=30,
#     callbacks=[checkpoint, early_stop]
# )
# # 0.7114
# # 0.7294
# # 0.7382
# # 0.7565

In [None]:
model.load_weights(model_dir/'model_nn.h5')

In [None]:
probs = model.predict([X_test1, X_test2], batch_size=4096)

In [None]:
label = data['inter'][data['inter']['t_dat']>='2020-09-16']
label = label.groupby('customer_id')['article_id'].apply(list).reset_index()

In [None]:
valid['prob'] = probs
pred = valid.sort_values(by='prob',ascending=False).reset_index(drop=True)
pred = pred.groupby('customer_id')['article_id'].apply(list).reset_index()
pred.columns = ['customer_id','prediction']

In [None]:
valid = valid[['customer_id','article_id','prob']]

In [None]:
valid.to_parquet(data_dir/'external'/'nn_valid.pqt')

In [None]:
label = label.merge(pred, on='customer_id', how='left')

In [None]:
map_at_k(label['article_id'], label['prediction'], k=12)
# 0.028500554033301987
# 0.029904528760153

# 0.031648009478868075
# 0.031309369857160076

# 031769005497044554

0.03129488004637625

## Test

In [None]:
model.load_weights(model_dir/'model_nn.h5')

In [None]:
class TQDMPredictCallback(tf.keras.callbacks.Callback):
    def __init__(self, custom_tqdm_instance=None, tqdm_cls=tqdm, **tqdm_params):
        super().__init__()
        self.tqdm_cls = tqdm_cls
        self.tqdm_progress = None
        self.prev_predict_batch = None
        self.custom_tqdm_instance = custom_tqdm_instance
        self.tqdm_params = tqdm_params

    def on_predict_batch_begin(self, batch, logs=None):
        pass

    def on_predict_batch_end(self, batch, logs=None):
        self.tqdm_progress.update(batch - self.prev_predict_batch)
        self.prev_predict_batch = batch

    def on_predict_begin(self, logs=None):
        self.prev_predict_batch = 0
        if self.custom_tqdm_instance:
            self.tqdm_progress = self.custom_tqdm_instance
            return

        total = self.params.get('steps')
        if total:
            total -= 1

        self.tqdm_progress = self.tqdm_cls(total=total, **self.tqdm_params)

    def on_predict_end(self, logs=None):
        if self.tqdm_progress and not self.custom_tqdm_instance:
            self.tqdm_progress.close()

In [None]:
del train, valid, X_train1, X_train2, X_test1, X_test2
gc.collect()

200

In [None]:
chunk = 1

In [None]:
test_candidates = pd.read_parquet(data_dir/"processed"/VERSION_NAME/f"week0_candidate_{chunk}.pqt")
for f in tqdm(dense_feats):
    test_candidates[f] = test_candidates[f].astype('float16')
test1 = test_candidates[['customer_id', 'article_id', 'product_code']].values.astype('int32')
test2 = np.zeros((test1.shape[0], len(dense_feats)), dtype='float32')
for i,f in tqdm(enumerate(dense_feats)):
    test2[:, i] = np.nan_to_num(test_candidates[f].values).astype('float32')
    del test_candidates[f]
gc.collect()

probs = model.predict([test1, test2], batch_size=2048, callbacks=[TQDMPredictCallback()])
# test_candidates = pd.concat([test_candidates, test_candidates2], ignore_index=True)
test_candidates["prob"] = probs
pred_lgb = test_candidates[['customer_id','article_id','prob']]
# pred_lgb = pred_lgb.sort_values(by=["customer_id","prob"], ascending=False).reset_index(drop=True)
pred_lgb.rename(columns={'article_id':'prediction'}, inplace=True)
# pred_lgb = pred_lgb.drop_duplicates(['customer_id', 'prediction'], keep='first')
pred_lgb['customer_id'] = pred_lgb['customer_id'].astype(int)

100%|██████████| 96/96 [01:12<00:00,  1.33it/s]
96it [01:26,  1.12it/s]
100%|██████████| 13503/13503 [03:07<00:00, 71.85it/s]


In [None]:
pred_lgb.to_parquet(data_dir/'interim'/f'nn_test_{chunk}.pqt')

In [None]:
test_pred1 = pd.read_parquet(data_dir/'interim'/f'nn_test_0.pqt')
test_pred2 = pd.read_parquet(data_dir/'interim'/f'nn_test_1.pqt')

In [None]:
test_pred = pd.concat([test_pred1, test_pred2], ignore_index=True)
test_pred = test_pred.sort_values(by=["prob"], ascending=False).reset_index(drop=True)
test_pred = test_pred.drop_duplicates(['customer_id', 'prediction'], keep='first')

In [None]:
test_pred.to_parquet(data_dir/'processed'/'nn_test.pqt')