In [1]:
import h5py
import json
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.sparse import csc_matrix
from dask import dataframe as dd
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

# Selects which data split the notebook works on: True points every path at the
# 'ensemble_train' directories, False at the 'ensemble' ones (see the path setup cell).
train = False

In [2]:
# (input data dir, model artifact dir, personalization output dir) for each
# value of the `train` flag.
_SPLIT_DIRS = {
    True: (
        'data/ensemble_train/',
        'model_data/collabrative/ensemble_train/ml/',
        'personalization/ensemble_train/',
    ),
    False: (
        'data/ensemble/',
        'model_data/collabrative/ensemble/ml/',
        'personalization/ensemble/',
    ),
}
path, model_data_path, personalization_result_path = _SPLIT_DIRS[bool(train)]

In [3]:
# Keep only the two columns needed to build one "document" per customer.
transactions = pd.read_pickle(path + 'transactions.pkl')[["customer_id", "article_id"]]
transactions.article_id = transactions.article_id.astype(int)


def _join_article_ids(group):
    """Return the article ids of one customer's rows as a space-separated string."""
    return ' '.join(str(article_id) for article_id in group.article_id.values)


# Series indexed by customer_id; each value lists that customer's article ids.
data = transactions.groupby(["customer_id"]).apply(_join_article_ids)

In [4]:
# Vectorise the per-customer article-id strings with IDF weighting switched off.
# Rows of `score_matrix` follow the order of `data`; columns follow
# `vectorizer.vocabulary_`.
customer_documents = data.values
vectorizer = TfidfVectorizer(use_idf=False)
score_matrix = vectorizer.fit_transform(customer_documents)

In [5]:
# Position of each customer in `data` (and therefore its row in the score matrix).
customer_encoding = dict(zip(data.keys(), range(len(data))))

# Persist both id -> index mappings so the prediction cells can reload them.
for file_name, encoding in (
    ('customer_id_encoding.json', customer_encoding),
    ('product_id_encoding.json', vectorizer.vocabulary_),
):
    with open(model_data_path + file_name, 'w') as fp:
        json.dump(encoding, fp)

In [6]:
# Factorise the score matrix with NMF: W has one row per matrix row (customer),
# H has one row per component.
nmf_params = dict(
    n_components=128,
    init='random',
    random_state=43,
    max_iter=500,
)
model = NMF(**nmf_params)
W = model.fit_transform(score_matrix)
H = model.components_

In [None]:
# Store W as-is and H transposed, so both pickles are indexed by row
# (W by score-matrix row, H.T by vocabulary index).
for file_name, factors in (
    ('nmf_model_W.pkl', W),
    ('nmf_model_H.pkl', H.T),
):
    with open(model_data_path + file_name, 'wb') as f:
        pickle.dump(factors, f)

In [None]:
# Fit LDA on the same score matrix; this rebinds `model`, `W` and `H`.
lda_params = dict(
    n_components=128,
    random_state=43,
    max_iter=500,
)
model = LatentDirichletAllocation(**lda_params)
W = model.fit_transform(score_matrix)
H = model.components_

In [None]:
# Store W as-is and H transposed, so both pickles are indexed by row
# (W by score-matrix row, H.T by vocabulary index).
for file_name, factors in (
    ('lda_model_W.pkl', W),
    ('lda_model_H.pkl', H.T),
):
    with open(model_data_path + file_name, 'wb') as f:
        pickle.dump(factors, f)

## Prediction

In [3]:
# Reload the id -> row-index encodings written by the training cells.
with open(model_data_path+'customer_id_encoding.json', 'r') as fp:
    customer_encoding = json.load(fp)

with open(model_data_path+'product_id_encoding.json', 'r') as fp:
    # JSON object keys are always strings; convert them back to integer article ids.
    product_encoding = {int(k): v for k, v in json.load(fp).items()}

# Unique customers in sorted id order, and unique articles in file order.
# Built without in-place mutation so re-running the cell is idempotent.
customers = (
    pd.read_pickle(path+"customers.pkl")
    .drop_duplicates(subset=["customer_id"])
    .sort_values(by="customer_id")
)
articles = pd.read_pickle(path+"articles.pkl").drop_duplicates(subset=["article_id"])

customer_rows = customers["customer_id"].map(customer_encoding)
article_rows = articles["article_id"].map(product_encoding)

# Series.map yields NaN for ids absent from the encoding, which turns the whole
# result into floats that cannot be used as array indices. Fail here with a
# clear message instead of an obscure IndexError when the factors are sliced.
for label, rows in (("customer", customer_rows), ("article", article_rows)):
    n_missing = int(rows.isna().sum())
    if n_missing:
        raise ValueError(
            f"{n_missing} {label} ids have no entry in the saved encoding"
        )

# Integer row indices into the factor matrices (W for customers, H.T for articles).
customer_ids = customer_rows.astype(np.int64).values
article_ids = article_rows.astype(np.int64).tolist()

In [4]:
# NOTE: pickle.load can execute arbitrary code; only load factor files that
# were written by this notebook.
with open(model_data_path + 'nmf_model_W.pkl', 'rb') as f:
    # Select one factor row per customer, in `customer_ids` order.
    customers = pickle.load(f)[customer_ids]
customers = tf.convert_to_tensor(customers, dtype=tf.float32)

with open(model_data_path + 'nmf_model_H.pkl', 'rb') as f:
    # Select one factor row per article, in `article_ids` order.
    products = pickle.load(f)[article_ids]
# Normalise every product vector on its own (axis=1). Without an axis,
# l2_normalize divides by the norm of the whole matrix, so the dot product with
# the row-normalised customer vectors would not be a cosine similarity.
products = tf.nn.l2_normalize(
    tf.convert_to_tensor(products, dtype=tf.float32), axis=1
)

In [5]:
# Score every customer against every product and stream the result to HDF5.
batch = 512                      # customers scored per matmul
step = max(1, 15000 // batch)    # flush the in-memory buffer every `step` batches

n_customers = customers.shape[0]
n_batches = -(-n_customers // batch)  # ceiling division

# The context manager closes the file even if the loop fails part-way, so no
# stale open handle has to be closed by hand before re-running the cell.
with h5py.File(personalization_result_path + 'nmf.h5', 'w', libver='latest') as f:
    dset = f.create_dataset("nmf", (n_customers, products.shape[0]), dtype=np.float32, compression='gzip')

    ptr = 0    # next dataset row to write
    temp = []  # score blocks computed since the last flush
    for i, batch_i in enumerate(range(0, n_customers, batch)):
        customer_batch = tf.nn.l2_normalize(customers[batch_i:batch_i+batch], 1)
        batch_distances = tf.matmul(customer_batch, products, transpose_b=True)
        # Copy the whole batch to NumPy at once instead of row by row.
        temp.append(batch_distances.numpy())

        # Always flush on the final batch. The previous test
        # (i == n_customers // batch) never fired when n_customers was an exact
        # multiple of `batch`, and `i != 0` skipped the write for a single batch.
        is_last_batch = i == n_batches - 1
        if is_last_batch or (i != 0 and i % step == 0):
            block = np.concatenate(temp, axis=0).astype(np.float32, copy=False)
            dset[ptr:ptr+block.shape[0], :] = block
            ptr += block.shape[0]
            temp = []

        print('\r' + f'{i*batch}: %{round(100*i*batch/customers.shape[0], 2)}', end='')

684544: %99.98

In [4]:
# NOTE: pickle.load can execute arbitrary code; only load factor files that
# were written by this notebook.
with open(model_data_path + 'lda_model_W.pkl', 'rb') as f:
    # Select one factor row per customer, in `customer_ids` order.
    customers = pickle.load(f)[customer_ids]
customers = tf.convert_to_tensor(customers, dtype=tf.float32)

with open(model_data_path + 'lda_model_H.pkl', 'rb') as f:
    # Select one factor row per article, in `article_ids` order.
    products = pickle.load(f)[article_ids]
# Normalise every product vector on its own (axis=1). Without an axis,
# l2_normalize divides by the norm of the whole matrix, so the dot product with
# the row-normalised customer vectors would not be a cosine similarity.
products = tf.nn.l2_normalize(
    tf.convert_to_tensor(products, dtype=tf.float32), axis=1
)

2022-05-09 09:31:20.849135: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# Score every customer against every product and stream the result to HDF5.
batch = 512                      # customers scored per matmul
step = max(1, 15000 // batch)    # flush the in-memory buffer every `step` batches

n_customers = customers.shape[0]
n_batches = -(-n_customers // batch)  # ceiling division

# The context manager closes the file even if the loop fails part-way.
with h5py.File(personalization_result_path + 'lda.h5', 'w', libver='latest') as f:
    dset = f.create_dataset("lda", (n_customers, products.shape[0]), dtype=np.float32, compression='gzip')

    ptr = 0    # next dataset row to write
    temp = []  # score blocks computed since the last flush
    for i, batch_i in enumerate(range(0, n_customers, batch)):
        customer_batch = tf.nn.l2_normalize(customers[batch_i:batch_i+batch], 1)
        batch_distances = tf.matmul(customer_batch, products, transpose_b=True)
        # Copy the whole batch to NumPy at once instead of row by row.
        temp.append(batch_distances.numpy())

        # Always flush on the final batch. The previous test
        # (i == n_customers // batch) never fired when n_customers was an exact
        # multiple of `batch`, and `i != 0` skipped the write for a single batch.
        is_last_batch = i == n_batches - 1
        if is_last_batch or (i != 0 and i % step == 0):
            block = np.concatenate(temp, axis=0).astype(np.float32, copy=False)
            dset[ptr:ptr+block.shape[0], :] = block
            ptr += block.shape[0]
            temp = []

        print('\r' + f'{i*batch}: %{round(100*i*batch/customers.shape[0], 2)}', end='')

684544: %99.98