In [1]:
import h5py
import json
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.sparse import csc_matrix
from dask import dataframe as dd
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

# Selects which data split the notebook works on: True uses the
# 'ensemble_train' directories, False uses the 'ensemble' ones.
train = True

In [2]:
# Resolve the input directory (`path`) and the directory for model artefacts
# (`model_data_path`) from the `train` flag in one step.
path, model_data_path = (
    ('data/ensemble_train/', 'model_data/collabrative/ensemble_train/ml/')
    if train
    else ('data/ensemble/', 'model_data/collabrative/ensemble/ml/')
)

In [3]:
# Load the purchase log; only the (customer, article) pairs are needed.
transactions = pd.read_pickle(path+'transactions.pkl')[["customer_id", "article_id"]]

# Give every customer and every article a dense integer index, in order of
# first appearance.  `.tolist()` turns NumPy scalars into native Python
# objects: json.dump below raises TypeError on NumPy integer dictionary keys.
c_ids = transactions.customer_id.unique().tolist()
number_of_customer = len(c_ids)
customer_encoding = {c_id: i for i, c_id in enumerate(c_ids)}
p_ids = transactions.article_id.unique().tolist()
number_of_products = len(p_ids)
product_encoding = {p_id: i for i, p_id in enumerate(p_ids)}

# Replace the raw ids by their indices.  `.assign` builds a new frame, so no
# chained-assignment warning is triggered on the column-sliced frame above.
# (The per-customer purchase count that used to be merged in here was never
# read by any later step and has been dropped.)
transactions = transactions.assign(
    customer_id=transactions.customer_id.map(customer_encoding),
    article_id=transactions.article_id.map(product_encoding),
)
p_ids = list(product_encoding.values())

# Persist the mappings so the prediction section can translate ids back to
# rows of the factor matrices.  JSON stores the keys as strings.
with open(model_data_path+'customer_id_encoding.json', 'w') as fp:
    json.dump(customer_encoding, fp)

with open(model_data_path+'product_id_encoding.json', 'w') as fp:
    json.dump(product_encoding, fp)

del c_ids

In [4]:
# Binary customer x article interaction matrix: entry (c, a) is 1 when
# customer c bought article a at least once.
#
# The matrix is built directly in sparse form from its coordinates instead of
# allocating a dense (customers x articles) array and filling it in a Python
# loop; both NMF and LatentDirichletAllocation accept SciPy sparse input.
# Pairs are de-duplicated first because the coordinate constructor SUMS
# repeated entries, which would turn repeat purchases into counts > 1.
interactions = transactions[["customer_id", "article_id"]].drop_duplicates()
sparseMatrix = csc_matrix(
    (
        np.ones(len(interactions), dtype=np.int8),
        (interactions.customer_id.values, interactions.article_id.values),
    ),
    shape=(len(customer_encoding), len(product_encoding)),
)
print(sparseMatrix.shape)

# Free the large intermediates; the encodings are reloaded from disk in the
# prediction section.
del product_encoding, customer_encoding, interactions, transactions

(684744, 9857)


In [6]:
# Factorise the customer x article matrix with NMF into 128 latent components.
# The seed is fixed because the factors are randomly initialised.
model = NMF(n_components=128, init='random', random_state=43, max_iter=500)

# W has one row per customer, H one row per latent component.
W = model.fit_transform(sparseMatrix)
H = model.components_

In [None]:
# Persist the NMF factors.  H is stored transposed so that, like W, it has one
# row per entity (article) and one column per latent component.
for file_name, factor in (('nmf_model_W.pkl', W), ('nmf_model_H.pkl', H.T)):
    with open(model_data_path+file_name, 'wb') as f:
        pickle.dump(factor, f)

In [None]:
# Fit a 128-topic LDA model on the same customer x article matrix, with a
# fixed seed for reproducibility.  This rebinds `model`, `W` and `H`.
model = LatentDirichletAllocation(n_components=128, random_state=43, max_iter=500)

# W has one row per customer, H one row per topic.
W = model.fit_transform(sparseMatrix)
H = model.components_

In [None]:
# Persist the LDA factors.  H is stored transposed so that, like W, it has one
# row per entity (article) and one column per topic.
for file_name, factor in (('lda_model_W.pkl', W), ('lda_model_H.pkl', H.T)):
    with open(model_data_path+file_name, 'wb') as f:
        pickle.dump(factor, f)

## Prediction

In [5]:
# Reload the id -> row-index mappings written during training.
with open(model_data_path+'customer_id_encoding.json', 'r') as fp:
    customer_encoding = json.load(fp)

with open(model_data_path+'product_id_encoding.json', 'r') as fp:
    # JSON object keys are always strings; convert them back to integers so
    # they can be matched against the article ids below.
    product_encoding = {int(k): v for k, v in json.load(fp).items()}

# One row per customer, in sorted customer_id order; this fixes the row order
# of everything computed from `customer_ids` afterwards.
customers_df = (
    pd.read_pickle(path+"customers.pkl")
    .drop_duplicates(subset=["customer_id"])
    .sort_values(by="customer_id")
)
customer_ids = customers_df["customer_id"].map(customer_encoding).values
articles = pd.read_pickle(path+"articles.pkl").drop_duplicates(subset=["article_id"])
article_ids = articles["article_id"].map(product_encoding).tolist()

# An id that is absent from the saved encodings maps to NaN, which would only
# surface later as an opaque "indices must be of integer type" IndexError.
if pd.isna(customer_ids).any() or pd.isna(article_ids).any():
    raise ValueError(
        "customers.pkl / articles.pkl contain ids that are missing from the "
        "saved encodings; filter them out before looking up factor rows"
    )

# Select the factor rows in the order established above.
with open(model_data_path + 'nmf_model_W.pkl', 'rb') as f:
    customers = tf.convert_to_tensor(pickle.load(f)[customer_ids], dtype=tf.float32)

with open(model_data_path + 'nmf_model_H.pkl', 'rb') as f:
    # Fix: the tensor used to be built from `customers`, which discarded the
    # article factors and made every later score customer-vs-customer.
    products = tf.convert_to_tensor(pickle.load(f)[article_ids], dtype=tf.float32)

In [6]:
# For every customer, score all articles with the NMF factors and store the
# 150 best matches in an HDF5 dataset of shape (customers, 2, 150):
#   [:, 0, :] -> article row indices (positions in `products`), as float32
#   [:, 1, :] -> the corresponding scores
batch = 512              # customers scored per matmul
step = 15000 // batch    # batches buffered in memory between writes to disk
top_n = 150              # matches kept per customer; needs >= top_n articles

n_customers = int(customers.shape[0])
n_batches = -(-n_customers // batch)  # ceiling division

# The context manager closes the file even if scoring fails part-way.
with h5py.File('model_data/collabrative/nmf.h5', 'w', libver='latest') as f:
    dset = f.create_dataset("nmf", (n_customers, 2, top_n), dtype=np.float32, compression='gzip')

    ptr = 0
    score_temp = []
    indices_temp = []
    for i, batch_i in enumerate(range(0, n_customers, batch)):
        customer_batch = tf.nn.l2_normalize(customers[batch_i:batch_i+batch], 1)
        batch_distances = tf.matmul(customer_batch, products, transpose_b=True)

        # top_k works along the last axis, so the whole batch is ranked at once.
        values, indices = tf.math.top_k(batch_distances, k=top_n)
        score_temp.append(values.numpy())
        indices_temp.append(indices.numpy().astype(np.float32))

        # Flush when the buffer is full and always after the final batch, so a
        # single-batch run or an exact multiple of `batch` loses no rows.
        if len(score_temp) >= step or i == n_batches - 1:
            buffered = np.stack(
                [np.concatenate(indices_temp), np.concatenate(score_temp)], axis=1
            )
            dset[ptr:ptr+buffered.shape[0]] = buffered
            ptr += buffered.shape[0]
            score_temp = []
            indices_temp = []

        print('\r' + f'{i*batch}: %{round(100*i*batch/customers.shape[0], 2)}', end='')

684544: %99.97

In [13]:
# Load the LDA factors, selecting rows in the same order as `customer_ids`
# and `article_ids`.
with open(model_data_path + 'lda_model_W.pkl', 'rb') as f:
    customers = tf.convert_to_tensor(pickle.load(f)[customer_ids], dtype=tf.float32)

with open(model_data_path + 'lda_model_H.pkl', 'rb') as f:
    # Fix: the tensor used to be built from `customers`, which discarded the
    # article factors and made every later score customer-vs-customer.
    products = tf.convert_to_tensor(pickle.load(f)[article_ids], dtype=tf.float32)

In [14]:
# For every customer, score all articles with the LDA factors and store the
# 150 best matches in an HDF5 dataset of shape (customers, 2, 150):
#   [:, 0, :] -> article row indices (positions in `products`), as float32
#   [:, 1, :] -> the corresponding scores
batch = 512              # customers scored per matmul
step = 15000 // batch    # batches buffered in memory between writes to disk
top_n = 150              # matches kept per customer; needs >= top_n articles

n_customers = int(customers.shape[0])
n_batches = -(-n_customers // batch)  # ceiling division

# The context manager closes the file even if scoring fails part-way.
with h5py.File('model_data/collabrative/lda.h5', 'w', libver='latest') as f:
    dset = f.create_dataset("lda", (n_customers, 2, top_n), dtype=np.float32, compression='gzip')

    ptr = 0
    score_temp = []
    indices_temp = []
    for i, batch_i in enumerate(range(0, n_customers, batch)):
        customer_batch = tf.nn.l2_normalize(customers[batch_i:batch_i+batch], 1)
        batch_distances = tf.matmul(customer_batch, products, transpose_b=True)

        # top_k works along the last axis, so the whole batch is ranked at once.
        values, indices = tf.math.top_k(batch_distances, k=top_n)
        score_temp.append(values.numpy())
        indices_temp.append(indices.numpy().astype(np.float32))

        # Flush when the buffer is full and always after the final batch, so a
        # single-batch run or an exact multiple of `batch` loses no rows.
        if len(score_temp) >= step or i == n_batches - 1:
            buffered = np.stack(
                [np.concatenate(indices_temp), np.concatenate(score_temp)], axis=1
            )
            dset[ptr:ptr+buffered.shape[0]] = buffered
            ptr += buffered.shape[0]
            score_temp = []
            indices_temp = []

        print('\r' + f'{i*batch}: %{round(100*i*batch/customers.shape[0], 2)}', end='')

684544: %99.97