In [None]:
import json
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from dask import dataframe as dd
from sklearn.utils import shuffle

## Matrix Factorization Model (GMF)

In [None]:
class RecommenderNet(keras.Model):
    """Matrix-factorisation scorer for (user, item) index pairs.

    Each user and each item gets an embedding vector plus a scalar bias.
    The score of a pair is the dot product of the two vectors plus both
    biases, squashed through a sigmoid.

    Args:
        num_users: Number of distinct user indices (embedding table rows).
        num_movies: Number of distinct item indices (embedding table rows).
        embedding_size: Width of the user and item embedding vectors.
        **kwargs: Forwarded to ``keras.Model``.
    """

    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        # `keras.layers` is used explicitly: the notebook imports `keras`
        # but never a bare `layers` name.
        self.user_embedding = keras.layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = keras.layers.Embedding(num_users, 1)
        self.movie_embedding = keras.layers.Embedding(
            num_movies,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.movie_bias = keras.layers.Embedding(num_movies, 1)

    def call(self, inputs):
        """Score a batch of pairs.

        Args:
            inputs: Integer tensor indexed as ``inputs[:, 0]`` (user index)
                and ``inputs[:, 1]`` (item index).

        Returns:
            Tensor with one sigmoid-activated score per row, shaped
            ``(batch, 1)``. The values are probabilities, not logits.
        """
        user_ids = inputs[:, 0]
        movie_ids = inputs[:, 1]
        user_vector = self.user_embedding(user_ids)
        user_bias = self.user_bias(user_ids)
        movie_vector = self.movie_embedding(movie_ids)
        movie_bias = self.movie_bias(movie_ids)
        # Per-row dot product. Contracting with tf.tensordot(..., 2) would sum
        # over the batch axis as well and give one scalar for the whole batch.
        dot_user_movie = tf.reduce_sum(
            user_vector * movie_vector, axis=1, keepdims=True
        )
        # Add all the components (including bias)
        x = dot_user_movie + user_bias + movie_bias
        # The sigmoid activation forces the score to lie between 0 and 1
        return tf.nn.sigmoid(x)


## Data Loader

### Preparing the data

In [None]:
# Only the two id columns are needed; the dask frame stays lazy until computed.
transactions = dd.read_csv('data/transactions_train.csv')[["customer_id", "article_id"]]

# Materialise the distinct ids once. Calling len() and enumerate() directly on
# a lazy dask Series triggers a separate pass over the CSV for each call.
# .tolist() yields plain Python scalars, which json.dump accepts as dict keys.
c_ids = transactions.customer_id.unique().compute().tolist()
number_of_customer = len(c_ids)
customer_encoding = {c_id: i for i, c_id in enumerate(c_ids)}
p_ids = transactions.article_id.unique().compute().tolist()
number_of_products = len(p_ids)
product_encoding = {p_id: i for i, p_id in enumerate(p_ids)}

# Persist both raw-id -> dense-index mappings.
# NOTE(review): open(..., 'w') does not create directories, so `model_data/`
# must already exist.
with open('model_data/customer_id_encoding.json', 'w') as fp:
    json.dump(customer_encoding, fp)

with open('model_data/product_id_encoding.json', 'w') as fp:
    json.dump(product_encoding, fp)

# Replace raw ids with their dense indices (still lazy at this point).
transactions.customer_id = transactions.customer_id.map(customer_encoding)
transactions.article_id = transactions.article_id.map(product_encoding)
# From here on p_ids holds the encoded product indices, not the raw ids.
p_ids = list(product_encoding.values())
del c_ids, customer_encoding, product_encoding

In [None]:
# Collapse the lazy frame into one in-memory row per customer whose
# `article_ids` column holds the list of that customer's encoded articles.
transactions = (
    transactions.groupby(["customer_id"])['article_id']
    .apply(lambda group: list(group), meta=("article_ids", object))
    .reset_index()
    .compute()
    .drop_duplicates(subset=["customer_id"])
)

# Keep at most 128 purchases per customer.
transactions["hist_len"] = transactions.article_ids.apply(
    lambda ids: min(len(ids), 128)
)
# Positives: a random draw of `hist_len` entries from the purchase list.
transactions["prod_ids"] = transactions.apply(
    lambda row: random.sample(row.article_ids, row.hist_len),
    axis=1,
)
# Negatives: draw 256 random products, drop any that are among the positives,
# and keep the first 128 of what remains.
transactions["not_prods"] = transactions.prod_ids.apply(
    lambda bought: [
        candidate
        for candidate in random.sample(p_ids, 256)
        if candidate not in bought
    ][:128]
)
transactions = transactions[["customer_id", "prod_ids", "not_prods"]]
del p_ids

In [None]:
class Generator():
    """Base sampler that turns one row per customer into labelled examples.

    Args:
        data: Frame providing the columns ``customer_id``, ``prod_ids`` and
            ``not_prods`` (the columns the generator methods read).
        positive_sample_length: How many entries to draw from ``prod_ids``
            per customer (label 1).
        negative_sample_length: How many entries to draw from ``not_prods``
            per customer (label 0).
        batch_size: Total examples per batch; must be a multiple of
            ``positive_sample_length + negative_sample_length``.

    Raises:
        ValueError: If ``batch_size`` is not such a multiple.
    """

    def __init__(self, data, positive_sample_length, negative_sample_length, batch_size):
        self.data = data
        self.positive_sample_length = positive_sample_length
        self.negative_sample_length = negative_sample_length
        self.sample_length = positive_sample_length + negative_sample_length
        leftover = batch_size % self.sample_length
        if leftover != 0:
            raise ValueError("batch_size must be divisible by sum of positive_sample_length and negative_sample_length")

    def user_info_generator(self):
        """Yield, per customer, an int32 tensor ``[product_ids, labels]``.

        Row 0 holds the sampled product ids (positives then negatives) and
        row 1 the matching 1/0 labels; both rows are shuffled with the same
        random permutation.
        """
        wanted_columns = ["customer_id", "prod_ids", "not_prods"]
        for _, record in self.data[wanted_columns].iterrows():
            n_pos = self.positive_sample_length
            n_neg = self.negative_sample_length
            # Positives are drawn first, then negatives.
            positives = random.sample(record["prod_ids"], n_pos)
            negatives = random.sample(record["not_prods"], n_neg)
            product_ids = np.asarray(positives + negatives)
            targets = np.asarray([1] * n_pos + [0] * n_neg)
            # One permutation reorders ids and labels together.
            order = np.arange(self.sample_length)
            np.random.shuffle(order)
            yield tf.convert_to_tensor([product_ids[order], targets[order]], dtype=tf.int32)

    def user_id_generator(self):
        """Yield, per customer, the customer id repeated ``sample_length`` times."""
        for _, record in self.data[["customer_id"]].iterrows():
            repeated_id = [record["customer_id"]] * self.sample_length
            yield tf.convert_to_tensor(repeated_id, dtype=tf.int32)

    def shuffle(self):
        """Replace ``self.data`` with a shuffled copy (module-level ``shuffle``)."""
        reordered = shuffle(self.data)
        self.data = reordered

class TGenerator(Generator):
    """Batch source that pairs customer ids with sampled products and labels.

    Args:
        data: Frame with ``customer_id``, ``prod_ids`` and ``not_prods`` columns.
        positive_sample_length: Positives drawn per customer.
        negative_sample_length: Negatives drawn per customer.
        batch_size: Examples per yielded batch; ``Generator.__init__`` rejects
            values that are not a multiple of the per-customer sample length.
    """

    def __init__(self, data, positive_sample_length, negative_sample_length, batch_size):
        super().__init__(data, positive_sample_length, negative_sample_length, batch_size)
        # Customers per batch. Floor division avoids the float round-trip of
        # int(a / b); the base class has already checked divisibility.
        self.mini_batch = int(batch_size // self.sample_length)
        self.batch = batch_size
        # Both datasets walk self.data in the same row order, so zipping them
        # in get() lines each customer id up with its sampled products.
        self.user_info_loader = (
            tf.data.Dataset.from_generator(self.user_info_generator, output_types=tf.int32)
            .batch(self.mini_batch, drop_remainder=True)
            .repeat()
        )
        self.user_id_loader = (
            tf.data.Dataset.from_generator(self.user_id_generator, output_types=tf.int32)
            .batch(self.mini_batch, drop_remainder=True)
            .repeat()
        )

    def get(self):
        """Yield ``(features, labels)`` batches indefinitely.

        ``features`` stacks customer id and product id along axis 1 and has
        ``self.batch`` rows; ``labels`` holds ``self.batch`` 1/0 targets.
        Both loaders are ``.repeat()``-ed, so this generator never stops on
        its own.
        """
        # tf.reshape expects a 1-D shape; a bare scalar is legacy usage.
        flat_shape = [self.batch]
        for c_ids, info in zip(self.user_id_loader, self.user_info_loader):
            customer_column = tf.reshape(c_ids, flat_shape)
            product_column = tf.reshape(info[:, 0, :], flat_shape)
            # Named `features` so the builtin `input` is not shadowed.
            features = tf.stack([customer_column, product_column], axis=1)
            labels = tf.reshape(info[:, 1, :], flat_shape)
            yield features, labels

class VGenerator(Generator):
    """Batch source that pairs customer ids with sampled products and labels.

    NOTE(review): presumably the validation-side counterpart of the training
    generator (going by the name); confirm intended use.

    Args:
        data: Frame with ``customer_id``, ``prod_ids`` and ``not_prods`` columns.
        positive_sample_length: Positives drawn per customer.
        negative_sample_length: Negatives drawn per customer.
        batch_size: Examples per yielded batch; ``Generator.__init__`` rejects
            values that are not a multiple of the per-customer sample length.
    """

    def __init__(self, data, positive_sample_length, negative_sample_length, batch_size):
        super().__init__(data, positive_sample_length, negative_sample_length, batch_size)
        # Customers per batch. Floor division avoids the float round-trip of
        # int(a / b); the base class has already checked divisibility.
        self.mini_batch = int(batch_size // self.sample_length)
        self.batch = batch_size
        # Both datasets walk self.data in the same row order, so zipping them
        # in get() lines each customer id up with its sampled products.
        self.user_info_loader = (
            tf.data.Dataset.from_generator(self.user_info_generator, output_types=tf.int32)
            .batch(self.mini_batch, drop_remainder=True)
            .repeat()
        )
        self.user_id_loader = (
            tf.data.Dataset.from_generator(self.user_id_generator, output_types=tf.int32)
            .batch(self.mini_batch, drop_remainder=True)
            .repeat()
        )

    def get(self):
        """Yield ``(features, labels)`` batches indefinitely.

        ``features`` stacks customer id and product id along axis 1 and has
        ``self.batch`` rows; ``labels`` holds ``self.batch`` 1/0 targets.
        Both loaders are ``.repeat()``-ed, so this generator never stops on
        its own.
        """
        # tf.reshape expects a 1-D shape; a bare scalar is legacy usage.
        flat_shape = [self.batch]
        for c_ids, info in zip(self.user_id_loader, self.user_info_loader):
            customer_column = tf.reshape(c_ids, flat_shape)
            product_column = tf.reshape(info[:, 0, :], flat_shape)
            # Named `features` so the builtin `input` is not shadowed.
            features = tf.stack([customer_column, product_column], axis=1)
            labels = tf.reshape(info[:, 1, :], flat_shape)
            yield features, labels

In [None]:
# Embedding width of 128 for both customers and products.
model = RecommenderNet(number_of_customer, number_of_products, 128)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    # RecommenderNet.call ends with tf.nn.sigmoid, so the loss receives
    # probabilities. from_logits=True would apply a second sigmoid inside
    # the loss.
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=["accuracy"],
)
# One positive and one negative example per customer, 256 examples per batch.
train_loader = TGenerator(transactions, 1, 1, batch_size=256)
train = train_loader.get()

In [None]:
# `train` never runs out (both underlying datasets are .repeat()-ed), so Keras
# needs an explicit epoch length. One epoch is one pass over the customers:
# each batch consumes `mini_batch` customers and the remainder is dropped.
model.fit(
    train,
    epochs=1,
    steps_per_epoch=len(transactions) // train_loader.mini_batch,
)