In [97]:
import pandas as pd
from mxnet import gluon, np, npx, autograd
from mxnet.gluon import nn
import d2l
import mxnet as mx
npx.set_np()

In [2]:
# read user and items into a dataframe which is then converted into csv
# this part takes a while and is only done ONCE
# after creating csv, we can upload that into a dataframe directly

#def read_file(f):
 #   for l in open(f):
 #       yield eval(l)
#df = pd.DataFrame()

#for l in read_file("train.json"):
 #   reviewerID,itemID = l['reviewerID'],l['itemID']
 #   df = df.append({'reviewerID': reviewerID, 'itemID': itemID}, ignore_index = True)
#df.to_csv("train.csv")

In [103]:
# now we can upload csv straight into dataframe
data = pd.read_csv("train.csv")
data = data.drop(data.columns[0], axis=1)  # drop the unnamed column
# check to see if there are any duplicate users + items
len(data[data.duplicated()])
# add a column to indicate item was purchased
data['Purchased'] = 1

In [104]:
# to speed things up, working with 1000 rows for now..need to remove this part later
data = data.drop(data.index[1000:]) 

In [105]:
# create pivot table to show every reviewerID and every itemID
# this will allow us identify users who did not purchase an item as well
df_matrix = pd.pivot_table(data, values='Purchased', index='reviewerID', columns='itemID')
df_matrix = df_matrix.reset_index()
# undo pivot table and save it as data
data = pd.melt(df_matrix, id_vars=['reviewerID'], value_name='Purchased')  # this takes some time to run
data = data.fillna(0)  # replace NaN with 0

In [106]:
num_users = data["reviewerID"].unique().shape[0]
num_items = data["itemID"].unique().shape[0]

In [107]:
# since we need our data to be numeric, extracting numerics
# KEEP THIS IN MIND WHEN GENERATING PREDICTIONS FOR TEST SET
data['reviewerID'] = data['reviewerID'].str.extract('(\d+)')
data['itemID'] = data['itemID'].str.extract('(\d+)')

In [108]:
#split data into train and validation set, ensuring equal proportion of labels in both
from sklearn.model_selection import StratifiedShuffleSplit
def train_validate_split(data, train_size=0.8, validate_size=0.2):
    # first we shuffle and split all data into train and test set with equal label proportions
    sss = StratifiedShuffleSplit(n_splits = 1, train_size = train_size)
    for train_index, validate_index in sss.split(data, data['Purchased']):
        train, validate = data.iloc[train_index, : ], data.iloc[validate_index, : ]
    return(train, validate)

train, validate = train_validate_split(data)

In [110]:
train_u, train_i, train_p = np.array(train['reviewerID'], dtype = 'float32'), np.array(train['itemID'], dtype = 'float32'), np.array(train['Purchased'], dtype = 'float32')
validate_u, validate_i, validate_p = np.array(validate['reviewerID'], dtype = 'float32'), np.array(validate['itemID'], dtype = 'float32'), np.array(validate['Purchased'], dtype = 'float32')

In [111]:
train_set = gluon.data.ArrayDataset(train_u, train_i, train_p)
train_iter = gluon.data.DataLoader(train_set, shuffle=True, last_batch='rollover',batch_size=256)

validate_set = gluon.data.ArrayDataset(validate_u, validate_i, validate_p)
validate_iter = gluon.data.DataLoader(validate_set, shuffle=False, last_batch='rollover',batch_size=256)

class MF_user_item_bias(nn.Block):
    def __init__(self, num_factors, num_users, num_items, **kwargs):
        super(MF_user_item_bias, self).__init__(**kwargs)
        self.P = nn.Embedding(input_dim=num_users, output_dim=num_factors)
        self.Q = nn.Embedding(input_dim=num_items, output_dim=num_factors)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)

    def forward(self, user_id, item_id):
        P_u = self.P(user_id)
        Q_i = self.Q(item_id)
        b_u = self.user_bias(user_id)
        b_i = self.item_bias(item_id)
        outputs = (P_u * Q_i).sum(axis=1) + np.squeeze(b_u) + np.squeeze(b_i)
        return outputs.flatten()

def evaluator(net, test_iter, ctx):
    rmse = mx.metric.RMSE()  # Get the RMSE
    rmse_list = []
    for idx, (users, items, ratings) in enumerate(test_iter):
        u = gluon.utils.split_and_load(users, ctx, even_split=False)
        i = gluon.utils.split_and_load(items, ctx, even_split=False)
        r_ui = gluon.utils.split_and_load(ratings, ctx, even_split=False)
        r_hat = [net(u, i) for u, i in zip(u, i)]
        rmse.update(labels=r_ui, preds=r_hat)
        rmse_list.append(rmse.get()[1])
    return float(np.mean(np.array(rmse_list)))

In [216]:
# model for user and item only
ctx = d2l.try_all_gpus()
ctx_list=d2l.try_all_gpus()
net = MF_user_item_bias(30, num_users, num_items)
net.initialize(ctx=ctx, force_reinit=True, init=mx.init.Normal(0.01))
lr, num_epochs, wd, optimizer = 0.002, 30, 1e-5, 'adam'
loss = gluon.loss.L2Loss()
trainer = gluon.Trainer(net.collect_params(), optimizer,{"learning_rate": lr, 'wd': wd})
    
for epoch in range(num_epochs):
    l = 0
    metric = d2l.Accumulator(3)
    for i, values in enumerate(train_iter):
        input_data = []  # 3 arrays: train_u, train_i, train_p
        values = values if isinstance(values, list) else [values]
        for v in values:
            input_data.append(gluon.utils.split_and_load(v, ctx_list))
        train_feat = input_data[0:-1] if len(values) > 1 else input_data   # 2 arrays: train_u, train_i
        train_label = input_data[-1]  # train_p
        with autograd.record():
            preds = [net(*t) for t in zip(*train_feat)]
            ls = [loss(p, s) for p, s in zip(preds, train_label)]
        [l.backward() for l in ls]
        l += sum([l.asnumpy() for l in ls]).mean() / len(ctx_list)
        trainer.step(values[0].shape[0])
        metric.add(l, values[0].shape[0], values[0].size)
    train_l = l / (i + 1)

In [219]:
def evaluate_accuracy(net, data_iter):
    metric = Accumulator(2)  # num_corrected_examples, num_examples
    for i, values in enumerate(train_iter):
        input_data = []  # 3 arrays: train_u, train_i, train_p
        values = values if isinstance(values, list) else [values]
        for v in values:
            input_data.append(gluon.utils.split_and_load(v, ctx_list))
        train_feat = input_data[0:-1] if len(values) > 1 else input_data   # 2 arrays: train_u, train_i
        train_label = input_data[-1]  # train_p
        preds = [net(*t) for t in zip(*train_feat)]
        metric.add(d2l.accuracy(np.array(preds), np.array(train_label), np.array(train_label).size))
        metric[0] / metric[1]

d2l.evaluate_accuracy(net, train_iter)

ZeroDivisionError: float division by zero

256