In [11]:
from sklearn import linear_model
import sklearn
import json
from matplotlib import pyplot as plt
from collections import defaultdict
import numpy
import random
import gzip
import math
import string
import scipy
from scipy import sparse
from implicit import bpr
import tensorflow as tf
from collections import defaultdict

import gzip
from collections import defaultdict

from typing import List


In [12]:
def readGz(path):
    """Yield one evaluated Python object per line of a gzip-compressed text file.

    Args:
        path: path of the gzip file; it is opened in text mode.

    Yields:
        The result of evaluating each line as a Python expression.
    """
    # The context manager closes the file once the generator is exhausted or
    # closed, instead of leaving the handle open.
    with gzip.open(path, 'rt') as f:
        for l in f:
            # SECURITY NOTE(review): eval() executes arbitrary code contained in
            # the file. Only use this on trusted data; if every line is a plain
            # literal, ast.literal_eval would be the safer choice.
            yield eval(l)

def readCSV(path):
    """Yield (u, b, r) tuples from a gzip-compressed, comma-separated file.

    The first line is treated as a header and skipped. Every remaining
    non-blank line must hold exactly three comma-separated fields; the third
    one is converted to ``int``.

    Args:
        path: path of the gzip file; it is opened in text mode.

    Yields:
        ``(u, b, r)`` where ``u`` and ``b`` are strings and ``r`` is an int.

    Raises:
        ValueError: if a line does not have three fields or the third field
            is not an integer.
    """
    # The context manager closes the file once the generator is exhausted or
    # closed, instead of leaving the handle open.
    with gzip.open(path, 'rt') as f:
        f.readline()  # skip the header row
        for l in f:
            l = l.strip()
            if not l:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            u, b, r = l.split(',')
            yield u, b, int(r)

In [13]:
def accuracy(pred, y):
    """Return the fraction of positions where ``pred`` and ``y`` agree.

    Both arguments are interpreted element-wise as booleans (non-zero is
    true), so 0/1 labels and True/False labels work alike.

    Args:
        pred: sequence or array of predicted labels.
        y: sequence or array of true labels, aligned with ``pred``.

    Returns:
        (true positives + true negatives) / len(pred).

    Raises:
        ZeroDivisionError: if ``pred`` is empty.
    """
    # Only the agreeing cells (TP and TN) contribute to accuracy, so the
    # false-positive / false-negative masks are not needed.
    true_pos = numpy.count_nonzero(numpy.logical_and(pred, y))
    true_neg = numpy.count_nonzero(
        numpy.logical_and(numpy.logical_not(pred), numpy.logical_not(y)))
    return (true_pos + true_neg) / len(pred)


In [14]:
# Load every (user, book, rating) interaction into memory.
entire_dataset = list(readCSV("train_Interactions.csv.gz"))

# Shuffle so that the positional split below (and the validation slice taken
# from the remainder in a later cell) is a random sample of the interactions.
random.shuffle(entire_dataset)

train_data = entire_dataset[:190000]

# Lookup structures built over ALL interactions (not only train_data):
#   booksPerUser_all   user -> set of books the user interacted with
#   usersPerBook_all   book -> set of users who interacted with it
#   ratingsPerBook_all book -> list of ratings it received
#   userIDs / itemIDs  string ID -> dense integer index, in first-seen order
booksPerUser_all = defaultdict(set)
usersPerBook_all = defaultdict(set)
ratingsPerBook_all = defaultdict(list)
userIDs, itemIDs = {}, {}

for u, b, r in entire_dataset:
    booksPerUser_all[u].add(b)
    usersPerBook_all[b].add(u)
    ratingsPerBook_all[b].append(r)
    # setdefault only assigns a new index the first time an ID is seen
    userIDs.setdefault(u, len(userIDs))
    itemIDs.setdefault(b, len(itemIDs))

nUsers, nItems = len(userIDs), len(itemIDs)


## Problem 1: Have read?


In [15]:
# Build a validation set with 50% read and 50% unread pairs.
# Positives: every held-out interaction, labelled 1.
valid_data = []
for u, b, _ in entire_dataset[190000:]:
    valid_data.append((u, b, 1))

items = list(itemIDs.keys())
set_of_books = set(items)

booksPerUser_valid = defaultdict(set)
for u, b, r in valid_data:
    booksPerUser_valid[u].add(b)

# Negatives: for each positive, draw one book the same user has not read.
# "Not read" is checked against booksPerUser_all (every known interaction of
# the user), not only the validation slice; otherwise a book the user read in
# the training portion could be labelled as unread.
# Rejection sampling avoids materialising the full set difference per sample.
notRead_valid_set = []
for u, _, _ in valid_data:
    b = random.choice(items)
    while b in booksPerUser_all[u]:
        b = random.choice(items)
    notRead_valid_set.append((u, b))

# Build a NEW list so that valid_data keeps holding the positives only.
valid_data_q1 = valid_data + [(u, b, 0) for u, b in notRead_valid_set]

random.shuffle(valid_data_q1)

In [34]:
# Load the (user, book) pairs that must be predicted.
# The predictions file itself is written (header included) by the cell that
# dumps pred_data. No write handle is opened here: an unclosed handle to
# "predictions_Read.csv" would flush its buffered header over that file
# whenever it eventually got closed.
test_dataset = []
with open("pairs_Read.csv") as test_data:
    for l in test_data:
        if l.startswith("userID"):
            # header row
            continue
        u, b = l.strip().split(',')
        test_dataset.append((u, b))
        # Users/books that never appeared in the interaction data still need
        # an index so the model's parameter tables cover them.
        if u not in userIDs:
            userIDs[u] = len(userIDs)
        if b not in itemIDs:
            itemIDs[b] = len(itemIDs)


In [17]:
class BPRbatch(tf.keras.Model):
    """Latent-factor model trained with a pairwise (BPR-style) ranking loss.

    The score of user ``u`` for item ``i`` is
    ``betaI[i] + dot(gammaU[u], gammaI[i])``.

    The parameter tables are sized from the module-level ``userIDs`` and
    ``itemIDs`` dictionaries at construction time, so those must already
    contain every user/item that will be scored.

    NOTE(review): ``predict`` shadows ``tf.keras.Model.predict``; the name is
    kept because other cells call it.
    """

    def __init__(self, K, lamb):
        """
        Args:
            K: number of latent dimensions per user/item.
            lamb: coefficient applied to the regularizer in ``reg``.
        """
        super().__init__()
        # Per-item bias, one scalar per entry of itemIDs
        self.betaI = tf.Variable(
            tf.random.normal([len(itemIDs)], stddev=0.001))
        # Latent factors: one K-vector per user and per item
        self.gammaU = tf.Variable(tf.random.normal(
            [len(userIDs), K], stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal(
            [len(itemIDs), K], stddev=0.001))
        # Regularization coefficient
        self.lamb = lamb

    def predict(self, u, i):
        """Score a single (user index, item index) pair; returns a scalar tensor."""
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    def reg(self):
        """Return ``lamb`` times the summed ``tf.nn.l2_loss`` of all parameters."""
        return self.lamb * (tf.nn.l2_loss(self.betaI) +
                            tf.nn.l2_loss(self.gammaU) +
                            tf.nn.l2_loss(self.gammaI))

    def score(self, sampleU, sampleI):
        """Score a batch of pairs.

        Args:
            sampleU: sequence of user indices.
            sampleI: sequence of item indices, aligned with ``sampleU``.

        Returns:
            1-D tensor with one score per (user, item) pair.
        """
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        """Return the mean pairwise loss for a batch of (u, i, j) triples.

        For each triple the loss is ``-log(sigmoid(x_ui - x_uj))``, where
        ``i`` is the item that should score higher than ``j``.
        """
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        # log_sigmoid is evaluated in a numerically stable way; computing
        # log(sigmoid(x)) directly yields -inf once sigmoid(x) underflows to 0
        # for strongly negative x.
        return -tf.reduce_mean(tf.math.log_sigmoid(x_ui - x_uj))


In [18]:
# The model is created only after "userIDs" and "itemIDs" have been extended
# with the test pairs, because some of those users/books do not exist in
# "train_interactions.csv.gz" and still need parameters.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
modelBPR = BPRbatch(K=5, lamb=0.00001)


In [19]:
def trainingStepBPR(model, interactions, Nsamples=50000):
    """Run one gradient step on a freshly sampled batch of (u, i, j) triples.

    Relies on the module-level ``optimizer``, ``items``, ``booksPerUser_all``,
    ``userIDs`` and ``itemIDs``.

    Args:
        model: object callable as ``model(sampleU, sampleI, sampleJ)`` that
            returns the loss tensor and exposes ``reg()`` and
            ``trainable_variables``.
        interactions: sequence of (user, item, rating) tuples; positives are
            drawn from it uniformly with replacement.
        Nsamples: number of triples sampled for this step.

    Returns:
        The value of the regularized objective for the sampled batch, as
        returned by ``loss.numpy()``.
    """
    # Sampling is plain Python, so it does not need to run under the tape.
    sampleU, sampleI, sampleJ = [], [], []
    for _ in range(Nsamples):
        u, i, _rating = random.choice(interactions)  # positive sample
        j = random.choice(items)  # negative sample
        # redraw until j is a book the user has no recorded interaction with
        while j in booksPerUser_all[u]:
            j = random.choice(items)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleJ.append(itemIDs[j])

    with tf.GradientTape() as tape:
        loss = model(sampleU, sampleI, sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    # Variables that did not take part in the loss have a None gradient.
    optimizer.apply_gradients(
        (grad, var)
        for grad, var in zip(gradients, model.trainable_variables)
        if grad is not None)
    return loss.numpy()


In [20]:
# Train for 300 steps on all interactions, reporting the objective every 10th step.
for i in range(300):
    obj = trainingStepBPR(modelBPR, entire_dataset)
    if (i + 1) % 10 == 0:
        print("iteration " + str(i + 1) + ", objective = " + str(obj))

iteration 10, objective = 0.4693931
iteration 20, objective = 0.43645096
iteration 30, objective = 0.44130868
iteration 40, objective = 0.4323181
iteration 50, objective = 0.42410135
iteration 60, objective = 0.42429882
iteration 70, objective = 0.42425352
iteration 80, objective = 0.4206741
iteration 90, objective = 0.4186772
iteration 100, objective = 0.41640985
iteration 110, objective = 0.41679323
iteration 120, objective = 0.41774333
iteration 130, objective = 0.41801995
iteration 140, objective = 0.41503876
iteration 150, objective = 0.41429076
iteration 160, objective = 0.4145111
iteration 170, objective = 0.41655073
iteration 180, objective = 0.41342944
iteration 190, objective = 0.4129957
iteration 200, objective = 0.4132862
iteration 210, objective = 0.41189373
iteration 220, objective = 0.41488746
iteration 230, objective = 0.4150156
iteration 240, objective = 0.4158305
iteration 250, objective = 0.414916
iteration 260, objective = 0.41441554
iteration 270, objective = 0.415

In [21]:
# user -> list of (score, book) for every test pair of that user
itemsScorePerUser_test = defaultdict(list)
for u, b in test_dataset:
    pred = modelBPR.predict(userIDs[u], itemIDs[b]).numpy()
    itemsScorePerUser_test[u].append((pred, b))

# Order each user's candidates from highest to lowest score
for scored_books in itemsScorePerUser_test.values():
    scored_books.sort(reverse=True)


In [22]:
# Checking data in prediction data structure
for u in list(itemsScorePerUser_test.keys())[:10]:
    print(f"u: {u}, items: {itemsScorePerUser_test[u]}")


u: u37758667, items: [(1.0866337, 'b99713185'), (-0.066176794, 'b05213070')]
u: u85626045, items: [(3.8862379, 'b29802159'), (0.84620416, 'b00524816'), (0.7505729, 'b31024771'), (-0.8422726, 'b60770713')]
u: u70770448, items: [(1.6494749, 'b07327816'), (0.9818396, 'b77746740'), (0.8690423, 'b21349423'), (0.6113306, 'b92959743')]
u: u64714864, items: [(2.0634742, 'b52217488'), (0.6568857, 'b32857815'), (-0.20397699, 'b06618138'), (-1.0911531, 'b48279541')]
u: u78647159, items: [(1.0642279, 'b63721105'), (0.024226338, 'b00654647'), (-0.33912992, 'b05439735'), (-0.74795514, 'b68809234')]
u: u43398119, items: [(0.9899411, 'b51839247'), (0.93687814, 'b08135061')]
u: u93156409, items: [(1.2602394, 'b24274613'), (-0.6965865, 'b24931555')]
u: u85724496, items: [(2.3662302, 'b39903678'), (0.57600373, 'b15357546')]
u: u72905804, items: [(1.0721098, 'b78359727'), (-0.3074209, 'b31964420')]
u: u61280144, items: [(2.8069437, 'b93997659'), (0.7869042, 'b05871767')]


In [23]:
# Make prediction
y_pred_test = []
pred_data = []
read_cnt_test = 0
unread_cnt_test = 0
for u, b in test_dataset:
    len_before = len(y_pred_test)
    fst_half = len(itemsScorePerUser_test[u])//2
    if fst_half == 0 and read_cnt_test <= unread_cnt_test:
        y_pred_test.append(1)
        pred_data.append((u, b, 1))
        # predictions.write(u + ',' + b + ",1\n")
    elif fst_half == 0 and read_cnt_test > unread_cnt_test:
        y_pred_test.append(0)
        pred_data.append((u, b, 0))
        # predictions.write(u + ',' + b + ",0\n")
    else:
        for sb in itemsScorePerUser_test[u][:fst_half]:
            if b in sb:
                y_pred_test.append(1)
                pred_data.append((u, b, 1))
                # predictions.write(u + ',' + b + ",1\n")
                read_cnt_test += 1
                break
        if len_before == len(y_pred_test):
            y_pred_test.append(0)
            pred_data.append((u, b, 0))
            # predictions.write(u + ',' + b + ",0\n")
            unread_cnt_test += 1


In [24]:
# test that there is 50% read and 50% unread predictions
print(len(y_pred_test))
print(sum(y_pred_test))
print(pred_data[19999])

20000
10000
('u06592677', 'b27702770', 1)


In [37]:
# Write the header followed by one "user,book,label" row per prediction
with open("predictions_Read.csv", 'w') as prediction_file:
    rows = ["userID,bookID,prediction\n"]
    rows.extend(u + ',' + b + ',' + str(p) + "\n" for u, b, p in pred_data)
    prediction_file.writelines(rows)


## Problem 2: Predict Category

In [26]:
# Gather all data
data = []

for d in readGz("train_Category.json.gz"):
    data.append(d)

print(data[0])

# Split training and vlaidation data
category_all_data = [d for d in data]
category_train_data = category_all_data[:90000]
category_valid_data = category_all_data[90000:]
# stemmer = PorterStemmer()
punct = string.punctuation

wordCount = defaultdict(int)
for d in category_all_data:
    rev: string = d['review_text']
    rev = rev.lower()                           # lowercase
    rev = [c for c in rev if not (c in punct)]  # remove punctuation (char)
    rev = ''.join(rev)
    words = rev.strip().split()
    for w in words:
        wordCount[w] += 1

# sort word by frequency
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

# get dictionary
words_dict = [x[1] for x in counts[:10000]]
words_dictID = dict(zip(words_dict, range(len(words_dict))))


{'user_id': 'u75242413', 'review_id': 'r45843137', 'rating': 4, 'review_text': "a clever book with a deeply troubling premise and an intriguing protagonist. Thompson's clean, sparse prose style kept each page feeling light even as some rather heavy existential questions dropped upon them. I enjoyed it. \n and that cover design is boom-pow gorgeous.", 'n_votes': 1, 'genre': 'mystery_thriller_crime', 'genreID': 3}


In [27]:
# get document frequency using training data set
df = defaultdict(int)
for d in category_all_data:
    rev: string = d['review_text']
    rev = rev.lower()                           # lowercase
    rev = [c for c in rev if not (c in punct)]  # remove punctuation (char)
    rev = ''.join(rev)
    words = rev.strip().split()
    for w in set(words):
        df[w] += 1


In [28]:
# Turn one review into a fixed-length vector over the dictionary words
def feature(data):
    """Build the feature vector of a single review.

    The review text is lowercased and stripped of punctuation. Every
    dictionary word that occurs in it gets the value
    ``log2(len(category_all_data) / df[word])`` at its dictionary position;
    all other positions stay 0. The number of occurrences within the review
    does not change the value (presence only, weighted by idf).

    Relies on the module-level ``words_dict``, ``words_dictID``, ``punct``,
    ``df`` and ``category_all_data``.

    Args:
        data: mapping with a ``'review_text'`` string entry.

    Returns:
        A list of ``len(words_dict)`` numbers.
    """
    tfidf_vector = [0]*len(words_dict)
    text = data['review_text']
    text = text.lower()                                 # lowercase
    text = ''.join(c for c in text if c not in punct)   # remove punctuation (char)
    n_docs = len(category_all_data)
    # Each distinct word is handled once; repeats would write the same value.
    for w in set(text.strip().split()):
        # dict lookup instead of scanning the words_dict list for every word
        idx = words_dictID.get(w)
        if idx is not None:
            tfidf_vector[idx] = math.log2(n_docs/df[w])
    return tfidf_vector


In [29]:
# Feature matrix and genre labels for every labelled review
X = list(map(feature, category_all_data))
y = [review['genreID'] for review in category_all_data]


In [30]:
# train model
test_model = linear_model.LogisticRegression(C=0.001) # 0.743 with 0.001
test_model.fit(X,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
# review_id -> full review record for every test review
reviewID_dict = {
    review['review_id']: review
    for review in readGz("test_Category.json.gz")
}


In [32]:
# Predict a genre for every (user, review) pair and write one row per pair.
# Both files are managed by context managers so the input handle is closed too.
with open("pairs_Category.csv") as pairs:
    with open("predictions_Category.csv", 'w') as predictions:
        for l in pairs:
            if l.startswith("userID"):
                # copy the header row through unchanged
                predictions.write(l)
                continue
            u, r = l.strip().split(',')
            x_test = [feature(reviewID_dict[r])]
            ypred = test_model.predict(x_test)
            predictions.write(u + ',' + r + "," + str(ypred[0]) + "\n")
