# CSE 258 Assignment 1: Read Prediction
**Ming Ki Toby Cheng**

In [1]:
import gzip
from collections import defaultdict
import scipy
import scipy.optimize
from sklearn import linear_model
import numpy
import random
import matplotlib.pyplot as plt

In [2]:
def readGz(path):
    """Yield one Python object per line of the gzip-compressed text file at ``path``.

    Each line is evaluated as a Python expression, so the file must come
    from a trusted source.
    """
    # The context manager closes the handle once the generator is exhausted
    # or discarded, instead of leaving that to the garbage collector.
    with gzip.open(path, "rt") as f:
        for l in f:
            # SECURITY: eval() executes whatever code the line contains.
            # If every line is a plain literal, ast.literal_eval would be the
            # safe replacement -- TODO confirm against the data format.
            yield eval(l)


def readCSV(path):
    """Yield each data row of a gzip-compressed CSV file as a list of strings.

    The first line is treated as a header and skipped. Rows are split on
    every comma; quoted fields are not interpreted.
    """
    # The context manager closes the handle once the generator is exhausted
    # or discarded, instead of leaving that to the garbage collector.
    with gzip.open(path, "rt") as f:
        f.readline()  # discard the header row
        for l in f:
            yield l.strip().split(",")

In [3]:
# Parallel columns of the interactions file: the i-th entry of each list
# comes from the same row.
users, books, ratings = [], [], []

In [4]:
# Load every (user, book, rating) row of the training interactions.
# The lists are emptied first so that re-running this cell does not append a
# second copy of the file.
users.clear()
books.clear()
ratings.clear()
# The rating column gets a real name: rebinding `_` in a notebook interferes
# with IPython's "last output" variable.
for user, book, rating in readCSV("train_Interactions.csv.gz"):
    users.append(user)
    books.append(book)
    ratings.append(rating)  # kept as the raw string read from the file

In [5]:
# Splitting data into training and validation
# Rows before TRAIN_SIZE form the training split; every remaining row is
# held out for validation. The boundary is named once instead of being
# repeated in each slice.
TRAIN_SIZE = 190000

users_train, users_valid = users[:TRAIN_SIZE], users[TRAIN_SIZE:]
books_train, books_valid = books[:TRAIN_SIZE], books[TRAIN_SIZE:]
ratings_train, ratings_valid = ratings[:TRAIN_SIZE], ratings[TRAIN_SIZE:]

# (user, book) pairs of the training split.
training_set = list(zip(users_train, books_train))

In [6]:
# For every user, the set of books they appear with anywhere in the
# interactions file, and the set of all book IDs seen in that file.
allBooks = set()
userCount = defaultdict(set)

for reader, title, _rating in readCSV("train_Interactions.csv.gz"):
    allBooks.add(title)
    userCount[reader].add(title)

In [7]:
# Inverted indexes over the training pairs:
#   usersPerBook[b] -> users paired with book b
#   booksPerUser[u] -> books paired with user u
usersPerBook, booksPerUser = defaultdict(set), defaultdict(set)
for pair in training_set:
    reader, title = pair
    booksPerUser[reader].add(title)
    usersPerBook[title].add(reader)

In [8]:
# Independent copies of the validation columns that later cells extend with
# extra entries; the originals stay untouched. Every held-out pair carries
# label 1.
read_valid = [1 for _pair in users_valid]
users_valid_new = list(users_valid)
books_valid_new = list(books_valid)
read_valid_new = list(read_valid)

In [9]:
# Generating negative entries for validations randomly
random.seed(1583)

# Book IDs are strings, so the iteration order of a set of them can change
# from one interpreter session to the next (string hashing is randomised).
# Drawing from a sorted list lets the seed above reproduce the same sample.
all_books_sorted = sorted(allBooks)

# The loop variable is deliberately not called `users`: that would overwrite
# the module-level list of user IDs.
for user in users_valid:
    already_read = userCount[user]
    unread_books_list = [b for b in all_books_sorted if b not in already_read]
    # One book the user never interacted with becomes a label-0 example.
    users_valid_new.append(user)
    books_valid_new.append(random.choice(unread_books_list))
    read_valid_new.append(0)

In [10]:
# Bundle the validation columns into (user, book, label) triples and put
# them in a fixed pseudo-random order.
validation_set = [
    triple for triple in zip(users_valid_new, books_valid_new, read_valid_new)
]
# Shuffling validation data
random.seed(1234)
random.shuffle(validation_set)

In [11]:
# Indexes over the validation triples, plus the true label of every pair.
usersPerBook_val = defaultdict(set)
booksPerUser_val = defaultdict(set)
readPerPair_true = defaultdict(int)
for triple in validation_set:
    reader, title, label = triple
    usersPerBook_val[title].add(reader)
    booksPerUser_val[reader].add(title)
    readPerPair_true[(reader, title)] = label

In [12]:
# Number of training pairs each book appears in, and the overall total.
bookCount = defaultdict(int)
for _reader, title in training_set:
    bookCount[title] = bookCount[title] + 1

# One increment per training pair is simply the number of pairs.
totalRead = len(training_set)

In [13]:
# (count, book) pairs ordered from the most to the least frequent book;
# books with equal counts are ordered by book ID, also descending.
mostPopular = sorted(
    ((count, title) for title, count in bookCount.items()),
    reverse=True,
)

In [14]:
# Defining Jaccard similarity function
def Jaccard(s1, s2):
    """Return the Jaccard similarity of two sets: |s1 & s2| / |s1 | s2|.

    Two empty sets have an empty union; their similarity is defined as 0.0
    here instead of raising ZeroDivisionError.
    """
    denom = len(s1.union(s2))
    if denom == 0:
        return 0.0
    numer = len(s1.intersection(s2))
    return numer / denom

In [27]:
def predictionJaccard(u, b):
    """Score how similar book ``b`` is to the books user ``u`` has in training.

    The score is the mean Jaccard similarity between the users of ``b`` and
    the users of each book in ``booksPerUser[u]``.

    Returns 0.0 when ``u`` has no training books: the mean of an empty list
    would be NaN, which cannot be ranked reliably against other scores.
    """
    # Both lookups go through defaultdicts, so an unseen user or book reads
    # as an empty set.
    candidate_users = usersPerBook[b]
    similarities = [
        Jaccard(candidate_users, usersPerBook[book]) for book in booksPerUser[u]
    ]
    if not similarities:
        return 0.0
    # Sorting fixes the order in which the values are summed, so the result
    # does not depend on the iteration order of the user's book set.
    similarities.sort(reverse=True)
    return numpy.mean(similarities)

In [28]:
# Score every validation pair: mean Jaccard similarity weighted by the
# book's training count. jaccard_user_val[user][book] -> score.
jaccard_user_val = defaultdict(dict)
# Loop names are chosen so the module-level `books` list is not overwritten.
for user, candidate_books in booksPerUser_val.items():
    for book in candidate_books:
        jaccard_user_val[user][book] = predictionJaccard(user, book) * bookCount[book]

In [29]:
# Rank each user's candidate books by ascending score, replacing the
# {book: score} dict with a list of (book, score) pairs.
for user in jaccard_user_val:
    scores = jaccard_user_val[user]
    # After a first run the value is already a list of pairs; accepting both
    # shapes lets the cell be re-run without an AttributeError.
    pairs = scores.items() if isinstance(scores, dict) else scores
    jaccard_user_val[user] = sorted(pairs, key=lambda pair: pair[1])

In [30]:
# Predicted label for each (user, book) validation pair; a pair that was never
# assigned reads back as 0 because of the defaultdict(int) default.
readPerPair_predict = defaultdict(int)

In [31]:
# Per user, the lower-scoring half of the ranked candidates is labelled 0
# and the rest 1 (with an odd count the extra pair goes to label 1).
for reader, ranked in jaccard_user_val.items():
    cutoff = len(ranked) // 2
    for label, half in ((1, ranked[cutoff:]), (0, ranked[:cutoff])):
        for entry in half:
            readPerPair_predict[reader, entry[0]] = label

In [32]:
# Predictions, accuracy and BER based on combination of new popularity and Jaccard functions
predictions = []
y_valid = []

# Loop names are singular on purpose: `users` and `books` are the
# module-level lists of loaded IDs and must not be overwritten here.
for user, book, label in validation_set:
    y_valid.append(label)
    predictions.append(readPerPair_predict[user, book])

# Confusion-matrix counts (p = predicted label, l = true label).
TP_valid = sum(1 for p, l in zip(predictions, y_valid) if p and l)
FP_valid = sum(1 for p, l in zip(predictions, y_valid) if p and not l)
TN_valid = sum(1 for p, l in zip(predictions, y_valid) if not p and not l)
FN_valid = sum(1 for p, l in zip(predictions, y_valid) if not p and l)
accu_valid = (TP_valid + TN_valid) / (TP_valid + FP_valid + TN_valid + FN_valid)

# Balanced error rate: one minus the mean of the true-positive and
# true-negative rates.
TPR_valid = TP_valid / (TP_valid + FN_valid)
TNR_valid = TN_valid / (TN_valid + FP_valid)
BER_valid = 1 - 0.5 * (TPR_valid + TNR_valid)

## Accuracy and BER of Model on Set
print("Accuracy on validation set:", accu_valid)
print("BER on validation set:", BER_valid)

Accuracy on validation set: 0.718
BER on validation set: 0.28200000000000003


In [33]:
# Defining sets for users and books
usersPerBook_test = defaultdict(set)
booksPerUser_test = defaultdict(set)
readPerPair_test = defaultdict(int)
# `with` closes the pairs file as soon as it has been read.
with open("pairs_Read.txt") as pairs_file:
    for l in pairs_file:
        if l.startswith("userID"):
            # header
            continue
        u, b = l.strip().split("-")
        usersPerBook_test[b].add(u)
        booksPerUser_test[u].add(b)

# Score every test pair: mean Jaccard similarity weighted by the book's
# training count. jaccard_user_test[user][book] -> score.
jaccard_user_test = defaultdict(dict)
# Loop names are chosen so the module-level `books` list is not overwritten.
for user, candidate_books in booksPerUser_test.items():
    for book in candidate_books:
        jaccard_user_test[user][book] = predictionJaccard(user, book) * bookCount[book]

In [34]:
# Rank each user's candidate books by ascending score, replacing the
# {book: score} dict with a list of (book, score) pairs.
for user in jaccard_user_test:
    scores = jaccard_user_test[user]
    # After a first run the value is already a list of pairs; accepting both
    # shapes lets the cell be re-run without an AttributeError.
    pairs = scores.items() if isinstance(scores, dict) else scores
    jaccard_user_test[user] = sorted(pairs, key=lambda pair: pair[1])

In [35]:
# Predicted label for every (user, book) test pair. Per user, the
# lower-scoring half of the ranked candidates is labelled 0 and the rest 1
# (with an odd count the extra pair goes to label 1).
readPerPair_test = defaultdict(int)
for reader, ranked in jaccard_user_test.items():
    cutoff = len(ranked) // 2
    for label, half in ((1, ranked[cutoff:]), (0, ranked[:cutoff])):
        for entry in half:
            readPerPair_test[reader, entry[0]] = label

In [36]:
# Writing predictions of test set to file
# Both files are opened in one `with` so they are closed (and the output is
# flushed) even if a line fails to parse.
with open("pairs_Read.txt") as pairs_file, open(
    "predictions_Read_Assignment1v3.2.txt", "w"
) as predictions_file:
    for l in pairs_file:
        if l.startswith("userID"):
            # header
            predictions_file.write(l)
            continue
        u, b = l.strip().split("-")
        # Anything other than an explicit 1 is written as 0.
        label = 1 if readPerPair_test[u, b] == 1 else 0
        predictions_file.write(u + "-" + b + "," + str(label) + "\n")

**Kaggle Username: tobycheng or Toby Cheng**

**Kaggle Read Accuracy: 0.72516**