In [1]:
import gzip
from collections import defaultdict
import csv
from tqdm import tqdm
from tqdm.notebook import trange, tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
import random
import numpy as np
import scipy as sp
import math
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import pdist, squareform

In [2]:
path = "Downloads/assignment1/train_Interactions.csv.gz"
f = gzip.open(path, 'rt')
header = f.readline()
header = header.strip().split(',')
header

['userID', 'bookID', 'rating']

In [3]:
dataset = []
for line in f:
    fields = line.strip().split(',')
    d = dict(zip(header, fields))
    d['rating'] = int(d['rating'])
    dataset.append(d)

In [4]:
dataset[0]

{'userID': 'u79354815', 'bookID': 'b14275065', 'rating': 4}

#### Parsing Data

In [5]:
usersPerBook = defaultdict(set)
BooksPerUser = defaultdict(set)
books = set()
for d in dataset:
    u,b = d['userID'], d['bookID']
    usersPerBook[b].add(u)
    BooksPerUser[u].add(b)
    if (b not in books):
        books.add(b)

In [6]:
X_train = dataset[:190000]
X_valid = dataset[190000:]

In [7]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        yield l.strip().split(',')
    
bookCount = defaultdict(int)
totalRead = 0

for user,book,_ in readCSV("Downloads/assignment1/train_Interactions.csv.gz"):
    bookCount[book] += 1
    totalRead += 1

mostPopular = [(bookCount[x], x) for x in bookCount]
mostPopular.sort()
mostPopular.reverse()

In [8]:
def popularity(book):
    count = 0
    for b in mostPopular:
        if b[1] != book:
            count = count + b[0]
        else:
            count = count + b[0]
            break
    #print(count/totalRead)
    return (count/totalRead)

In [9]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom > 0:
        return numer/denom
    return 0

In [10]:
def Jaccard_score(u, b):
    similarities = []
    users = set(usersPerBook[b])
    for b2 in BooksPerUser[u]:
        sim = Jaccard(users,set(usersPerBook[b2]))
        if sim > 0:
            similarities.append(sim)
    similarities.sort()
    if len(similarities) == 0: return 0
    average = np.mean(similarities)
    return average

In [11]:
def Jaccard_score_book(u, b):
    similarities = []
    uprime = usersPerBook[b]
    for i in uprime:
        user = BooksPerUser[i]
        candidateItems = u
        sim = Jaccard(user, BooksPerUser[candidateItems])
        if sim > 0:
            similarities.append(sim)
    similarities.sort()
    #print(len(similarities))
    if len(similarities) == 0: return 0
    return max(similarities)

In [12]:
def cos_sim(a,b):
    numer = len(a.intersection(b))
    denom = len(a) * len(b)
    if denom > 0:
        return numer/denom
    return 0

In [13]:
def cos_score(u,b):
    similarities = []
    uprime = usersPerBook[b]
    for i in uprime:
        book = BooksPerUser[i]
        candidateItems = u
        sim = cos_sim(book, BooksPerUser[candidateItems])
        if sim > 0:
            similarities.append(sim)
    similarities.sort()
    #print(len(similarities))
    if len(similarities) == 0: return 0
    return max(similarities)

In [14]:
test_data = []
for l in open("Downloads/assignment1/pairs_Read.txt"):
    if l.startswith("userID"):
        continue
    u,b = l.strip().split('-')
    test_data.append((u,b))

In [15]:
BooksPerUserTest = defaultdict(set)
for d in test_data:
    u,b = d[0], d[1]
    BooksPerUserTest[u].add(b)

In [16]:
BooksPerUserScore = defaultdict(set)
for u in BooksPerUserTest:
    for b in BooksPerUserTest[u]:
        pop = popularity(b)
        jac = Jaccard_score(u,b)
        cos = cos_score(u,b)
        #jacc2 = Jaccard_score_book(u,b)
        dp = (jac * cos) / pop
        BooksPerUserScore[u].add((dp, b))

In [17]:
rank = []
for u in BooksPerUserScore:
    BooksPerUserScore[u] = sorted(BooksPerUserScore[u], reverse = True)
    length = math.ceil(len(BooksPerUserScore[u])/2)
    count = 0
    for pair in BooksPerUserScore[u]:
        if count != length:
            rank.append((u,pair[1]))
            count = count + 1
        else:
            break

In [22]:
predictions = open("Downloads/assignment1/predictions_Read.txt", 'w')
for l in open("Downloads/assignment1/pairs_Read.txt"):
    if l.startswith("userID"):
    #header
        predictions.write(l)
        continue
    u,b = l.strip().split('-')
    if (u,b) in rank:
        predictions.write(u + '-' + b + ",1\n")
    else:
        predictions.write(u + '-' + b + ",0\n")
        
predictions.close()

Validation prediction

In [26]:
import random

neg = []

for i in X_valid:
    r = {'userID': 0, 'bookID': 0, 'read': 0}
    r['userID'] = i['userID']
    book = random.sample(books,1)[0]
    while (book in BooksPerUser[r['userID']]):
        book = random.sample(books,1)[0]
    r['bookID'] = book
    neg.append(r)

In [27]:
pos = [{'userID': i['userID'], 'bookID': i['bookID'], 'read':1} for i in X_valid]
mix = pos + neg

In [28]:
usersPerBook = defaultdict(set)
BooksPerUser = defaultdict(set)
books = set()
for d in X_train:
    u,b = d['userID'], d['bookID']
    usersPerBook[b].add(u)
    BooksPerUser[u].add(b)
    if (b not in books):
        books.add(b)

In [29]:
BooksPerUserValid = defaultdict(set)
for d in mix:
    u,b = d['userID'], d['bookID']
    BooksPerUserValid[u].add(b)

In [30]:
BooksPerUserValidScore = defaultdict(set)
for u in tqdm(BooksPerUserValid):
    for b in BooksPerUserValid[u]:
        pop = popularity(b)
        jac = Jaccard_score(u,b)
        cos = cos_score(u,b)
        #jacc2 = Jaccard_score_book(u,b)
        dp = (jac * cos) / pop
        BooksPerUserValidScore[u].add((dp, b))

HBox(children=(IntProgress(value=0, max=6288), HTML(value='')))




In [31]:
rank_valid = []
for u in BooksPerUserValidScore:
    BooksPerUserValidScore[u] = sorted(BooksPerUserValidScore[u], reverse = True)
    length = math.ceil(len(BooksPerUserValidScore[u])/2)
    count = 0
    for pair in BooksPerUserValidScore[u]:
        if count != length:
            rank_valid.append((u,pair[1]))
            count = count + 1
        else:
            break

In [32]:
validateSet = []
trueSet = []

for i in mix:
    if ((i['userID'], i['bookID']) in rank_valid):
        validateSet.append(True)
    else:
        validateSet.append(False)
        
    if (i['read'] == 1):
        trueSet.append(True)
    else:
        trueSet.append(False)

In [33]:
zipping = list(zip(validateSet, trueSet))
acc = [i[0] == i[1] for i in zipping]
accuracy = sum(acc)/len(acc)
print("accuracy = " + str(accuracy))

accuracy = 0.7079


On whole data

In [None]:
usersPerBook = defaultdict(set)
BooksPerUser = defaultdict(set)
books = set()
for d in dataset:
    u,b = d['userID'], d['bookID']
    usersPerBook[b].add(u)
    BooksPerUser[u].add(b)
    if (b not in books):
        books.add(b)

In [260]:
import random

neg = []

for i in tqdm(dataset):
    r = {'userID': 0, 'bookID': 0, 'read': 0}
    r['userID'] = i['userID']
    book = random.sample(books,1)[0]
    while (book in BooksPerUser[r['userID']]):
        book = random.sample(books,1)[0]
    r['bookID'] = book
    neg.append(r)

HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))




In [261]:
pos = [{'userID': i['userID'], 'bookID': i['bookID'], 'read':1} for i in tqdm(dataset)]
mix = pos + neg

HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))




In [262]:
BooksPerUserScore = defaultdict(set)
for u in tqdm(BooksPerUser):
    for b in BooksPerUser[u]:
        pop = popularity(b)
        jac = Jaccard_score(u,b)
        cos = cos_score(u,b)
        #jacc2 = Jaccard_score_book(u,b)
        dp = (jac * cos) * pop
        BooksPerUserScore[u].add((dp, b))

HBox(children=(IntProgress(value=0, max=11357), HTML(value='')))




In [251]:
rank_valid = []
for u in BooksPerUserScore:
    BooksPerUserScore[u] = sorted(BooksPerUserScore[u], reverse = True)
    length = math.ceil(len(BooksPerUserScore[u])/2)
    count = 0
    for pair in BooksPerUserScore[u]:
        if count != length:
            rank_valid.append((u,pair[1]))
            count = count + 1
        else:
            break

In [254]:
validateSet = []
trueSet = []

for i in tqdm(mix):
    if ((i['userID'], i['bookID']) in rank_valid):
        validateSet.append(True)
    else:
        validateSet.append(False)
        
    if (i['read'] == 1):
        trueSet.append(True)
    else:
        trueSet.append(False)

HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))




In [255]:
zipping = list(zip(validateSet, trueSet))
acc = [i[0] == i[1] for i in zipping]
accuracy = sum(acc)/len(acc)
print("accuracy = " + str(accuracy))

accuracy = 0.7570675
