In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model

In [2]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [3]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        u,b,r = l.strip().split(',')
        r = int(r)
        yield u,b,r

In [4]:
allRatings = []
for l in readCSV("train_Interactions.csv.gz"):
    allRatings.append(l)

In [205]:
totalRead = 195000
ratingsTrain = allRatings[:totalRead]
ratingsValid = allRatings[totalRead:]
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
popularity = defaultdict(int)
for u,b,r in ratingsTrain:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))
    popularity[b] += 1

In [206]:
userSet = set()
bookSet = set()
readSet = set()

for u,b,r in allRatings:
    userSet.add(u)
    bookSet.add(b)
    readSet.add((u,b))

lUserSet = list(userSet)
lBookSet = list(bookSet)

notRead = set()
for u,b,r in ratingsValid:
    #u = random.choice(lUserSet)
    b = random.choice(lBookSet)
    while (u,b) in readSet or (u,b) in notRead:
        b = random.choice(lBookSet)
    notRead.add((u,b))
# count = 0
# while count < 5000:
#     count += 1
#     u = random.choice(lUserSet)
#     b = random.choice(lBookSet)
#     while (u,b) in readSet or (u,b) in notRead:
#         b = random.choice(lBookSet)
#     notRead.add((u, b))

readValid = set()
for u,b,r in ratingsValid:
    readValid.add((u,b))

In [126]:
sortedBook = sorted(popularity.items(), key=lambda x: x[1], reverse = True)

In [127]:
def mostPop(threshold):
    mostpop = []
    curRead = 0
    for book in sortedBook:
        curRead += book[1]
        mostpop.append(book[0])
        if (curRead > totalRead * threshold):
            break
    return mostpop

In [128]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom > 0:
        return numer/denom
    return 0

In [129]:
def Cosine(s1, s2):
    s1 = list(s1)
    s2 = list(s2)
    if (len(s1) == 0 or len(s2) == 0):
        return 0
    nom = 0
    s1_book = [i[0] for i in s1]
    s2_book = [i[0] for i in s2]
    s1_rate = [i[1] for i in s1]
    s2_rate = [i[1] for i in s2]
    s1_avg = np.sum(s1_rate) / len(s1)
    s2_avg = np.sum(s2_rate) / len(s2)
    s1_rate = [i - s1_avg for i in s1_rate]
    s1_rate = [i - s2_avg for i in s2_rate]
    for b,r in s1:
        if b in s2_book:
            nom += r * s2_rate[s2_book == b]
    den = np.sqrt(np.sum(np.square(s1_rate))) * np.sqrt(np.sum(np.square(s2_rate)))
    if den > 0:
        return nom / den
    return 0

In [130]:
def maxSimCal(u, b):
    maxSim = 0
    users = set(ratingsPerItem[b])
    for b2,_ in ratingsPerUser[u]:
        sim = Jaccard(users,set(ratingsPerItem[b2]))
        if sim > maxSim:
            maxSim = sim
    return maxSim

In [131]:
def maxCosSimCal(u, b):
    maxCosSim = 0
    users = set(ratingsPerItem[b])
    for b2,_ in ratingsPerUser[u]:
        sim = Cosine(users,set(ratingsPerItem[b2]))
        if sim > maxCosSim:
            maxCosSim = sim
    return maxCosSim

In [132]:
correct = 0
p0, p1 = 0,0
for (label,sample) in [(1, readValid), (0, notRead)]:
    for (u,b) in sample:
        maxSim = maxSimCal(u, b)
        pred = 0
        if maxSim > 0.015 or len(ratingsPerItem[b]) > 40:
            pred = 1
            p1 += 1
        else:
            p0 += 1
        if pred == label:
            correct += 1

In [134]:
print(correct / 10000)

0.6587


In [207]:
validData = []
for u, b in readValid:
    feature = []
    popularity = len(ratingsPerItem[b]) / len(bookSet)
    maxSim = maxSimCal(u, b)
    maxCosSim = maxCosSimCal(u, b)
    feature = [1, popularity, maxSim, maxCosSim]
    validData.append([feature, 1])
    
for u, b in notRead:
    feature = []
    popularity = len(ratingsPerItem[b]) / len(bookSet)
    maxSim = maxSimCal(u, b)
    maxCosSim = maxCosSimCal(u, b)
    feature = [1, popularity, maxSim, maxCosSim]
    validData.append([feature, 0])   

In [208]:
random.shuffle(validData)
validX = []
validY = []

for data in validData:
    validX.append(data[0])
    validY.append(data[1])

In [223]:
clf = linear_model.LogisticRegression(C=400,solver='lbfgs').fit(validX, validY)

In [224]:
validscore = clf.score(validX, validY)
print(validscore)

0.6929


In [225]:
predictions = open("predictions_Read.txt", 'w')
for l in open("pairs_Read.txt"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,b = l.strip().split('-')
    maxSim = maxSimCal(u, b)
    pop = len(ratingsPerItem[b])
    maxCosSim = maxCosSimCal(u, b)
    feature = [1, pop / len(bookSet), maxSim, maxCosSim]
    pred = 0
    if clf.predict([feature])[0] == 1:
        pred = 1
    _ = predictions.write(u + '-' + b + ',' + str(pred) + '\n')

predictions.close()