In [39]:
import matplotlib.pyplot as plt
import csv
import numpy as np
import random
import heapq
import math
from collections import defaultdict
from sklearn import linear_model
from sklearn.svm import SVC

In [2]:
path = "/home/cui/Projects/PycharmProjects/CSE-158/data/train_Interactions.csv"
file = open(path, 'rt')

In [3]:
header = file.readline()
header = header.strip().split(',')

In [4]:
dataset = []

In [5]:
for line in file:
    fields = line.strip().split(',')
    d = dict(zip(header, fields))
    d['rating'] = int(d['rating'])
    dataset.append(d)

In [6]:
dataset[0]

{'userID': 'u79354815', 'bookID': 'b14275065', 'rating': 4}

In [7]:
data = [[d['userID'], d['bookID'], d['rating']] for d in dataset]

In [8]:
split = 190000
trainingSet = data[:split]
validationSet = [[d[0], d[1], 1] for d in data[split:]]

In [9]:
validationSet[0]

['u35176258', 'b30592470', 1]

In [10]:
usersPerBook = defaultdict(set)
booksPerUser = defaultdict(set)
booksRatingPerUser = defaultdict(dict)
bookSets = set()

# userIndex[user] = 0
userIndex = defaultdict(int)
# indexUser[index] = user
indexUser = defaultdict(int)
# bookIndex[user] = 0
bookIndex = defaultdict(int)
# indexBook[index] = book
indexBook = defaultdict(int)

In [11]:
for d in trainingSet:
    user, book, rating = d[0], d[1], d[2]
    usersPerBook[book].add(user)
    booksPerUser[user].add(book)
    booksRatingPerUser[user][book] = rating
    bookSets.add(book)

In [12]:
index = 0
for user in booksPerUser:
    userIndex[user] = index
    indexUser[index] = user
    index += 1

index = 0
for book in usersPerBook:
    bookIndex[book] = index
    indexBook[index] = book
    index += 1

In [13]:
bookAvgRating = defaultdict(int)
bookRatedCount = defaultdict(int)
avgRatingPerUser = defaultdict(int)

for d in trainingSet:
    user, book, rating = d[0], d[1], d[2]
    bookAvgRating[book] += rating
    avgRatingPerUser[user] += rating
    bookRatedCount[book] += 1

for book in bookAvgRating:
    bookAvgRating[book] = bookAvgRating[book] / bookRatedCount[book]

for user in avgRatingPerUser:
    avgRatingPerUser[user] = avgRatingPerUser[user] / len(booksPerUser[user])



In [14]:
avgRating = 0
for b in bookAvgRating:
    avgRating += bookAvgRating[b]
avgRating = avgRating / len(bookAvgRating)

In [15]:
avgRating

3.813463732600103

In [16]:
bookAvgRating['b14275065']

3.8

In [17]:
len(bookSets)

7169

In [18]:
len(booksPerUser)

11357

In [19]:
# create the 0 data for validation set
valid_user = [d[0] for d in validationSet]

for user in valid_user:
    booksNotReadSet = bookSets - booksPerUser.get(user)
    book = random.choice(list(booksNotReadSet))
    validationSet.append([user, book, 0])

# random.shuffle(validationSet)

In [20]:
len(validationSet)

20000

In [21]:
def Pearson(u1, u2):
    s1, s2 = booksPerUser[u1], booksPerUser[u2]
    items = s1.intersection(s2)
    if len(items) == 0:
        return 0
    
    numer, denoml, denomr = 0, 0, 0
    
    for book in items:
        r_u1, r_u1_avg = booksRatingPerUser[u1][book], avgRatingPerUser[u1]
        r_u2, r_u2_avg = booksRatingPerUser[u2][book], avgRatingPerUser[u2]
        
        numer += (r_u1 - r_u1_avg) * (r_u2 - r_u2_avg)
        denoml += (r_u1 - r_u1_avg) ** 2
        denomr += (r_u2 - r_u2_avg) ** 2

    denom = np.sqrt(denoml) * np.sqrt(denomr)
    
    return numer / denom

In [22]:
def Jaccard(u1, u2):
    s1, s2 = booksPerUser[u1], booksPerUser[u2]
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    
    return numer / denom

In [23]:
def Jaccard_book(b1, b2):
    s1, s2 = usersPerBook[b1], usersPerBook[b2]
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    
    return numer / denom

In [25]:
num = len(booksPerUser)
num_book = len(usersPerBook)
similarityMatrix_user = np.zeros((num, num))
similarityMatrix_book = np.zeros((num_book, num_book))

for user1 in booksPerUser:
    user1_index = userIndex[user1]
    
    for user2 in booksPerUser:
        user2_index = userIndex[user2]
        
        if user2_index > user1_index:
#             similarity = Pearson(user1, user2)
            similarity = Jaccard(user1, user2)
            similarityMatrix_user[user1_index][user2_index] = similarity
            similarityMatrix_user[user2_index][user1_index] = similarity

for book1 in usersPerBook:
    book1_index = bookIndex[book1]
    
    for book2 in usersPerBook:
        book2_index = bookIndex[book2]
        
        if book2_index > book1_index:
            similarity = Jaccard_book(book1, book2)
            similarityMatrix_book[book1_index][book2_index] = similarity
            similarityMatrix_book[book2_index][book1_index] = similarity

In [26]:
def getMostPopularBooks(threshold):
    bookCount = defaultdict(int)
    totalRead = 0
    threshold = threshold

    for d in trainingSet:
        bookCount[d[1]] += 1
        totalRead += 1

    mostPopular = [(bookCount[x], x) for x in bookCount]
    mostPopular.sort()
    mostPopular.reverse()

    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalRead * threshold: break
            
    return return1

In [35]:
def avgSimilarity(predictUser, book):
    users = usersPerBook[book]
    count = 0
    avgSimilarity = 0
    for user in users:
        avgSimilarity += similarityMatrix_user[userIndex[predictUser]][userIndex[user]]
        count += 1
    
    if count == 0:
        return 0
    
    avgSimilarity = avgSimilarity / count
    
    if math.isnan(avgSimilarity):
        return 0

    return avgSimilarity

In [36]:
bookPopularity = defaultdict(int)
userActivity = defaultdict(int)
for book in usersPerBook:
    bookPopularity[book] = len(usersPerBook[book])
for user in booksPerUser:
    userActivity[user] = len(booksPerUser[user])

maxPopu = max([bookPopularity[b] for b in bookPopularity])
maxActi = max([userActivity[u] for u in userActivity])
    
for book in bookPopularity:
    bookPopularity[book] = bookPopularity[book] / maxPopu
for user in userActivity:
    userActivity[user] = userActivity[user] / maxActi

In [37]:
def feature(datum):
    predictUser, predictBook = datum[0], datum[1]
    feat = []
    
    # average rating of book
    feat.append(bookAvgRating[predictBook] - avgRating)
    
    feat.append(bookPopularity[predictBook])
    
    feat.append(userActivity[predictUser])
    
    booksRatedList = booksRatingPerUser[predictUser]
    
    maxRating = max([b for b in booksRatedList])
    minRating = min([b for b in booksRatedList])
    
    tempSimi = 0
    count = 0
    for b in booksRatedList:
        if booksRatedList[b] == maxRating:
            tempSimi += similarityMatrix_book[bookIndex[predictBook]][bookIndex[b]]
            count += 1
    if count == 0:
        feat.append(0)
    else:
        feat.append(tempSimi / count)
    
    tempSimi = 0
    count = 0
    for b in booksRatedList:
        if booksRatedList[b] == minRating:
            tempSimi += similarityMatrix_book[bookIndex[predictBook]][bookIndex[b]]
            count += 1
    if count == 0:
        feat.append(0)
    else:
        feat.append(tempSimi / count)
    
    feat.append(avgSimilarity(predictUser, predictBook))

    return feat

In [48]:
from sklearn.ensemble import RandomForestClassifier

In [49]:
X = [feature(d) for d in validationSet]
y = [d[2] for d in validationSet]
        
X_train = X[:18000]
y_train = y[:18000]
X_test = X[18000:]
y_test = y[18000:]
        
# model = linear_model.LogisticRegression(solver='lbfgs')
# model = SVC(gamma='auto')
model = RandomForestClassifier(max_depth=2, random_state=0)
# model.fit(X_train, y_train)
model.fit(X, y)

prediction_train = model.predict(X_train)
predict_train = prediction_train == y_train
accuracy_train = sum(predict_train) / len(predict_train)
        
prediction_test = model.predict(X_test)
predict_test = prediction_test == y_test
accuracy_test = sum(predict_test) / len(predict_test)

print(accuracy_train)
print(accuracy_test)

0.6412777777777777
0.7715




In [51]:
print(model.feature_importances_)

[0.15657528 0.56116188 0.00199116 0.         0.         0.28027168]


In [44]:
predictions = open("/home/cui/Projects/PycharmProjects/CSE-158/data/predictions_Read.txt", 'w')

for l in open("/home/cui/Projects/PycharmProjects/CSE-158/data/pairs_Read.txt"):
    if l.startswith("userID"):
        #header
        predictions.write(l)
        continue
        
    u,b = l.strip().split('-')

    predict = model.predict([feature([u, b])])
    if predict == 1:
        predictions.write(u + '-' + b + ",1\n")
    else:
        predictions.write(u + '-' + b + ",0\n")
        
predictions.close()