In [2]:
import gzip
import math
import numpy
import random
import sklearn
import nltk
import string
from collections import defaultdict
from gensim.models import Word2Vec
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn.manifold import TSNE
import dateutil

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [5]:
dataset = []

f = gzip.open("young_adult_20000.json.gz")
for l in f:
    d = eval(l)
    dataset.append(d)
    if len(dataset) >= 20000:
        break
        
f.close()

In [6]:
answers = {}

In [7]:
### Question 1

In [8]:
data = dataset[:20000]
train = data[:10000]
test = data[10000:]

In [9]:
print(len(dataset))
print(len(train))
print(len(test))

20000
10000
10000


In [10]:
wordCount = defaultdict(int)
wordCount_combine = defaultdict(int)
wordCount_big = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
    ws = r.split()
    ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
    for w in ws:
        wordCount[w] += 1
    for w in ws2:
        wordCount_big[w] += 1
    for w in ws + ws2:
        wordCount_combine[w] += 1
print(len(wordCount))
print(len(wordCount_big))
print(len(wordCount_combine))

59814
528310
588124


In [11]:
d = data[0]
r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
ws = r.split()
ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]

In [12]:
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

counts_big = [(wordCount_big[w], w) for w in wordCount_big]
counts_big.sort()
counts_big.reverse()

counts_combine = [(wordCount_combine[w], w) for w in wordCount_combine]
counts_combine.sort()
counts_combine.reverse()

In [13]:
unigrams = [w[1] for w in counts[:1000]]
print(unigrams[:10])
unigramId = dict(zip(unigrams, range(len(unigrams))))
unigramSet = set(unigrams)

bigrams = [w[1] for w in counts_big[:1000]]
print(bigrams[:10])
bigramId = dict(zip(bigrams, range(len(bigrams))))
bigramSet = set(bigrams)

combined = [w[1] for w in counts_combine[:1000]]
print(combined[:50])
combinedId = dict(zip(combined, range(len(combined))))
combinedSet = set(combined)


['the', 'and', 'a', 'of', 'to', 'i', 'is', 'this', 'it', 'in']
['of the', 'in the', 'the story', 'and the', 'is a', 'to the', 'this is', 'to be', 'it was', 'with the']
['the', 'and', 'a', 'of', 'to', 'i', 'is', 'this', 'it', 'in', 'that', 'but', 'was', 'with', 'as', 'story', 'for', 'of the', 'its', 'on', 'are', 'not', 'have', 'you', 'so', 'be', 'one', 'book', 'read', 'in the', 'more', 'like', 'all', 'at', 'an', 'his', 'just', 'really', 'about', 'from', 'me', 'some', 'my', 'he', 'up', 'what', 'her', 'good', 'the story', 'by']


In [14]:
#Train a regressor
def feature_unigram(datum):
    feat = [0]*len(unigrams)
    r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
    ws = r.split()

    for w in ws :
        if w in unigrams:
            feat[unigramId[w]] += 1
    feat.append(1) #offset
    return feat

In [15]:
def feature_bigram(datum):
    feat = [0]*len(bigrams)
    r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
    ws = r.split()
    ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]

    for w in ws2 :
        if w in bigrams:
            feat[bigramId[w]] += 1
    feat.append(1) #offset
    return feat

In [16]:
def feature_combined(datum):
    feat = [0]*len(combined)
    r = ''.join([c for c in datum['review_text'].lower() if not c in punctuation])
    ws = r.split()
    ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]

    for w in ws + ws2 :
        if w in combined:
            feat[combinedId[w]] += 1
    feat.append(1) #offset
    return feat

In [17]:
# unigram
X_train = [feature_unigram(d) for d in train]
y_train = [d['rating'] for d in train]
print(X_train[0][:20])
X_test = [feature_unigram(d) for d in test]
y_test = [d['rating'] for d in test]

clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X_train, y_train)
theta = clf.coef_
pred_train = clf.predict(X_train)
MSEtrain = sum((y_train - pred_train)**2)/len(y_train)
pred_test = clf.predict(X_test)
MSEtest = sum((y_test - pred_test)**2)/len(y_test)


[20, 16, 18, 23, 21, 17, 8, 10, 13, 7, 13, 9, 11, 3, 8, 8, 6, 3, 4, 4]


In [18]:
print(MSEtest)
mostCommonUnigrams = []
mostCommonUnigrams.append(MSEtest)

wordSort = list(zip(theta, ws + ['_constant_']))
wordSort.sort()
wordSort.reverse()
mostCommonUnigrams.append(wordSort)

1.2390553477075859


In [19]:
# bigram
X_train = [feature_bigram(d) for d in train]
y_train = [d['rating'] for d in train]
X_test = [feature_bigram(d) for d in test]
y_test = [d['rating'] for d in test]


clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X_train, y_train)
theta = clf.coef_
pred_train = clf.predict(X_train)
MSEtrain = sum((y_train - pred_train)**2)/len(y_train)
pred_test = clf.predict(X_test)
MSEtest = sum((y_test - pred_test)**2)/len(y_test)


In [20]:
print(MSEtest)
mostCommonBigrams = []
mostCommonBigrams.append(MSEtest)

wordSort = list(zip(theta, ws + ['_constant_']))
wordSort.sort()
wordSort.reverse()
mostCommonBigrams.append(wordSort)

1.2930626118603759


In [21]:
# combined
X_train = [feature_combined(d) for d in train]
y_train = [d['rating'] for d in train]
X_test = [feature_combined(d) for d in test]
y_test = [d['rating'] for d in test]

clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X_train, y_train)
theta = clf.coef_
pred_train = clf.predict(X_train)
MSEtrain = sum((y_train - pred_train)**2)/len(y_train)
pred_test = clf.predict(X_test)
MSEtest = sum((y_test - pred_test)**2)/len(y_test)


In [22]:
print(MSEtest)
mostCommonBoth = []
mostCommonBoth.append(MSEtest)

wordSort = list(zip(theta, ws + ['_constant_']))
wordSort.sort()
wordSort.reverse()
mostCommonBoth.append(wordSort)

1.2366939869514826


In [23]:
for q,wList in ('Q1a', mostCommonUnigrams), ('Q1b', mostCommonBigrams), ('Q1c', mostCommonBoth):
    mse = wList[0]
    wordSort = wList[1]
    answers[q] = [float(mse), [x[1] for x in wordSort[:5]], [x[1] for x in wordSort[-5:]]]

In [24]:
for q in 'Q1a', 'Q1b', 'Q1c':
    assert len(answers[q]) == 3
    assertFloat(answers[q][0])
    assert [type(x) for x in answers[q][1]] == [str]*5
    assert [type(x) for x in answers[q][2]] == [str]*5

In [25]:
### Question 2

In [26]:
#unigram
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in dataset:
    r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

In [27]:
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [28]:
words = [x[1] for x in counts[:1000]]

In [29]:
#Document frequency (df)
df = defaultdict(int)
for d in train:
    r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
    for w in set(r.split()):
        df[w] += 1

In [30]:
#Term frequency (tf)
#first review
rev = train[0]
tf = defaultdict(int)
r = ''.join([c for c in rev['review_text'].lower() if not c in punctuation])
for w in r.split():
    if w in unigrams:
        tf[w] += 1

In [31]:
tfidf = dict(zip(words,[tf[w] * math.log2(len(dataset) / df[w]) for w in words]))
tfidfQuery = [tf[w] * math.log2(len(dataset) / df[w]) for w in words]

In [32]:
#Find the highest tf-idf words in our example review
maxTf = [(tf[w],w) for w in words]
maxTf.sort(reverse=True)
maxTfIdf = [(tfidf[w],w) for w in words]
maxTfIdf.sort(reverse=True)

In [33]:
maxTfIdf[:10]

[(61.79796532638507, 'vampires'),
 (37.55561038286728, 'of'),
 (36.11488238681364, 'to'),
 (33.161016694700045, 'i'),
 (28.90488135831419, 'was'),
 (27.82381514648952, 'the'),
 (27.31058918311369, 'a'),
 (26.820400074142356, 'that'),
 (26.575424759098897, 'cover'),
 (26.154425303967734, 'it')]

In [34]:
#Cosine similarity
def Cosine(x1,x2):
    numer = 0
    norm1 = 0
    norm2 = 0
    for a1,a2 in zip(x1,x2):
        numer += a1*a2
        norm1 += a1**2
        norm2 += a2**2
    if norm1*norm2:
        return numer / math.sqrt(norm1*norm2)
    return 0

In [35]:
#Find the other reviews in the corpus with the highest cosine similarity between tf-idf vectors
similarities = []
for rev2 in train:
    tf = defaultdict(int)
    r = ''.join([c for c in rev2['review_text'].lower() if not c in punctuation])
    for w in r.split():
        # Note = rather than +=
        tf[w] = 1
    tfidf2 = [tf[w] * math.log2(len(dataset) / df[w]) for w in words]
    similarities.append((Cosine(tfidfQuery, tfidf2), rev2['review_text']))
similarities.sort(reverse=True)

In [36]:
similarities[:10]

[(0.699974110076679,
  'Sherlock Holmes and the Vampires of London \n Release Date: April 2014 \n Publisher: Darkhorse Comics \n Story by: Sylvain Cordurie \n Art by: Laci \n Colors by: Axel Gonzabo \n Cover by: Jean Sebastien Rossbach \n ISDN: 9781616552664 \n MSRP: $17.99 Hardcover \n "Sherlock Holmes died fighting Professor Moriarty in the Reichenbach Falls. \n At least, that\'s what the press claims. \n However, Holmes is alive and well and taking advantage of his presumed death to travel the globe. \n Unfortunately, Holmes\'s plans are thwarted when a plague of vampirism haunts Britain. \n This book collects Sherlock Holmes and the Vampires of London Volumes 1 and 2, originally created by French publisher Soleil." - Darkhorse Comics \n When I received this copy of "Sherlock Holmes and the Vampires of London" I was Ecstatic! The cover art was awesome and it was about two of my favorite things, Sherlock Holmes and Vampires. I couldn\'t wait to dive into this! \n Unfortunately, that 

In [37]:
sim = similarities[0][0]
review = similarities[0][1]

In [38]:
answers['Q2'] = [sim, review]

In [39]:
assert len(answers['Q2']) == 2
assertFloat(answers['Q2'][0])
assert type(answers['Q2'][1]) == str

In [40]:
### Question 3

In [41]:
reviewsPerUser = defaultdict(list)

In [42]:
for d in dataset:
    reviewsPerUser[d['user_id']].append((dateutil.parser.parse(d['date_added']), d['book_id']))

In [43]:
reviewLists = []
for u in reviewsPerUser:
    rl = list(reviewsPerUser[u])
    rl.sort()
    reviewLists.append([x[1] for x in rl])

In [43]:
model10 = Word2Vec(reviewLists,
                 min_count=1, # Words/items with fewer instances are discarded
                 vector_size=10, # Model dimensionality
                 window=3, # Window size
                 sg=1) # Skip-gram model

In [44]:
reviewLists[0][0]

'18471619'

In [45]:
similarities = model10.wv.similar_by_word(reviewLists[0][0])[:5]

In [46]:
answers['Q3'] = similarities # probably want model10.wv.similar_by_word(...)[:5]

In [47]:
assert len(answers['Q3']) == 5
assert [type(x[0]) for x in answers['Q3']] == [str]*5
assertFloatList([x[1] for x in answers['Q3']], 5)

In [48]:
### Question 4

In [65]:
def Cosine(x1,x2):
    numer = 0
    norm1 = 0
    norm2 = 0
    for a1,a2 in zip(x1,x2):
        numer += a1*a2
        norm1 += a1**2
        norm2 += a2**2
    if norm1*norm2 != 0:
        return numer / math.sqrt(norm1*norm2)
    return 0

In [110]:
RatingPerItem = defaultdict(list)
RatingPerUser = defaultdict(list)
RatingPerPair = defaultdict()
UserPerItem = defaultdict(set)
ItemPerUser = defaultdict(set)
items = set()
users = set()
for d in dataset:
    u,b,r = d['user_id'],d['book_id'],d['rating']
    r = int(r)
    RatingPerUser[u].append(r)
    RatingPerItem[b].append(r)
    RatingPerPair[(u,b)]=r
    UserPerItem[b].add(u)
    ItemPerUser[u].add(b)
    items.add(b)
    users.add(u)

In [111]:
print(items)

{'22823536', '8230486', '8849965', '16108357', '29633067', '873540', '420315', '31410680', '22357', '32580518', '28109970', '25489242', '27396492', '18734029', '599046', '7595078', '9142875', '23250222', '13569386', '7202496', '92377', '1192744', '33836740', '25111227', '191213', '8160690', '15844954', '34312937', '1303186', '1386313', '34601031', '20613754', '23278618', '876383', '15784159', '17689328', '19630774', '24684906', '24042598', '28501505', '8301077', '435879', '555478', '13194', '7976759', '50602', '11029822', '3255665', '24939795', '395021', '435650', '271365', '17347674', '25066786', '7345903', '400678', '1281681', '2072057', '20758441', '31305482', '34072', '734309', '22259483', '154531', '333731', '26987292', '643534', '32855795', '15894542', '981876', '9555120', '724114', '10536697', '6430460', '754007', '154374', '29094002', '21285064', '13276507', '6314730', '17251111', '30842154', '15744312', '6740287', '12483428', '9593396', '29775342', '17671941', '32704048', '171

In [112]:
item1 = model10.wv['8230486']
item1
item2 = model10.wv['8849965']
item2

array([-0.0250452 ,  0.05349835, -0.01671794,  0.07662286, -0.06373516,
       -0.02672544, -0.06240458,  0.09797559, -0.02655385, -0.06740941],
      dtype=float32)

In [113]:
print(Cosine(item1,item2))

-0.014061371776140126


In [114]:
def predit_rating(user,item):
    Rating_mean = sum(RatingPerUser[user])/len(RatingPerUser[user])
    numer = 0
    denom = 0
    for j in ItemPerUser[user]:
         if item == j: continue
         sim = Cosine(model10.wv[item],model10.wv[j])
         numer += (RatingPerPair[(user,j)]-sum(RatingPerItem[j])/len(RatingPerItem[j]))*sim
         denom += sim
    if denom == 0: return Rating_mean
    return Rating_mean + numer/denom

In [115]:
def MSE(Y,Y_pred):
    mse = numpy.square(numpy.subtract(numpy.array(Y),numpy.array(Y_pred))).mean()
    return mse

In [116]:
y = []
y_pred=[]
for d in dataset[:1000]:
    u,b,r = d['user_id'],d['book_id'],d['rating']
    y.append(int(r))
    y_pred.append(predit_rating(u,b))

In [117]:
mse4 = MSE(y,y_pred)
print(len(y))
print(y_pred[1])
print(mse4)

1000
4.6193260018382585
110.4315724309804


In [118]:
answers['Q4'] = mse4

In [119]:
assertFloat(answers['Q4'])

In [120]:
### Q5

In [121]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [136]:
def predit_rating2(user,item):
    Rating_mean = sum(RatingPerUser[user])/len(RatingPerUser[user])
    numer = 0
    denom = 0
    for j in ItemPerUser[user]:
         if item == j: continue
         sim = Jaccard(set(model10.wv[item]),set(model10.wv[j]))
         numer += (RatingPerPair[(u,j)]-sum(RatingPerItem[j])/len(RatingPerItem[j]))*sim
         denom += sim
    #deal with the cold start
    if denom == 0: return Rating_mean
    return Rating_mean + numer/denom

In [137]:
def predit_rating_iterchange(user,item):
    Rating_mean = sum(RatingPerItem[item])/len(RatingPerItem[item])
    numer = 0
    denom = 0
    for u in UserPerItem[item]:
         if user == u: continue
         sim = Jaccard(ItemPerUser[u],ItemPerUser[user])
         numer += (RatingPerPair[(u,item)]-sum(RatingPerUser[u])/len(RatingPerUser[u]))*sim
         denom += sim
    if denom == 0: return Rating_mean
    return Rating_mean + numer/denom

In [140]:
y = []
y_pred=[]
for d in dataset[:1000]:
    u,b,r = d['user_id'],d['book_id'],d['rating']
    y.append(int(r))
    y_pred.append(predit_rating2(u,b))

In [141]:
mse5 = MSE(y,y_pred)
print(len(y))
print(y_pred[1])
print(mse5)

1000
4.125
0.9219204538702108


In [142]:
answers['Q5'] = ["Use the jaccard similarity to replace cosine",
                 mse5]

In [143]:
assert len(answers['Q5']) == 2
assert type(answers['Q5'][0]) == str
assertFloat(answers['Q5'][1])

In [144]:
f = open("answers_hw4.txt", 'w')
f.write(str(answers) + '\n')
f.close()