In [1]:
import gzip
from collections import defaultdict
from sklearn.metrics import mean_squared_error
import random
from sklearn import linear_model
import string
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math

In [2]:
def readGz(path):
    """Yield one parsed record per non-blank line of a gzip-compressed text file.

    Args:
        path: Location of the gzip file; it is opened in text mode.

    Yields:
        The object produced by evaluating each line as a Python expression.
    """
    # The context manager guarantees the handle is closed, including when the
    # consumer stops iterating early and the generator is closed.
    with gzip.open(path, 'rt') as f:
        for l in f:
            # Blank lines (e.g. a trailing newline) would make eval() raise.
            if not l.strip():
                continue
            # SECURITY: eval() executes arbitrary code, so only read trusted files.
            # ast.literal_eval is the safe alternative if every line is a plain literal.
            yield eval(l)

In [3]:
# Read at most the first 100000 records, replace each 'review/overall' string
# by the integer found before its first '/', then shuffle with a fixed seed.
data = []
count = 0
for count, d in enumerate(readGz('ratebeer.json.gz'), start=1):
    data.append(d)
    d['review/overall'] = int(d['review/overall'].partition('/')[0])
    if count >= 100000:
        break
random.seed(30)
random.shuffle(data)

In [4]:
# First 80000 shuffled records train, the next 10000 validate, the rest test.
trainSet, validSet, testSet = data[:80000], data[80000:90000], data[90000:]

In [None]:
# Baseline Model

In [None]:
# Mean overall rating of the training split.
allRatings = [int(d['review/overall']) for d in trainSet]
avgRating = sum(allRatings) / len(allRatings)
print(avgRating)

In [None]:
# Constant baseline: predict the training mean for every review.
testRatings = [d['review/overall'] for d in testSet]
validRatings = [d['review/overall'] for d in validSet]
# Each split gets a prediction list of its own length; previously the
# test-sized list was also scored against the validation ratings, which only
# worked because both splits happened to contain the same number of rows.
basePred = [avgRating] * len(testRatings)
baseValidPred = [avgRating] * len(validRatings)
# Arguments follow scikit-learn's (y_true, y_pred) order.
print(mean_squared_error(testRatings, basePred), mean_squared_error(validRatings, baseValidPred))

In [None]:
# Jaccard Similarity Model

In [None]:
# Training-set indexes:
#   itemPerUser[user] -> list of (beerId, rating) pairs
#   userPerItem[beer] -> list of (profileName, rating) pairs
#   brewerSet         -> every brewer id seen
itemPerUser = defaultdict(list)
userPerItem = defaultdict(list)
brewerSet = set()
for d in trainSet:
    user, item, rating = d['review/profileName'], d['beer/beerId'], d['review/overall']
    itemPerUser[user].append((item, rating))
    userPerItem[item].append((user, rating))
    brewerSet.add(d['beer/brewerId'])

In [None]:
def jaccard(s1, s2):
    """Return the Jaccard similarity of two sets.

    Args:
        s1: First set.
        s2: Second set.

    Returns:
        len(intersection) / len(union), or 0.0 when both sets are empty
        (the ratio is undefined there and used to raise ZeroDivisionError).
    """
    denom = len(s1.union(s2))
    if denom == 0:
        return 0.0
    return len(s1.intersection(s2)) / denom

In [None]:
def predictByItem(user, item):
    """Predict a rating as a similarity-weighted mean of the user's other ratings.

    Each other beer the user rated in training is weighted by the Jaccard
    similarity between the users who rated it and the users who rated `item`.

    Args:
        user: Profile name of the reviewer.
        item: Beer id to predict for.

    Returns:
        The weighted mean rating, or avgRating when every weight is zero.
    """
    # .get() keeps unseen ids from being inserted into the defaultdicts.
    # Only the user ids are compared: the stored entries are (user, rating)
    # tuples, and intersecting those would count a shared user only when they
    # gave both beers the same rating.
    users1 = {u for u, _ in userPerItem.get(item, ())}
    sims = []
    ratings = []
    for other, rating in itemPerUser.get(user, ()):
        if other == item:
            continue
        users2 = {u for u, _ in userPerItem.get(other, ())}
        sims.append(jaccard(users1, users2))
        ratings.append(rating)
    z = sum(sims)
    if z == 0:
        return avgRating
    return sum(s * r for s, r in zip(sims, ratings)) / z

In [None]:
# Item-based Jaccard prediction for every validation review.
jacItemPred = [predictByItem(d['review/profileName'], d['beer/beerId']) for d in validSet]

In [None]:
# Validation MSE; arguments given in scikit-learn's (y_true, y_pred) order.
print(mean_squared_error(validRatings, jacItemPred))

In [None]:
def predictByUser(user, item):
    """Predict a rating as a similarity-weighted mean of other users' ratings.

    Each other user who rated `item` in training is weighted by the Jaccard
    similarity between the beers they rated and the beers `user` rated.

    Args:
        user: Profile name of the reviewer.
        item: Beer id to predict for.

    Returns:
        The weighted mean rating, or avgRating when every weight is zero.
    """
    # .get() keeps unseen ids from being inserted into the defaultdicts.
    # Only beer ids are compared: the stored entries are (beer, rating) tuples,
    # and intersecting those would count a shared beer only when both users
    # gave it the same rating.
    items1 = {i for i, _ in itemPerUser.get(user, ())}
    sims = []
    ratings = []
    for other, rating in userPerItem.get(item, ()):
        if other == user:
            continue
        items2 = {i for i, _ in itemPerUser.get(other, ())}
        sims.append(jaccard(items1, items2))
        ratings.append(rating)
    z = sum(sims)
    if z == 0:
        return avgRating
    return sum(s * r for s, r in zip(sims, ratings)) / z

In [None]:
# User-based Jaccard prediction for every validation review
# (the list keeps the name used by the item-based run).
jacItemPred = [predictByUser(d['review/profileName'], d['beer/beerId']) for d in validSet]

In [None]:
# Validation MSE; arguments given in scikit-learn's (y_true, y_pred) order.
print(mean_squared_error(validRatings, jacItemPred))

In [None]:
def predictByUserV2(user, item):
    """Mean-centred user-based prediction.

    Starts from the user's own mean training rating and adds the
    similarity-weighted mean of how far each other rater of `item` deviated
    from their own mean. Weights are Jaccard similarities between the sets of
    beers rated by the two users.

    Args:
        user: Profile name of the reviewer.
        item: Beer id to predict for.

    Returns:
        The adjusted rating, or avgRating when every weight is zero.
    """
    # .get() keeps unseen ids from being inserted into the defaultdicts.
    mine = itemPerUser.get(user, ())
    # Only beer ids are compared: the stored entries are (beer, rating) tuples,
    # and intersecting those would count a shared beer only when both users
    # gave it the same rating.
    items1 = {i for i, _ in mine}
    sims = []
    ratings = []
    for other, rating in userPerItem.get(item, ()):
        if other == user:
            continue
        theirs = itemPerUser.get(other, ())
        items2 = {i for i, _ in theirs}
        sims.append(jaccard(items1, items2))
        # Deviation of this rating from the other user's own mean rating.
        ratings.append(rating - sum(r for _, r in theirs) / len(theirs))
    z = sum(sims)
    if z == 0:
        return avgRating
    # z > 0 implies the user shares a beer with someone, so `mine` is non-empty.
    userMean = sum(r for _, r in mine) / len(mine)
    return userMean + sum(s * r for s, r in zip(sims, ratings)) / z

In [None]:
# Mean-centred user-based prediction for every validation review.
jacItemPred = [predictByUserV2(d['review/profileName'], d['beer/beerId']) for d in validSet]

In [None]:
# Validation MSE; arguments given in scikit-learn's (y_true, y_pred) order.
print(mean_squared_error(validRatings, jacItemPred))

In [None]:
def predictByUserV3(user, item):
    """User-based prediction with an item-mean fallback.

    Same weighting as predictByUser, but when every similarity is zero the
    mean training rating of `item` is returned if the beer has any training
    ratings; avgRating is used only when it has none.

    Args:
        user: Profile name of the reviewer.
        item: Beer id to predict for.

    Returns:
        The weighted mean rating, the beer's mean rating, or avgRating.
    """
    # .get() keeps unseen ids from being inserted into the defaultdicts.
    raters = userPerItem.get(item, ())
    # Only beer ids are compared: the stored entries are (beer, rating) tuples,
    # and intersecting those would count a shared beer only when both users
    # gave it the same rating.
    items1 = {i for i, _ in itemPerUser.get(user, ())}
    sims = []
    ratings = []
    for other, rating in raters:
        if other == user:
            continue
        items2 = {i for i, _ in itemPerUser.get(other, ())}
        sims.append(jaccard(items1, items2))
        ratings.append(rating)
    z = sum(sims)
    if z != 0:
        return sum(s * r for s, r in zip(sims, ratings)) / z
    if raters:
        return sum(r for _, r in raters) / len(raters)
    return avgRating

In [None]:
# User-based prediction (item-mean fallback) for every validation review.
jacItemPred = [predictByUserV3(d['review/profileName'], d['beer/beerId']) for d in validSet]

In [None]:
# Validation MSE; arguments given in scikit-learn's (y_true, y_pred) order.
print(mean_squared_error(validRatings, jacItemPred))

In [None]:
# User-based prediction (item-mean fallback) for every test review.
# The former `count += 1` was dropped: nothing read the counter, and it only
# ran because `count` was left over from an earlier cell.
jacItemPred = []
for d in testSet:
    jacItemPred.append(predictByUserV3(d['review/profileName'], d['beer/beerId']))

In [None]:
# Test MSE; arguments given in scikit-learn's (y_true, y_pred) order.
print(mean_squared_error(testRatings, jacItemPred))

In [None]:
# Bag of Words Model

In [None]:
def cleanText(review):
    """Split a review into whitespace-separated tokens, keeping case and punctuation."""
    return review.strip().split()

In [None]:
# Attach the cleanText tokens to every review of all three splits.
for split in (trainSet, validSet, testSet):
    for d in split:
        d['words'] = cleanText(d['review/text'])

In [None]:
# Token frequencies over the training reviews.
wordDic = defaultdict(int)
for d in trainSet:
    for word in d['words']:
        wordDic[word] += 1

# (frequency, token) pairs, most frequent first. The sort is stable, so tokens
# with equal frequency keep their dictionary order.
counts = sorted(
    ((value, key) for key, value in wordDic.items()),
    key=lambda pair: pair[0],
    reverse=True,
)

In [None]:
# Dictionary: the dicSize most frequent tokens, each mapped to a column index.
dicSize = 2000
subCounts = counts[:dicSize]
wordID = {}
wordSet = set()
for count, (freq, word) in enumerate(subCounts):
    wordID[word] = count
    wordSet.add(word)
# Leave `count` equal to the number of dictionary entries.
count = len(subCounts)

In [None]:
def feature(words):
    """Build a token-count vector over the dictionary, followed by a constant 1.0.

    Args:
        words: Iterable of tokens; tokens outside wordSet are ignored.

    Returns:
        A list of dicSize counts plus a trailing 1.0.
    """
    f = [0.0] * dicSize
    for word in words:
        if word not in wordSet:
            continue
        f[wordID[word]] += 1.0
    return f + [1.0]

In [None]:
# Feature rows and rating targets for the training and validation splits.
Xtrain = list(map(feature, (d['words'] for d in trainSet)))
ytrain = list(d['review/overall'] for d in trainSet)
Xvalid = list(map(feature, (d['words'] for d in validSet)))
yvalid = list(d['review/overall'] for d in validSet)

In [None]:
# Fit a linear regression on the training features; fit() returns the estimator.
mod = linear_model.LinearRegression(n_jobs=-1).fit(Xtrain, ytrain)
mod

In [None]:
# Validation error of the fitted model.
ypred = mod.predict(Xvalid)
validMSE = mean_squared_error(yvalid, ypred)
print(validMSE)

In [None]:
def cleanTextV2(review):
    """Drop string.punctuation characters, lower-case the rest, split on whitespace."""
    kept = (ch.lower() for ch in review if ch not in string.punctuation)
    return ''.join(kept).strip().split()

In [None]:
# Re-tokenise every review of all three splits with cleanTextV2.
for split in (trainSet, validSet, testSet):
    for d in split:
        d['words'] = cleanTextV2(d['review/text'])

In [None]:
# Token frequencies over the training reviews.
wordDic = defaultdict(int)
for d in trainSet:
    for word in d['words']:
        wordDic[word] += 1

# (frequency, token) pairs, most frequent first. The sort is stable, so tokens
# with equal frequency keep their dictionary order.
counts = sorted(
    ((value, key) for key, value in wordDic.items()),
    key=lambda pair: pair[0],
    reverse=True,
)

In [None]:
# Dictionary: the dicSize most frequent tokens, each mapped to a column index.
dicSize = 2000
subCounts = counts[:dicSize]
wordID = {}
wordSet = set()
for count, (freq, word) in enumerate(subCounts):
    wordID[word] = count
    wordSet.add(word)
# Leave `count` equal to the number of dictionary entries.
count = len(subCounts)

In [None]:
# Feature rows and rating targets for the training and validation splits.
Xtrain = list(map(feature, (d['words'] for d in trainSet)))
ytrain = list(d['review/overall'] for d in trainSet)
Xvalid = list(map(feature, (d['words'] for d in validSet)))
yvalid = list(d['review/overall'] for d in validSet)

In [None]:
# Fit a linear regression on the training features; fit() returns the estimator.
mod = linear_model.LinearRegression(n_jobs=-1).fit(Xtrain, ytrain)
mod

In [None]:
# Validation error of the fitted model.
ypred = mod.predict(Xvalid)
validMSE = mean_squared_error(yvalid, ypred)
print(validMSE)

In [None]:
# Best model: stop-word-filtered, stemmed TF-IDF bag of words with linear regression

In [18]:
# Shared text-normalisation resources used by cleanTextV3.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))  # set for fast membership checks

In [19]:
def cleanTextV3(review):
    """Normalise a review into stemmed tokens.

    Characters in string.punctuation are removed, the rest is lower-cased and
    split on whitespace; tokens found in stop_words are dropped and every
    remaining token is passed through stemmer.stem.
    """
    # NOTE(review): punctuation is stripped before the stop-word check, so any
    # stop-word entry containing an apostrophe could never match — confirm
    # whether that is intended.
    kept = ''.join(ch.lower() for ch in review if ch not in string.punctuation)
    stems = []
    for token in kept.strip().split():
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return stems

In [20]:
# Re-tokenise every review of all three splits with cleanTextV3.
for split in (trainSet, validSet, testSet):
    for d in split:
        d['words'] = cleanTextV3(d['review/text'])

In [21]:
# wordDic: total occurrences of each token in the training reviews.
# docCount: number of training reviews containing each token at least once.
wordDic = defaultdict(int)
docCount = defaultdict(int)
for d in trainSet:
    tokens = d['words']
    for word in tokens:
        wordDic[word] += 1
    for w in set(tokens):
        docCount[w] += 1

# (frequency, token) pairs, most frequent first. The sort is stable, so tokens
# with equal frequency keep their dictionary order.
counts = sorted(
    ((value, key) for key, value in wordDic.items()),
    key=lambda pair: pair[0],
    reverse=True,
)

In [22]:
# Dictionary: the dicSize most frequent tokens, each mapped to a column index.
dicSize = 2000
subCounts = counts[:dicSize]
wordID = {}
wordSet = set()
for count, (freq, word) in enumerate(subCounts):
    wordID[word] = count
    wordSet.add(word)
# Leave `count` equal to the number of dictionary entries.
count = len(subCounts)

In [23]:
# Inverse document frequency per dictionary column:
# log10(number of training reviews / reviews containing the token).
idf = [0] * dicSize
for word, index in wordID.items():
    idf[index] = math.log10(len(trainSet) / docCount[word])

In [25]:
def feature(words):
    """Build a TF-IDF vector over the dictionary, followed by a constant 1.0.

    Term frequency is the token's count divided by the largest count in the
    review; it is then multiplied by the token's idf entry.

    Args:
        words: Iterable of tokens; tokens outside wordSet are ignored.

    Returns:
        A list of dicSize weights plus a trailing 1.0.
    """
    f = [0.0] * dicSize
    indices = set()
    for word in words:
        if word not in wordSet:
            continue
        index = wordID[word]
        indices.add(index)
        f[index] += 1.0
    mx = max(f)
    if mx != 0.0:
        # Only the columns that were hit need rescaling.
        for i in indices:
            f[i] = f[i] / mx * idf[i]
    return f + [1.0]

In [26]:
# Feature rows and rating targets for all three splits.
Xtrain = list(map(feature, (d['words'] for d in trainSet)))
ytrain = list(d['review/overall'] for d in trainSet)
Xvalid = list(map(feature, (d['words'] for d in validSet)))
yvalid = list(d['review/overall'] for d in validSet)
Xtest = list(map(feature, (d['words'] for d in testSet)))
ytest = list(d['review/overall'] for d in testSet)

In [27]:
# Free the large counting intermediates. wordSet, wordID and idf are kept
# because feature() reads them at call time; deleting them made any later
# call to feature() (or a re-run of the previous cell) fail with NameError.
del counts, subCounts, wordDic, docCount

In [28]:
# Fit a linear regression on the TF-IDF features; fit() returns the estimator.
mod = linear_model.LinearRegression(n_jobs=-1).fit(Xtrain, ytrain)
mod

LinearRegression(n_jobs=-1)

In [29]:
# Validation and test error of the fitted model.
ypred = mod.predict(Xvalid)
tpred = mod.predict(Xtest)
validMSE = mean_squared_error(yvalid, ypred)
testMSE = mean_squared_error(ytest, tpred)
print(validMSE, testMSE)

4.866160748056001 4.978496080872587


In [None]:
# Release the feature matrices, targets and predictions.
del Xtrain, Xvalid, Xtest
del ytrain, yvalid, ytest
del ypred, tpred