In [226]:
import random
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
from collections import defaultdict
import gzip
import sys
from math import e
from datetime import datetime

In [15]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [16]:
answers = {}

In [17]:
def parseData(fname):
    for l in open(fname):
        yield eval(l)

In [18]:
data = list(parseData("beer_50000.json"))

In [19]:
random.seed(0)
random.shuffle(data)

In [20]:
dataTrain = data[:25000]
dataValid = data[25000:37500]
dataTest = data[37500:]

In [21]:
yTrain = [d['beer/ABV'] > 7 for d in dataTrain]
yValid = [d['beer/ABV'] > 7 for d in dataValid]
yTest = [d['beer/ABV'] > 7 for d in dataTest]

In [39]:
max_review_len = 0
for d in dataTrain:
    max_review_len = max(max_review_len, len(d['review/text']))

In [22]:
categoryCounts = defaultdict(int)
max_len = 0
for d in data:
    categoryCounts[d['beer/style']] += 1

In [23]:
categories = [c for c in categoryCounts if categoryCounts[c] > 1000]

In [24]:
catID = dict(zip(list(categories),range(len(categories))))

In [25]:
print(catID)

{'American Porter': 0, 'Fruit / Vegetable Beer': 1, 'English Pale Ale': 2, 'Rauchbier': 3, 'American Pale Ale (APA)': 4, 'Scotch Ale / Wee Heavy': 5, 'American IPA': 6, 'Old Ale': 7, 'American Double / Imperial IPA': 8, 'American Double / Imperial Stout': 9, 'Czech Pilsener': 10, 'Rye Beer': 11, 'Russian Imperial Stout': 12}


In [45]:
def feat(d, includeCat = True, includeReview = True, includeLength = True):
    feats = []
    if includeCat:
        feats += [0] * len(catID)
        if d['beer/style'] in catID: feats[catID[d['beer/style']]] = 1
    
    if includeReview:
        feats += [d['review/appearance'], d['review/aroma'], d['review/overall'], d['review/palate'], d['review/taste']]

    if includeLength:
        feats += [len(d['review/text']) / max_review_len]
        
    return feats

In [243]:
def pipeline(reg, includeCat = True, includeReview = True, includeLength = True, max_iter = 100):
    xTrain = [feat(d, includeCat=includeCat, includeReview=includeReview, includeLength=includeLength) for d in dataTrain]
    xValid = [feat(d, includeCat=includeCat, includeReview=includeReview, includeLength=includeLength) for d in dataValid]
    xTest = [feat(d, includeCat=includeCat, includeReview=includeReview, includeLength=includeLength) for d in dataTest]
    model = linear_model.LogisticRegression(C=reg, class_weight='balanced', max_iter=max_iter)
    model.fit(xTrain, yTrain)

    y_pred_valid = model.predict(xValid)
    y_pred_test = model.predict(xTest)

    ber_valid = sum(yValid != y_pred_valid) / len(yValid)
    ber_test = sum(yTest != y_pred_test) / len(yTest)

    return model, ber_valid, ber_test
    

In [32]:
### Question 1

In [33]:
mod, validBER, testBER = pipeline(10, True, False, False)

In [34]:
answers['Q1'] = [validBER, testBER]

In [35]:
assertFloatList(answers['Q1'], 2)

In [43]:
### Question 2

In [62]:
mod, validBER, testBER = pipeline(10, True, True, True, 500)

In [51]:
answers['Q2'] = [validBER, testBER]

In [52]:
assertFloatList(answers['Q2'], 2)

In [54]:
### Question 3

In [63]:
lowest_ber = sys.maxsize
bestC = -1
for c in [0.001, 0.01, 0.1, 1, 10]:
    mod, validBER, testBER = pipeline(c, True, True, True, 500)
    if validBER < lowest_ber: bestC = c

In [64]:
mod, validBER, testBER = pipeline(bestC, True, True, True, 500)

In [59]:
answers['Q3'] = [bestC, validBER, testBER]

In [60]:
assertFloatList(answers['Q3'], 3)

In [66]:
### Question 4

In [67]:
mod, validBER, testBER_noCat = pipeline(1, False, True, True, 500)

In [68]:
mod, validBER, testBER_noReview = pipeline(1, True, False, True, 500)

In [69]:
mod, validBER, testBER_noLength = pipeline(1, True, True, False, 500)

In [70]:
answers['Q4'] = [testBER_noCat, testBER_noReview, testBER_noLength]

In [71]:
assertFloatList(answers['Q4'], 3)

In [73]:
### Question 5

In [130]:
path = "amazon_reviews_us_Musical_Instruments_v1_00.tsv"
f = open(path, 'rt', encoding="utf8")

header = f.readline()
header = header.strip().split('\t')

In [131]:
header

['marketplace',
 'customer_id',
 'review_id',
 'product_id',
 'product_parent',
 'product_title',
 'product_category',
 'star_rating',
 'helpful_votes',
 'total_votes',
 'vine',
 'verified_purchase',
 'review_headline',
 'review_body',
 'review_date']

In [132]:
dataset = []

pairsSeen = set()

for line in f:
    fields = line.strip().split('\t')
    d = dict(zip(header, fields))
    ui = (d['customer_id'], d['product_id'])
    if ui in pairsSeen:
        print("Skipping duplicate user/item:", ui)
        continue
    pairsSeen.add(ui)
    d['star_rating'] = int(d['star_rating'])
    d['helpful_votes'] = int(d['helpful_votes'])
    d['total_votes'] = int(d['total_votes'])
    dataset.append(d)

Skipping duplicate user/item: ('46953315', 'B00QM3CNN6')
Skipping duplicate user/item: ('31616428', 'B0026RB0G8')
Skipping duplicate user/item: ('47240912', 'B008I653SC')
Skipping duplicate user/item: ('14503091', 'B003FRMRC4')
Skipping duplicate user/item: ('38538360', 'B00HVLUR86')
Skipping duplicate user/item: ('43448024', 'B00HVLUR86')
Skipping duplicate user/item: ('51525270', 'B00HVLUR86')
Skipping duplicate user/item: ('20652160', 'B004OU2IQG')
Skipping duplicate user/item: ('10964440', 'B00HVLUR86')
Skipping duplicate user/item: ('20043677', 'B00HVLUR86')
Skipping duplicate user/item: ('44796499', 'B00HVLUSGM')
Skipping duplicate user/item: ('29066899', 'B0002CZSYO')
Skipping duplicate user/item: ('10385056', 'B004OU2IQG')
Skipping duplicate user/item: ('1658551', 'B00HVLURL8')
Skipping duplicate user/item: ('907433', 'B00N9Q2E5G')
Skipping duplicate user/item: ('39412969', 'B00HVLUR86')
Skipping duplicate user/item: ('4901688', 'B00HVLUR86')
Skipping duplicate user/item: ('234

In [159]:
dataTrain = dataset[:int(len(dataset)*0.9)]
dataTest = dataset[int(len(dataset)*0.9):]

In [160]:
# Feel free to keep or discard

usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
itemNames = {}
ratingDict = {} # To retrieve a rating for a specific user/item pair
reviewsPerUser = defaultdict(list)

for d in dataTrain:
    user,item = d['customer_id'], d['product_id']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    reviewsPerUser[user].append(d)

for d in dataset:
    user,item = d['customer_id'], d['product_id']
    ratingDict[(user,item)] = d['star_rating']
    itemNames[item] = d['product_title']

In [161]:
userAverages = {u: sum([ratingDict[(u,i)] for i in itemsPerUser[u]]) / len(itemsPerUser[u]) for u in itemsPerUser}
itemAverages = {i: sum([ratingDict[(u,i)] for u in usersPerItem[i]]) / len(usersPerItem[i]) for i in usersPerItem}

ratingMean = sum([d['star_rating'] for d in dataTrain]) / len(dataTrain)

In [162]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0: return 0
    return numer / denom

In [163]:
def mostSimilar(i, N):
    similarities = []
    users = usersPerItem[i]
    for j in usersPerItem:
        if i == j: continue
        sim = Jaccard(users, usersPerItem[j])
        similarities.append((sim, j))
    similarities.sort(reverse=True)
    return similarities[:N]

In [164]:
query = 'B00KCHRKD6'

In [165]:
ms = mostSimilar(query, 10)

In [166]:
answers['Q5'] = ms

In [167]:
assertFloatList([m[0] for m in ms], 10)

In [168]:
### Question 6

In [169]:
def MSE(y, ypred):
    return sum([(y[i] - ypred[i])**2 for i in range(len(y))]) / len(y)

In [214]:
def predictRating(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['product_id']
        if i2 == item: continue
        ratings.append(d['star_rating'] - itemAverages[i2])
        similarities.append(Jaccard(usersPerItem[item], usersPerItem[i2]))
    if sum(similarities) > 0:
        weightedRatings = [(x*y) for x, y in zip(ratings, similarities)]
        return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
        if item in itemAverages: 
            return itemAverages[item]
        else: 
            return ratingMean

In [215]:
alwaysPredictMean = [ratingMean] * len(dataTest)

In [216]:
simPredictions = [predictRating(d['customer_id'], d['product_id']) for d in dataTest]

In [217]:
labels = [d['star_rating'] for d in dataTest]

In [218]:
print(MSE(simPredictions, labels))
print(MSE(alwaysPredictMean, labels))

1.7165666373341593
1.6236571809194997


In [219]:
answers['Q6'] = MSE(simPredictions, labels)

In [212]:
assertFloat(answers['Q6'])

In [244]:
### Question 7

In [237]:
def decay(l, t1, t2):
    t1_unix = datetime.strptime(t1, '%Y-%m-%d').timestamp()
    t2_unix = datetime.strptime(t2, '%Y-%m-%d').timestamp()
    t_diff = abs(t2_unix - t1_unix)
    return e ** (-l * t_diff)

def predictRating(user, item, time):
    ratings = []
    similarities = []
    decays = []
    for d in reviewsPerUser[user]:
        i2 = d['product_id']
        if i2 == item: continue
        ratings.append(d['star_rating'] - itemAverages[i2])
        similarities.append(Jaccard(usersPerItem[item], usersPerItem[i2]))
        decays.append(decay(1, d['review_date'], time))
    if sum(similarities) > 0:
        weightedRatings = [(x*y*z) for x, y, z in zip(ratings, similarities, decays)]
        return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
        if item in itemAverages: 
            return itemAverages[item]
        else: 
            return ratingMean

decayPredictions = [predictRating(d['customer_id'], d['product_id'], d['review_date']) for d in dataTest]
itsMSE = MSE(decayPredictions, labels)
print(itsMSE)

1.6993689339769356


In [240]:
answers['Q7'] = ["The heuristic behind the decay function is that reviews made closer in time to that of the target item (argument passed into predictRating) will be \
                  more indicitive of the correct rating. Some factors that could lead to this are consumer preferences changing over time, products iterating and \
                  improving over time, or to simply filter out older reviews that could introduce irrelevancies/noise in our prediction. I decided to go with the \
                  f(abs(t_u,i - t_u,j)) decay function. Applying this function as shown in the equation on the homework doc, I was able to slightly reduce the MSE \
                  over the trivial decay function (Question 6 MSE)", itsMSE]

In [241]:
assertFloat(answers['Q7'][1])

In [242]:
f = open("answers_hw2.txt", 'w')
f.write(str(answers) + '\n')
f.close()