In [2]:
import json
from collections import defaultdict
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
import numpy
import random
import gzip
import dateutil.parser
import math

In [3]:
# Accumulates the result for each question, keyed 'Q1'..'Q7'; written to disk by the final cell.
answers = {}

In [4]:
def assertFloat(x):
    """Check that `x` can be converted to a float.

    Raises whatever `float(x)` raises (e.g. ValueError/TypeError) when the
    conversion is impossible, or AssertionError if the result is not a float.
    """
    converted = float(x)
    assert type(converted) == float

def assertFloatList(items, N):
    """Check that `items` has exactly `N` entries, each convertible to float.

    Raises AssertionError on a length mismatch; conversion failures propagate
    from `float()`.
    """
    assert len(items) == N
    # The length is already pinned to N, so checking each element is equivalent
    # to comparing the list of converted types against [float] * N.
    assert all(type(float(value)) == float for value in items)

In [5]:
### Question 1

In [6]:
# Load the fantasy reviews: every line of the gzipped file is parsed as one JSON document.
# The context manager guarantees the file handle is closed, even if a line fails to parse.
with gzip.open("fantasy_10000.json.gz") as f:
    dataset = [json.loads(l) for l in f]

In [7]:
# Peek at the first record to see which fields each review carries.
print(dataset[0])

{'user_id': '8842281e1d1347389f2ab93d60773d4d', 'book_id': '18245960', 'review_id': 'dfdbb7b0eb5a7e4c26d59a937e2e5feb', 'rating': 5, 'review_text': 'This is a special book. It started slow for about the first third, then in the middle third it started to get interesting, then the last third blew my mind. This is what I love about good science fiction - it pushes your thinking about where things can go. \n It is a 2015 Hugo winner, and translated from its original Chinese, which made it interesting in just a different way from most things I\'ve read. For instance the intermixing of Chinese revolutionary history - how they kept accusing people of being "reactionaries", etc. \n It is a book about science, and aliens. The science described in the book is impressive - its a book grounded in physics and pretty accurate as far as I could tell. Though when it got to folding protons into 8 dimensions I think he was just making stuff up - interesting to think about though. \n But what would happ

In [8]:
# Length of the longest review text; used to scale the length feature.
# Stays 0 when the dataset is empty.
max_len = max((len(record['review_text']) for record in dataset), default=0)

def feature(datum):
    """Return the review's text length divided by the global `max_len`."""
    review = datum['review_text']
    return len(review) / max_len

In [9]:
# Design matrix with a single column (the scaled review length) and the rating targets.
X = numpy.array(list(map(feature, dataset))).reshape(-1, 1)
Y = [record['rating'] for record in dataset]

In [10]:
# Fit rating as a linear function of the single scaled-length column in X.
# The intercept and slope are read back from the fitted model in the next cell.
model = linear_model.LinearRegression()
model.fit(X, Y)

In [11]:
# Q1 answer: [intercept, slope, MSE], with the MSE measured on the data the model was fit to.
intercept, slope = model.intercept_, model.coef_[0]
theta = [intercept, slope]
y_pred = model.predict(X)
MSE = mean_squared_error(Y, y_pred)
answers['Q1'] = theta + [MSE]

In [12]:
# Sanity check: Q1 must hold exactly three float-convertible values.
assertFloatList(answers['Q1'], 3)

In [13]:
### Question 2

In [14]:
# Parse each record's 'date_added' string once and cache the result on the
# record under 'parsed_date' (mutates the records in place).
for record in dataset:
    record['parsed_date'] = dateutil.parser.parse(record['date_added'])

In [15]:
def feature(datum):
    """Build the 19-entry feature vector for one review.

    Layout:
      [0]      constant 1 (offset term)
      [1]      review text length divided by the global `max_len`
      [2..7]   one-hot weekday for weekday() 1..6; weekday() == 0 is the
               all-zero reference category
      [8..18]  one-hot month for months 2..12; month 1 is the all-zero
               reference category
    """
    when = datum['parsed_date']
    vec = [1, len(datum['review_text']) / max_len] + [0] * 17

    day = when.weekday()
    if day != 0:
        vec[day + 1] = 1

    if when.month != 1:
        vec[when.month + 6] = 1

    return vec

In [16]:
# Feature vectors and rating targets for every review, in dataset order.
X = list(map(feature, dataset))
Y = [record['rating'] for record in dataset]

In [17]:
# fit_intercept=False: the feature vectors already carry a constant 1 in
# position 0, so the offset is learned as an ordinary coefficient.
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(X, Y)

In [18]:
# Q2 answer: the feature vectors of the first two reviews.
answers['Q2'] = [X[0], X[1]]

In [19]:
# Both reported feature vectors must have 19 float-convertible entries.
for position in (0, 1):
    assertFloatList(answers['Q2'][position], 19)

In [20]:
### Question 3

In [21]:
def feature3(datum):
    """Build the 4-entry feature vector [1, scaled length, weekday, month].

    Weekday and month are used directly as the integers returned by
    `weekday()` and `.month`, rather than being one-hot encoded.
    """
    when = datum['parsed_date']
    scaled_length = len(datum['review_text']) / max_len
    return [1, scaled_length, when.weekday(), when.month]

In [22]:
# Integer-encoded feature vectors and rating targets for every review.
X3 = list(map(feature3, dataset))
Y3 = [record['rating'] for record in dataset]

In [23]:
# fit_intercept=False: feature3 already supplies the constant 1 in position 0.
model3 = linear_model.LinearRegression(fit_intercept=False)
model3.fit(X3, Y3)

In [24]:
# Q3 answer: MSE of the one-hot model (`model` on X) next to the MSE of the
# integer-encoded model (`model3` on X3), each measured on the data it was fit to.
y_pred2, y_pred3 = model.predict(X), model3.predict(X3)
mse2 = mean_squared_error(Y, y_pred2)
mse3 = mean_squared_error(Y3, y_pred3)
answers['Q3'] = [mse2, mse3]

In [25]:
# Sanity check: Q3 must hold exactly two float-convertible values.
assertFloatList(answers['Q3'], 2)

In [26]:
### Question 4

In [27]:
# Seed the RNG immediately before shuffling so the resulting order is repeatable.
random.seed(0)
# NOTE: shuffles `dataset` in place; re-running this cell alone reshuffles the
# already-shuffled list, so the order then differs from a fresh top-to-bottom run.
random.shuffle(dataset)

In [28]:
# Recompute both feature encodings and the targets so they follow the shuffled order.
X2 = list(map(feature, dataset))
X3 = list(map(feature3, dataset))
Y = [record['rating'] for record in dataset]

In [29]:
def _split_in_half(seq):
    """Return (first half, second half) of `seq`; the second half takes the extra item when the length is odd."""
    midpoint = len(seq) // 2
    return seq[:midpoint], seq[midpoint:]


# First half is the training set, second half the test set, for both encodings and the targets.
train2, test2 = _split_in_half(X2)
train3, test3 = _split_in_half(X3)
trainY, testY = _split_in_half(Y)

In [30]:
# Refit both encodings on the training half only.
# fit_intercept=False in both cases because each feature vector starts with a constant 1.
model2_1 = linear_model.LinearRegression(fit_intercept=False)
model2_1.fit(train2, trainY)
model3_1 = linear_model.LinearRegression(fit_intercept=False)
model3_1.fit(train3, trainY)

In [31]:
# Q4 answer: MSE on the held-out half for the one-hot model and the integer-encoded model.
y_pred2_1, y_pred3_1 = model2_1.predict(test2), model3_1.predict(test3)
test_mse2 = mean_squared_error(testY, y_pred2_1)
test_mse3 = mean_squared_error(testY, y_pred3_1)
answers['Q4'] = [test_mse2, test_mse3]

In [32]:
# Sanity check: Q4 must hold exactly two float-convertible values.
assertFloatList(answers['Q4'], 2)

In [33]:
### Question 5

In [34]:
# Load the beer reviews; this replaces the fantasy `dataset` from the earlier questions.
# Each line of the file is evaluated as a Python expression.
# SECURITY: eval() executes whatever code the file contains. Only run this on a
# trusted copy of the data; if every line is a plain literal, ast.literal_eval
# would be a safer parser.
# The context manager closes the file even if a line fails to evaluate.
with open("beer_50000.json") as f:
    dataset = [eval(l) for l in f]

In [35]:
# Longest beer-review text; rebinds the global `max_len` used by the feature functions below.
# Stays 0 when the dataset is empty.
max_len = max((len(record['review/text']) for record in dataset), default=0)

def feature(datum):
    """Return [1, review text length / max_len] for one beer review."""
    scaled_length = len(datum['review/text']) / max_len
    return [1, scaled_length]

In [36]:
# Feature vectors, and binary labels: 1 when review/overall is at least 4, else 0.
X = list(map(feature, dataset))
y = [int(record['review/overall'] >= 4) for record in dataset]

In [37]:
# Logistic regression on [1, scaled length].
# fit_intercept=False because the constant 1 is already the first feature;
# class_weight='balanced' is requested so the two label classes are weighted by the library.
model = linear_model.LogisticRegression(fit_intercept=False, class_weight='balanced', C=1.0)
model.fit(X, y)

In [38]:
# Count the four confusion-matrix cells by pairing each prediction with its label
# (predictions are made on the same X the model was fit to).
y_pred = model.predict(X).tolist()
outcomes = list(zip(y_pred, y))
TP = sum(pred and label for pred, label in outcomes)
TN = sum(not pred and not label for pred, label in outcomes)
FP = sum(pred and not label for pred, label in outcomes)
FN = sum(not pred and label for pred, label in outcomes)

In [39]:
# Balanced error rate: one minus the mean of the true-positive and true-negative rates.
true_positive_rate = TP / (TP + FN)
true_negative_rate = TN / (TN + FP)
BER = 1 - 0.5 * (true_positive_rate + true_negative_rate)
print(BER)

0.4683031525957275


In [40]:
# Q5 answer: the four confusion counts followed by the balanced error rate.
answers['Q5'] = [TP, TN, FP, FN, BER]

In [41]:
# Sanity check: Q5 must hold exactly five float-convertible values.
assertFloatList(answers['Q5'], 5)

In [42]:
### Question 6

In [43]:
# Rank the reviews from highest to lowest decision score, keeping each label with its score.
# Tuples compare on the score first and on the label second, so tied scores put label 1 first.
scores = model.decision_function(X)
score_labels = sorted(zip(scores, y), reverse=True)
print(score_labels[:10])
sorted_labels = [label for _score, label in score_labels]

[(1.402055713537598, 1), (1.3905242586192261, 1), (1.3304209178325606, 1), (1.295826553077445, 1), (1.2692692629624067, 1), (1.2420130967917096, 1), (1.2081176080922529, 0), (1.184006384172021, 1), (1.1651367306692304, 1), (1.1591962841961299, 0)]


In [44]:
# Will hold precision@k for each cutoff k evaluated in the next cell.
precs = []

In [386]:
# Precision@k: the fraction of positive labels among the k highest-scored reviews.
# The list is rebuilt from scratch here rather than appended to, so re-running
# this cell always leaves exactly one value per cutoff in `precs`.
precs = [sum(sorted_labels[:k]) / k for k in [1, 100, 1000, 10000]]

In [387]:
# Q6 answer: precision@k for k = 1, 100, 1000, 10000, in that order.
answers['Q6'] = precs

In [388]:
# Sanity check: Q6 must hold exactly four float-convertible values.
assertFloatList(answers['Q6'], 4)

In [389]:
### Question 7

In [390]:
# For every beer style, compute the fraction of its reviews with review/overall >= 4.
# Reviews and positives are counted as integers and divided once per style;
# repeatedly adding 1/n instead accumulates floating-point rounding error.
style_count = defaultdict(int)
style_positive = defaultdict(int)
for x in dataset:
    style_count[x['beer/style']] += 1
    if x['review/overall'] >= 4:
        style_positive[x['beer/style']] += 1

# Keys follow the order in which styles first appear in the dataset.
styles = {style: style_positive[style] / total for style, total in style_count.items()}

# Highest positive fraction first.
sorted_style = sorted(styles.items(), key=lambda item: item[1], reverse=True)
print(sorted_style)

[('Keller Bier / Zwickel Bier', 0.9130434782608692), ('American Double / Imperial Stout', 0.8868209255533662), ('Rye Beer', 0.8526140155728487), ('Russian Imperial Stout', 0.8512059369202009), ('English Barleywine', 0.8496240601503784), ('English Dark Mild Ale', 0.8095238095238098), ('American IPA', 0.7862873814734183), ('Munich Helles Lager', 0.7830769230769155), ('American Porter', 0.7825112107623529), ('Baltic Porter', 0.7723735408560286), ('American Wild Ale', 0.734693877551021), ('Belgian Strong Pale Ale', 0.7294303797468384), ('Chile Beer', 0.7272727272727274), ('Belgian IPA', 0.7265625), ('Doppelbock', 0.7216494845360802), ('BiÃ¨re de Garde', 0.7142857142857142), ('Oatmeal Stout', 0.7058823529411761), ('Scotch Ale / Wee Heavy', 0.7028097982708696), ('Black & Tan', 0.688524590163935), ('American Barleywine', 0.6836363636363669), ('American Double / Imperial IPA', 0.6739577972208289), ('KÃ¶lsch', 0.6702127659574464), ('Rauchbier', 0.6646026831785351), ('Old Ale', 0.664448669201522

In [391]:
def feature2(datum):
    """Build the 10-entry feature vector used for Q7.

    Layout: constant 1, review text length / max_len, beer/ABV, the aroma,
    palate and taste sub-ratings, then four 0/1 flags on the style name:
    contains 'stout', contains 'PA' (case-sensitive), contains 'ale',
    contains 'porter' (the other three are case-insensitive).
    """
    style = datum['beer/style']
    lowered = style.lower()
    return [
        1,
        len(datum['review/text']) / max_len,
        datum['beer/ABV'],
        datum['review/aroma'],
        datum['review/palate'],
        datum['review/taste'],
        int('stout' in lowered),
        int('PA' in style),
        int('ale' in lowered),
        int('porter' in lowered),
    ]

# Extended feature vectors and the same binary labels as Q5 (1 when review/overall >= 4).
X2 = list(map(feature2, dataset))
y2 = [int(record['review/overall'] >= 4) for record in dataset]

# max_iter is raised to 500 for this larger feature set.
model2 = linear_model.LogisticRegression(fit_intercept=False, class_weight='balanced', C=1.0, max_iter=500)
model2.fit(X2, y2)

# Confusion counts, obtained by pairing each prediction with its label.
y_pred2 = model2.predict(X2).tolist()
outcomes2 = list(zip(y_pred2, y2))
TP2 = sum(pred and label for pred, label in outcomes2)
TN2 = sum(not pred and not label for pred, label in outcomes2)
FP2 = sum(pred and not label for pred, label in outcomes2)
FN2 = sum(not pred and label for pred, label in outcomes2)

# NOTE: despite its name, this BER is measured on the same X2/y2 the model was fit to.
true_positive_rate2 = TP2 / (TP2 + FN2)
true_negative_rate2 = TN2 / (TN2 + FP2)
its_test_BER = 1 - 0.5 * (true_positive_rate2 + true_negative_rate2)
print(its_test_BER)

0.17595629485511854


In [392]:
# Q7 answer: a free-text explanation followed by the achieved BER.
# The text is assembled from adjacent string literals; backslash continuations
# inside one literal would embed each continuation line's indentation in the answer.
# The two BER figures are formatted from the computed values so the text cannot
# drift out of sync with the results.
q7_explanation = (
    "I decided to add the features that I thought would intuitively be some indicators to help decide if a beer is good or bad. "
    "One feature that can tell about how a beer will taste is the ABV and the style. "
    "For the different styles of beer, I listed them by the proportion of the style that had review/overall >= 4. "
    "I noticed that stouts, IPAs, Ales, and Porters were typically well reviewed and popular styles in general. "
    "Thus, I did a one-hot encoding for these styles; if a beer wasn't any of these styles, then the one-hot encoding would be all zeros. "
    "I thought that the review rating subcategories could help predict the overall score. "
    "Thus, I added the features for aroma, palate, and taste ratings which I believed to be most indicative of the overall score. "
    "With these features, I achieved a BER of {:.4f} vs the previous BER of {:.4f}"
).format(its_test_BER, BER)
answers['Q7'] = [q7_explanation, its_test_BER]

In [393]:
# Write the collected answers as a single line containing the dict's repr.
# The context manager flushes and closes the file even if the write raises.
with open("answers_hw1.txt", 'w') as f:
    f.write(str(answers) + '\n')