In [29]:
import json
from matplotlib import pyplot as plt
from collections import defaultdict
from sklearn import linear_model
import numpy
import random
import gzip
import math

In [30]:
def assertFloat(x):
    """Check that an answer can be converted to a float.

    The value itself is not required to be a float: anything float() accepts
    passes. float() raises (ValueError/TypeError) for values it cannot convert.
    """
    converted = float(x)
    assert type(converted) == float

def assertFloatList(items, N):
    """Check that `items` has exactly N entries and each converts to a float.

    Args:
        items: sequence of answers.
        N: required number of entries.
    """
    assert len(items) == N
    # Entries are converted in order, so the first unconvertible one raises.
    assert all(type(float(entry)) == float for entry in items)

In [31]:
# Load the review sample: the gzipped file holds one JSON object per line.
# The context manager closes the file handle even if a line fails to parse
# (the handle was previously left open for the life of the kernel).
with gzip.open("young_adult_10000.json.gz") as f:
    dataset = [json.loads(l) for l in f]

In [32]:
# Number of review records loaded (shown as the cell output).
len(dataset)

10000

In [33]:
# Holds the answer to each question under keys such as 'Q1'; the last cell
# writes this dictionary to answers_hw1.txt.
answers = {}

In [34]:
# Display one record to see which fields a review carries.
dataset[1]

{'user_id': '7504b2aee1ecb5b2872d3da381c6c91e',
 'book_id': '23302416',
 'review_id': '84c0936a0f9868f38e75d2f9a5cb761e',
 'rating': 5,
 'review_text': "I read this book because my fifth grade son was required to for school. I'm so glad I did! I experienced a range of emotions & just loved it. Glad these middle schoolers are being exposed to the topics discussed in the book.",
 'date_added': 'Wed Jan 21 18:40:59 -0800 2015',
 'date_updated': 'Wed Oct 26 03:44:13 -0700 2016',
 'read_at': '',
 'started_at': '',
 'n_votes': 0,
 'n_comments': 0}

In [35]:
### Question 1

In [36]:
def feature(datum):
    """Build the Question 1 feature vector for one review.

    Args:
        datum: review record whose 'review_text' entry is the review text.

    Returns:
        [1, c] where 1 is the constant (offset) term and c is the number of
        '!' characters in the review text.
    """
    exclamations = datum['review_text'].count('!')
    return [1, exclamations]
        

In [37]:
# Question 1: least-squares fit of rating ~ theta0 + theta1 * (number of '!').
X = [feature(d) for d in dataset]
Y = [d['rating'] for d in dataset]
# Passing rcond explicitly removes the FutureWarning that NumPy < 2.0 emits
# when it is omitted; None selects the documented current default cutoff.
theta, residuals, rank, s = numpy.linalg.lstsq(X, Y, rcond=None)
theta0 = theta[0]
theta1 = theta[1]
if residuals.size:
    sse = residuals[0]
else:
    # lstsq returns an empty residual array when X is rank-deficient (or not
    # over-determined); compute the squared error directly in that case
    # instead of failing on residuals[0].
    sse = numpy.sum((numpy.dot(X, theta) - numpy.array(Y)) ** 2)
mse = sse / len(dataset)

print(theta0)
print(theta1)
print(mse)

3.6885330408320085
0.0710901901995423
1.523174740453871


  theta, residuals, rank, s = numpy.linalg.lstsq(X, Y)


In [38]:
# Q1 answer: offset, coefficient on the '!' count, and MSE over the whole dataset.
answers['Q1'] = [theta0, theta1, mse]

In [39]:
assertFloatList(answers['Q1'], 3) # Format check: exactly three entries, each convertible to float

In [40]:
### Question 2

In [41]:
def feature(datum):
    """Build the Question 2 feature vector for one review.

    Args:
        datum: review record whose 'review_text' entry is the review text.

    Returns:
        [1, length, c]: the constant (offset) term, the length of the review
        text in characters, and the number of '!' characters in it.
    """
    text = datum['review_text']
    return [1, len(text), text.count('!')]

In [42]:
# Question 2: least-squares fit of
# rating ~ theta0 + theta1 * (review length) + theta2 * (number of '!').
X = [feature(d) for d in dataset]
Y = [d['rating'] for d in dataset]
# Passing rcond explicitly removes the FutureWarning that NumPy < 2.0 emits
# when it is omitted; None selects the documented current default cutoff.
theta, residuals, rank, s = numpy.linalg.lstsq(X, Y, rcond=None)
theta0 = theta[0]
theta1 = theta[1]
theta2 = theta[2]
if residuals.size:
    sse = residuals[0]
else:
    # lstsq returns an empty residual array when X is rank-deficient (or not
    # over-determined); compute the squared error directly in that case
    # instead of failing on residuals[0].
    sse = numpy.sum((numpy.dot(X, theta) - numpy.array(Y)) ** 2)
mse = sse / len(dataset)

print(theta0)
print(theta1)
print(theta2)
print(mse)

3.71751280779718
-4.1215065294880096e-05
0.07527591733232662
1.5214029246165892


  theta, residuals, rank, s = numpy.linalg.lstsq(X, Y)


In [43]:
# Q2 answer: offset, coefficient on review length, coefficient on the '!' count, and MSE.
answers['Q2'] = [theta0, theta1, theta2, mse]

In [44]:
assertFloatList(answers['Q2'], 4) # Format check: exactly four entries, each convertible to float

In [45]:
### Question 3

In [46]:
def feature(datum, deg):
    """Polynomial features of the '!' count, from power 0 up to power `deg`.

    Args:
        datum: review record whose 'review_text' entry is the review text.
        deg: highest power to include.

    Returns:
        [c**0, c**1, ..., c**deg] where c is the number of '!' characters in
        the review text; the leading c**0 == 1 acts as the offset term.
    """
    count = datum['review_text'].count('!')
    powers = []
    term = 1  # c**0; each pass multiplies in one more factor of c
    for _ in range(deg + 1):
        powers.append(term)
        term *= count
    return powers

def polyfit(dataset, deg):
    """Fit polynomials of degree 1..deg and return their training MSEs.

    For each degree i the design matrix is built with feature(d, i) and fitted
    to d['rating'] by least squares.

    Args:
        dataset: sequence of review records providing 'rating' plus whatever
            feature() reads.
        deg: highest polynomial degree to fit (inclusive).

    Returns:
        List with one mean squared error per degree, in order of degree,
        each measured on `dataset` itself.
    """
    # The targets do not depend on the degree, so build them once.
    Y = [d['rating'] for d in dataset]
    mses = []

    for i in range(1, deg + 1):
        X = [feature(d, i) for d in dataset]
        # Passing rcond explicitly removes the FutureWarning that NumPy < 2.0
        # emits when it is omitted; None selects the current default cutoff.
        theta, residuals, rank, s = numpy.linalg.lstsq(X, Y, rcond=None)
        if residuals.size:
            sse = residuals[0]
        else:
            # lstsq returns an empty residual array when X is rank-deficient
            # (or not over-determined); compute the squared error directly
            # instead of failing on residuals[0].
            sse = numpy.sum((numpy.dot(X, theta) - numpy.array(Y)) ** 2)
        mses.append(sse / len(dataset))
        print(mses)  # running list of MSEs, printed after every degree

    return mses

# Fit degrees 1 through 5 on the full dataset and keep the per-degree MSEs.
mses = polyfit(dataset, 5)

  theta, residuals, rank, s = numpy.linalg.lstsq(X, Y)


[1.523174740453871]
[1.523174740453871, 1.5046686106250748]
[1.523174740453871, 1.5046686106250748, 1.4966845515181375]
[1.523174740453871, 1.5046686106250748, 1.4966845515181375, 1.490447730223032]
[1.523174740453871, 1.5046686106250748, 1.4966845515181375, 1.490447730223032, 1.4896106953963144]


In [47]:
# Q3 answer: the MSEs returned by polyfit for degrees 1..5.
answers['Q3'] = mses

In [48]:
assertFloatList(answers['Q3'], 5) # Format check: exactly five entries, each convertible to float

In [49]:
### Question 4

In [50]:
def feature(datum, deg):
    """Return the powers 0..deg of the review's '!' count.

    Args:
        datum: review record whose 'review_text' entry is the review text.
        deg: highest power to include.

    Returns:
        List [c**0, ..., c**deg], c being the number of '!' characters in the
        review text; the first entry is always 1 and serves as the offset.
    """
    bangs = datum['review_text'].count('!')
    return [pow(bangs, exponent) for exponent in range(deg + 1)]

def polyfit(dataset, deg):
    """Fit polynomials of degree 1..deg and return the raw lstsq results.

    For each degree i the design matrix is built with feature(d, i) and fitted
    to d['rating'] by least squares.

    Args:
        dataset: sequence of review records providing 'rating' plus whatever
            feature() reads.
        deg: highest polynomial degree to fit (inclusive).

    Returns:
        List with one dict per degree, in order of degree, holding the four
        values returned by numpy.linalg.lstsq under the keys 'theta',
        'residuals', 'rank' and 's'.
    """
    # The targets do not depend on the degree, so build them once.
    Y = [d['rating'] for d in dataset]
    res = list()

    for i in range(1, deg + 1):
        X = [feature(d, i) for d in dataset]
        # Passing rcond explicitly removes the FutureWarning that NumPy < 2.0
        # emits when it is omitted; None selects the current default cutoff.
        theta, residuals, rank, s = numpy.linalg.lstsq(X, Y, rcond=None)
        res.append({'theta': theta, 'residuals': residuals, 'rank': rank, 's': s})

    return res

def predict(dataset, theta):
    """Evaluate a fitted polynomial model on every record of `dataset`.

    The polynomial degree is inferred from the number of coefficients
    (len(theta) - 1), and each record's features come from feature(d, degree).

    Args:
        dataset: sequence of review records accepted by feature().
        theta: coefficient vector, one entry per power starting at power 0.

    Returns:
        List with one prediction (dot product of features and theta) per record.
    """
    weights = numpy.array(theta).T
    degree = len(theta) - 1
    predictions = []
    for datum in dataset:
        predictions.append(numpy.dot(feature(datum, degree), weights))
    return predictions

def getMSE(pred, real):
    """Mean squared error between predictions and true values.

    Args:
        pred: sequence of predicted values.
        real: sequence of true values, aligned with `pred`.

    Returns:
        The mean of the element-wise squared differences.
    """
    errors = numpy.array(pred) - numpy.array(real)
    return numpy.square(errors).mean()

# Split by position: the first half of the records trains the models and the
# second half is held out for testing.
totalsize = len(dataset)
halfway = int(totalsize / 2)
trainset = dataset[:halfway]
testset = dataset[halfway:]

# One fit per polynomial degree 1..5, using the training half only.
trainres = polyfit(trainset, 5)
realY = [d['rating'] for d in testset]

# Score every fitted model on the held-out half.
mses = [getMSE(predict(testset, fit['theta']), realY) for fit in trainres]

print(mses)

  theta, residuals, rank, s = numpy.linalg.lstsq(X, Y)


[1.5248743859866292, 1.4977199259322453, 1.4856632190311185, 1.4767337440077424, 1.4809577272589876]


In [51]:
# Q4 answer: test-half MSEs of the degree 1..5 models fitted on the training half.
answers['Q4'] = mses

In [52]:
assertFloatList(answers['Q4'], 5) # Format check: exactly five entries, each convertible to float

In [53]:
### Question 5

In [54]:
# Question 5: constant predictor. theta0 is the median training rating, and
# the error reported is the mean absolute error on the test half.
trainY = [d['rating'] for d in trainset]
testY = [d['rating'] for d in testset]

theta0 = numpy.median(trainY)
print(theta0)

diff = numpy.abs(numpy.array(testY) - theta0)
mae = numpy.mean(diff)
print(mae)

4.0
0.907


In [55]:
# Q5 answer: mean absolute error on the test half of the median-rating predictor.
answers['Q5'] = mae

In [56]:
assertFloat(answers['Q5']) # Format check: the answer must be convertible to float

In [57]:
### Question 6

In [58]:
# Load the beer reviews, keeping only lines that mention a 'user/gender' field.
# This rebinds `dataset`, replacing the book reviews loaded earlier.
# The context manager closes the file handle even if a line fails to parse
# (the handle was previously left open for the life of the kernel).
with open("beer_50000.json") as f:
    dataset = []
    for l in f:
        if 'user/gender' in l:
            # NOTE(security): eval() executes each line as Python code, so this
            # is only safe for a trusted data file. If every line is a plain
            # literal, ast.literal_eval would be a safer parser — TODO confirm.
            dataset.append(eval(l))

In [59]:
# NOTE(review): this bare expression is not the last statement of the cell,
# so its value is computed but never displayed.
len(dataset)
# Show the first retained beer review to inspect its fields.
print(dataset[0])

{'review/appearance': 4.0, 'beer/style': 'American Double / Imperial IPA', 'review/palate': 4.0, 'review/taste': 4.5, 'beer/name': 'Cauldron DIPA', 'review/timeUnix': 1293735206, 'user/gender': 'Male', 'user/birthdayRaw': 'Jun 16, 1901', 'beer/ABV': 7.7, 'beer/beerId': '64883', 'user/birthdayUnix': -2163081600, 'beer/brewerId': '1075', 'review/timeStruct': {'isdst': 0, 'mday': 30, 'hour': 18, 'min': 53, 'sec': 26, 'mon': 12, 'year': 2010, 'yday': 364, 'wday': 3}, 'user/ageInSeconds': 3581417047, 'review/overall': 4.0, 'review/text': "According to the website, the style for the Caldera Cauldron changes every year. The current release is a DIPA, which frankly is the only cauldron I'm familiar with (it was an IPA/DIPA the last time I ordered a cauldron at the horsebrass several years back). In any event... at the Horse Brass yesterday.\t\tThe beer pours an orange copper color with good head retention and lacing. The nose is all hoppy IPA goodness, showcasing a huge aroma of dry citrus, pi

In [60]:
def feature(datum):
    """Build the Question 6 feature vector for one beer review.

    Args:
        datum: review record whose 'review/text' entry is the review text.

    Returns:
        [1, c] where 1 is the constant (offset) term and c is the number of
        '!' characters in the review text.
    """
    exclamations = datum['review/text'].count('!')
    return [1, exclamations]

# Keep only reviews whose gender is exactly 'Male' or 'Female', then build the
# features and the binary labels (Male -> 0, Female -> 1) from that one subset.
labelled = [d for d in dataset if d['user/gender'] in ('Male', 'Female')]
X = [feature(d) for d in labelled]
y = [0 if d['user/gender'] == 'Male' else 1 for d in labelled]

In [61]:
# Sanity check: number of loaded records, feature rows and labels.
for collection in (dataset, X, y):
    print(len(collection))

# Unweighted logistic regression; fit() returns the estimator itself.
mod = linear_model.LogisticRegression(C=1.0).fit(X, y)

# Predictions on the same data the model was trained on.
pred = mod.predict(X)


20403
20403
20403


In [62]:
# Confusion-matrix counts of `pred` against `y`, with label 1 as the positive class.
pred_neg = numpy.logical_not(pred)
y_neg = numpy.logical_not(y)

TP = numpy.sum(numpy.logical_and(pred, y))
TN = numpy.sum(numpy.logical_and(pred_neg, y_neg))
FP = numpy.sum(numpy.logical_and(pred, y_neg))
FN = numpy.sum(numpy.logical_and(pred_neg, y))

for count in (TP, TN, FP, FN):
    print(count)

# Balanced error rate: one minus the average of the true positive rate and
# the true negative rate.
true_positive_rate = TP / (TP + FN)
true_negative_rate = TN / (TN + FP)
BER = 1 - 0.5 * (true_positive_rate + true_negative_rate)
print(BER)

0
20095
0
308
0.5


In [63]:
# Q6 answer: confusion-matrix counts and balanced error rate of the unweighted classifier.
answers['Q6'] = [TP, TN, FP, FN, BER]

In [64]:
assertFloatList(answers['Q6'], 5) # Format check: exactly five entries, each convertible to float

In [65]:
### Question 7

In [66]:

# Same model as before, but with class_weight='balanced'; fit() returns the
# estimator itself, so construction and fitting can be chained.
mod = linear_model.LogisticRegression(C=1.0, class_weight='balanced').fit(X, y)

# Predictions on the same data the model was trained on.
pred = mod.predict(X)

In [67]:
# Confusion-matrix counts for the balanced classifier, label 1 being positive.
predicted = numpy.asarray(pred).astype(bool)
actual = numpy.asarray(y).astype(bool)

TP = (predicted & actual).sum()
TN = (~predicted & ~actual).sum()
FP = (predicted & ~actual).sum()
FN = (~predicted & actual).sum()

print(TP)
print(TN)
print(FP)
print(FN)

# Balanced error rate: one minus the average of the per-class recalls.
recall_positive = TP / (TP + FN)
recall_negative = TN / (TN + FP)
BER = 1 - 0.5 * (recall_positive + recall_negative)
print(BER)

88
16332
3763
220
0.4507731134255145


In [68]:
# Q7 answer: confusion-matrix counts and balanced error rate of the class-balanced classifier.
answers["Q7"] = [TP, TN, FP, FN, BER]

In [69]:
assertFloatList(answers['Q7'], 5) # Format check: exactly five entries, each convertible to float

In [70]:
### Question 8

In [71]:
# Rank every example by the classifier's decision score, highest first.
# NOTE(review): whole (score, label) tuples are compared, so examples with an
# equal score are ordered by their true label (1 before 0). Confirm that this
# tie-breaking is intended before relying on precision at large K.
scores = mod.decision_function(X)
scoreslabels = sorted(zip(scores, y), reverse=True)
sortedlabels = [label for _, label in scoreslabels]

# precision@K = share of positive labels among the K highest-scored examples.
# The divisor is K itself, even if fewer than K examples exist.
precisionList = [sum(sortedlabels[:k]) / k for k in (1, 10, 100, 1000, 10000)]

print(precisionList)

[0.0, 0.0, 0.03, 0.033, 0.0308]


In [72]:
# Q8 answer: precision@K for K = 1, 10, 100, 1000 and 10000.
answers['Q8'] = precisionList

In [73]:
assertFloatList(answers['Q8'], 5) # Format check: exactly five entries, each convertible to float

In [74]:
def _to_builtin(value):
    """Recursively replace NumPy scalars/arrays with plain Python objects."""
    if isinstance(value, (list, tuple)):
        return type(value)(_to_builtin(v) for v in value)
    if isinstance(value, (numpy.generic, numpy.ndarray)):
        return value.tolist()
    return value


# Write the answers to a file as the text form of a plain dictionary.
# NumPy >= 2.0 prints scalars inside containers as e.g. np.float64(1.5), which
# would end up in the file; converting to builtins first keeps bare numbers.
# The context manager closes the file even if the write fails.
with open("answers_hw1.txt", 'w') as f:
    f.write(str({question: _to_builtin(answer) for question, answer in answers.items()}) + '\n')