In [1]:
import numpy
import urllib
import scipy.optimize
import random
from sklearn import linear_model
import gzip
from collections import defaultdict

In [2]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides any warnings the later model fits might emit.
warnings.filterwarnings("ignore")

In [3]:
def assertFloat(x):
    """Assert that `x` can be converted to a float.

    A failed conversion propagates as whatever exception ``float(x)`` raises.
    """
    converted = float(x)
    assert type(converted) == float

def assertFloatList(items, N):
    """Assert that `items` has exactly `N` entries, each convertible to float."""
    assert len(items) == N
    converted_types = []
    for entry in items:
        converted_types.append(type(float(entry)))
    assert converted_types == [float] * N

In [4]:
# Open the ARFF data file in text mode; the handle `f` is read by the parsing cell that follows.
f = open("5year.arff", 'r')

In [5]:
# Read and parse the data
#
# The header is skipped by iterating over the file instead of polling
# readline(): readline() returns '' forever at end-of-file, so a missing
# '@data' marker (or re-running this cell on an exhausted handle) used to
# spin forever. The handle is closed once parsing finishes or fails.
try:
    for header_line in f:
        if '@data' in header_line:
            break
    else:
        raise ValueError("'@data' marker not found in ARFF file")

    dataset = []
    for line in f:
        if '?' in line: # Missing entry
            continue
        if not line.strip(): # Blank line: nothing to parse
            continue
        fields = line.split(',')
        # Prepend a constant 1 (presumably the intercept feature) to the parsed columns
        values = [1] + [float(x) for x in fields]
        values[-1] = values[-1] > 0 # Convert to bool
        dataset.append(values)
finally:
    f.close()

In [6]:
# Feature vectors: every column except the last (column 0 is the constant 1 added during parsing)
X = [d[:-1] for d in dataset]
# Labels: the last column, already converted to bool during parsing
y = [d[-1] for d in dataset]

In [7]:
answers = {} # Your answers

In [8]:
def total_tp(predictions, y):
    """Count true positives: positions where prediction and label are both truthy."""
    count = 0
    for predicted, actual in zip(predictions, y):
        count = count + (predicted and actual)
    return count

In [9]:
def total_fp(predictions, y):
    """Count false positives: truthy predictions whose label is falsy."""
    count = 0
    for predicted, actual in zip(predictions, y):
        count = count + (predicted and not actual)
    return count

In [10]:
def total_tn(predictions, y):
    """Count true negatives: positions where prediction and label are both falsy."""
    count = 0
    for predicted, actual in zip(predictions, y):
        # De Morgan form of `not predicted and not actual`
        count = count + (not (predicted or actual))
    return count

In [11]:
def total_fn(predictions, y):
    """Count false negatives: falsy predictions whose label is truthy."""
    count = 0
    for predicted, actual in zip(predictions, y):
        count = count + (not predicted and actual)
    return count

In [12]:
def get_tpr(TP, FN):
    """Return the true positive rate, TP / (TP + FN)."""
    actual_positives = TP + FN
    return TP / actual_positives

In [13]:
def get_tnr(TN, FP):
    """Return the true negative rate, TN / (TN + FP)."""
    actual_negatives = TN + FP
    return TN / actual_negatives

In [14]:
def accuracy(predictions, y):
    """Return the fraction of predictions that equal their labels.

    Args:
        predictions: sequence (list or numpy array) of predicted labels.
        y: sequence of true labels, the same length as `predictions`.

    Returns:
        The number of element-wise matches divided by ``len(y)``.

    Raises:
        ValueError: if the two sequences differ in length.
        ZeroDivisionError: if `y` is empty.
    """
    if len(predictions) != len(y):
        raise ValueError("predictions and y must have the same length")
    # Compare element by element rather than `predictions == y`: for two plain
    # lists `==` yields a single bool, which sum() cannot iterate over.
    matches = sum(p == q for p, q in zip(predictions, y))
    return matches / len(y)

In [15]:
def BER(predictions, y):
    """Balanced error rate: one minus the mean of the true positive and true negative rates."""
    # Rate of correctly predicted positives
    sensitivity = get_tpr(total_tp(predictions, y), total_fn(predictions, y))

    # Rate of correctly predicted negatives
    specificity = get_tnr(total_tn(predictions, y), total_fp(predictions, y))

    return 1 - 0.5 * (sensitivity + specificity)

In [16]:
### Question 1

In [17]:
# Logistic regression with C=1 (sklearn's inverse regularization strength), fit on the full dataset
mod = linear_model.LogisticRegression(C=1)
mod.fit(X,y)

# Predictions on the same rows the model was fit on (no held-out data in this question)
pred = mod.predict(X)

In [19]:
acc1 = accuracy(pred, y)
ber1 = BER(pred, y)

In [20]:
print(acc1)
print(ber1)

0.9656878917848895
0.4766851431593464


In [21]:
answers['Q1'] = [acc1, ber1] # Accuracy and balanced error rate

In [22]:
assertFloatList(answers['Q1'], 2)

In [23]:
### Question 2

In [24]:
# Same model as Question 1, but class_weight='balanced' asks sklearn to weight
# classes inversely to their frequency in y
mod = linear_model.LogisticRegression(C=1, class_weight='balanced')
mod.fit(X,y)

# Predictions on the same rows the model was fit on
pred = mod.predict(X)

In [25]:
acc2 = accuracy(pred, y)
ber2 = BER(pred, y)

In [26]:
print(acc2)
print(ber2)

0.6951501154734411
0.304401890493309


In [27]:
answers['Q2'] = [acc2, ber2]

In [28]:
assertFloatList(answers['Q2'], 2)

In [29]:
### Question 3

In [30]:
# Seed the RNG so the shuffle is repeatable, then shuffle `dataset` in place.
# NOTE(review): the shuffle mutates `dataset`, so re-running this cell without
# re-running the parsing cell produces a different ordering than a fresh run.
random.seed(3)
random.shuffle(dataset)

In [31]:
# Rebuild features and labels so they follow the shuffled row order
X = [d[:-1] for d in dataset]
y = [d[-1] for d in dataset]

In [32]:
# Positional split of the shuffled data: first half = train, next quarter = validation, last quarter = test.
# The label slices use len(X) for their boundaries, so they line up with the feature slices.
Xtrain, Xvalid, Xtest = X[:len(X)//2], X[len(X)//2:(3*len(X))//4], X[(3*len(X))//4:]
ytrain, yvalid, ytest = y[:len(X)//2], y[len(X)//2:(3*len(X))//4], y[(3*len(X))//4:]

In [33]:
len(Xtrain), len(Xvalid), len(Xtest)

(1515, 758, 758)

In [34]:
# Class-balanced logistic regression fit on the training split only
mod = linear_model.LogisticRegression(C=1, class_weight='balanced')
mod.fit(Xtrain,ytrain)

# Balanced error rate on the data the model was fit on
predTrain = mod.predict(Xtrain)
berTrain = BER(predTrain, ytrain)

# Balanced error rate on the validation split
predValid = mod.predict(Xvalid)
berValid = BER(predValid, yvalid)

# Balanced error rate on the test split
predTest = mod.predict(Xtest)
berTest = BER(predTest, ytest)

In [35]:
print(berTrain)
print(berValid)
print(berTest)

0.29287226079549855
0.3159203980099503
0.2585616438356164


In [36]:
answers['Q3'] = [berTrain, berValid, berTest]

In [37]:
assertFloatList(answers['Q3'], 3)

In [38]:
### Question 4

In [41]:
# Candidate C values (sklearn's inverse regularization strength), one per decade from 1e-4 to 1e4
C = [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]
# Validation BER for each candidate, in the same order as C
berList = list()

# Note: this loop rebinds `mod`, `predValid` and `berValid` from the Question 3 cell
for c in C:
    mod = linear_model.LogisticRegression(penalty='l2', C=c, class_weight='balanced')
    mod.fit(Xtrain, ytrain)
    
    predValid = mod.predict(Xvalid)
    berValid = BER(predValid, yvalid)
    berList.append(berValid)    

In [42]:
print(berList)

[0.3288104929895974, 0.31931252826775225, 0.3281320669380371, 0.3179556761646314, 0.3159203980099503, 0.3111714156490276, 0.2955030044582283, 0.29618143050978873, 0.29618143050978873]


In [43]:
answers['Q4'] = berList

In [44]:
assertFloatList(answers['Q4'], 9)

In [45]:
### Question 5

In [52]:
# Lowest validation BER among the candidate C values
ber5 = min(berList)
# Indices of every candidate that attains that minimum (there may be ties)
bestCList = [i for i, val in enumerate(berList) if val == ber5]
print(bestCList)
# On a tie, take the first index, i.e. the earliest entry in C
bestC = C[bestCList[0]]
print(bestC)
print(ber5)


[6]
100.0
0.2955030044582283


In [50]:
answers['Q5'] = [bestC, ber5]

In [51]:
assertFloatList(answers['Q5'], 2)

In [53]:
### Question 6

In [54]:
# Load the review records: each line of the gzip file is evaluated into one record.
# NOTE(review): eval() executes arbitrary code, so only run this on a trusted
# data file. It is left in place because the exact line format is not
# verifiable from this notebook alone.
dataset = []
with gzip.open("young_adult_10000.json.gz") as f:  # closes the handle even on error
    for l in f:
        dataset.append(eval(l))

In [55]:
# First 9000 records are used for training; everything after is held out for testing
dataTrain = dataset[:9000]
dataTest = dataset[9000:]

In [61]:
print(dataTrain[0])

{'user_id': '8842281e1d1347389f2ab93d60773d4d', 'book_id': '2767052', 'review_id': '248c011811e945eca861b5c31a549291', 'rating': 5, 'review_text': "I cracked and finally picked this up. Very enjoyable quick read - couldn't put it down - it was like crack. \n I'm a bit bothered by the lack of backstory of how Panem and the Hunger Games come about. It is just kind of explained away in a few paragraphs and we are left to accept this very strange world where teenagers are pitted into an arena each year to kill each other? I was expecting it because I've seen Battle Royale, but I would have appreciated knowing more of the backstory of how the world could have come into such a odd state. \n I suppose what makes a book like this interesting is thinking about the strategy of it all. The players are going to be statistically encouraged to band together because they will last longer that way, but by definition of course any partnership will be broken, and the drama of how that unfolds is always 

In [70]:
# Some data structures you might want
# All of them are built from the training records only.

usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
reviewsPerUser = defaultdict(list) # Maps a user to the full review records they wrote
reviewsPerItem = defaultdict(list) # Maps an item to the full review records written about it
ratingDict = {} # To retrieve a rating for a specific user/item pair

for d in dataTrain:
    user, item, review = d['user_id'], d['book_id'], d
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    
    reviewsPerUser[user].append(review)
    reviewsPerItem[item].append(review)
    
    # If the same (user, item) pair occurs more than once, the last rating wins
    ratingDict[(user, item)] = d['rating']
    

In [60]:
def Jaccard(s1, s2):
    """Jaccard similarity of two sets: size of the intersection over size of the union.

    Returns 0 when the union is empty (both sets empty).
    """
    union_size = len(s1.union(s2))
    if union_size == 0:
        return 0
    return len(s1.intersection(s2)) / union_size

In [66]:
def mostSimilar(i, N):
    """Return the items most similar to item `i`, by Jaccard similarity of their user sets.

    Args:
        i: id of the query item.
        N: maximum number of results to return.

    Returns:
        A list of at most N ``(similarity, item_id)`` tuples in descending
        order; ties in similarity are ordered by descending item id.
    """
    # Use .get() rather than indexing: usersPerItem is a defaultdict, so
    # indexing an unknown item would silently insert an empty entry into it.
    users = usersPerItem.get(i, set())

    sims = [(Jaccard(users, usersPerItem[j]), j) for j in usersPerItem if j != i]
    sims.sort(reverse=True)

    return sims[:N]

In [67]:
print(mostSimilar('2767052', 10))

[(0.4125, '6148028'), (0.3411764705882353, '7260188'), (0.1590909090909091, '256683'), (0.1375, '1162543'), (0.11494252873563218, '11735983'), (0.10989010989010989, '13335037'), (0.10810810810810811, '28187'), (0.10666666666666667, '428263'), (0.09876543209876543, '49041'), (0.09782608695652174, '41865')]


In [68]:
answers['Q6'] = mostSimilar('2767052', 10)

In [69]:
assert len(answers['Q6']) == 10
assertFloatList([x[0] for x in answers['Q6']], 10)

In [None]:
### Question 7

In [75]:
# Mean training rating per user and per item
userRatingMean = dict()
itemRatingMean = dict()

for u in itemsPerUser:
    ratings = [ratingDict[(u, i)] for i in itemsPerUser[u]]
    # Skip entries with no ratings so the mean never divides by zero
    if len(ratings) == 0:
        continue
    else:
        userRatingMean[u] = sum(ratings) / len(ratings)

for i in usersPerItem:
    ratings = [ratingDict[(u, i)] for u in usersPerItem[i]]
    # An item entry can be empty if a defaultdict lookup created it without adding any users
    if len(ratings) == 0:
        continue
    else:
        itemRatingMean[i] = sum(ratings) / len(ratings)

In [77]:
# Mean rating over all training records; predictRating falls back to this value
globalRatingMean = sum([d['rating'] for d in dataTrain]) / len(dataTrain)
print(globalRatingMean)

3.742888888888889


In [82]:
def predictRating(user, item):
    """Predict `user`'s rating of `item` from item-to-item Jaccard similarity.

    The prediction is the item's mean training rating plus a similarity-weighted
    average of the user's deviations (rating minus item mean) on the other
    items they reviewed in the training data.

    Args:
        user: user id.
        item: item (book) id.

    Returns:
        The predicted rating, or `globalRatingMean` when none of the user's
        other reviewed items has positive similarity to `item`.
    """
    # .get() avoids the defaultdict side effect of inserting empty entries for
    # users or items that never appeared in the training data.
    itemUsers = usersPerItem.get(item, set())  # loop-invariant, so looked up once

    ratings = []
    sims = []
    for d in reviewsPerUser.get(user, []):
        j = d['book_id']
        if item == j:
            continue
        ratings.append(d['rating'] - itemRatingMean[j])
        sims.append(Jaccard(itemUsers, usersPerItem[j]))

    simsSum = sum(sims)
    if simsSum > 0:
        weighted = sum(r * s for r, s in zip(ratings, sims))
        return itemRatingMean[item] + weighted / simsSum
    return globalRatingMean

In [83]:
def MSE(preds, labels):
    """Mean squared error over the paired entries of `preds` and `labels`."""
    squared_errors = []
    for predicted, actual in zip(preds, labels):
        squared_errors.append((predicted - actual) ** 2)
    return sum(squared_errors) / len(squared_errors)

In [84]:
# Predict a rating for every held-out (user, book) pair
preds = [predictRating(d['user_id'], d['book_id']) for d in dataTest]

In [86]:
# True ratings of the held-out records, in the same order as `preds`
labels = [d['rating'] for d in dataTest]

In [87]:
mse7 = MSE(preds, labels)

In [88]:
print(mse7)

1.2469091498159586


In [89]:
answers['Q7'] = mse7

In [None]:
assertFloat(answers['Q7'])

In [None]:
### Question 8

In [None]:
# NOTE(review): `mse8` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh Restart & Run All. The Question 8 computation
# appears to be missing. TODO: compute mse8 before storing it.
answers['Q8'] = mse8

In [None]:
assertFloat(answers['Q8'])

In [None]:
# Write the collected answers to disk; the context manager closes the file
# even if the write raises.
with open("answers_hw2.txt", 'w') as f:
    f.write(str(answers) + '\n')