In [2]:
import random
import os

In [3]:
import gzip
from collections import defaultdict

def readJSON(path):
    for l in gzip.open(path, 'rt'):
        d = eval(l)
        u = d['userID']
        try:
            g = d['gameID']
        except Exception as e:
            g = None
        yield u,g,d

In [4]:
### Would-play baseline: just rank which games are popular and which are not, and return '1' if a game is among the top-ranked

def baseline(ugPairs_train,threshold=.5):
    gameCount = defaultdict(int)
    totalPlayed = 0
    
    for user,game in ugPairs_train:
        gameCount[game] += 1
        totalPlayed += 1
        
    mostPopular = [(gameCount[x], x) for x in gameCount]
    mostPopular.sort()
    mostPopular.reverse() 
    
    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalPlayed*threshold: break
            
    def predict(ugPairs_test):
        pred = []
        for u,g in ugPairs_test:
            pred.append(1 if g in return1 else 0)
        return pred
    return predict

In [5]:
def writeTestPredictions(predict):
    predFile = open("predictions_Played.txt", 'w')
    os.remove("predictions_Played.txt")
    predFile = open("predictions_Played.txt", 'w')
    for l in open("pairs_Played.txt"):
        if l.startswith("userID"):
            #header
            predFile.write(l)
            continue
        u,g = l.strip().split('-')
        predFile.write(u+'-'+g+","+str(predict([(u,g)])[0])+"\n")
    predFile.close()

In [6]:
# Play Prediction
print("\n Play Prediction \n")



 Play Prediction 



In [7]:
# split the training data (‘train.json.gz’) as follows:
# (1) Reviews 1-165,000 for training
# (2) Reviews 165,001-175,000 for validation

In [8]:
ugPairs = []
timePlayed = []
    
for u,g,d in readJSON("train.json.gz"):
    ugPairs.append((u,g))
    timePlayed.append(d["hours_transformed"])

In [9]:
# test baseline
# writeTestPredictions(baseline(ugPairs))

In [10]:
ugPairs_training = ugPairs[0:165000]
hasPlayed_training = [1] * len(ugPairs_training)
timePlayed_training = timePlayed[0:165000]

ugPairs_validation = ugPairs[165000:175000]
hasPlayed_validation = [1] * len(ugPairs_validation)
timePlayed_validation = timePlayed[165000:175000]

In [11]:
#For each entry (user,game) in the validation set, 
# sample a negative entry by randomly choosing a game that user hasn’t played.

In [12]:
gamesByUser = defaultdict(set)
usersByGame = defaultdict(set)
gameCount = defaultdict(int)
totalPlayed = 0
timePlayedByUserGame = defaultdict(lambda:0)

for i,(user,game) in enumerate(ugPairs_training):
    gamesByUser[user].add(game)
    usersByGame[game].add(user)
    gameCount[game] += 1
    totalPlayed += 1
    timePlayedByUserGame[(user,game)] = timePlayed[i]
    
mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse() 

In [13]:
games = list(usersByGame.keys())
newValidationEntries = []
newHasPlayedEntries = []

for u,g in ugPairs_validation:
    randomGame = random.choice(games)
    while u in usersByGame[randomGame]:
        randomGame = random.choice(games)
    newValidationEntries.append([u,randomGame])
    newHasPlayedEntries.append(0)

ugPairs_validation.extend(newValidationEntries)
hasPlayed_validation.extend(newHasPlayedEntries)

In [14]:
def accuracy(pred,y):
    return sum([p==y[i] for i,p in enumerate(pred)])/len(pred)

In [15]:
# 1
print("\n Question 1 \n")
# accuracy of the baseline model on the validation set you have built (1 mark).
predictions = baseline(ugPairs_training)(ugPairs_validation)
print("Accuracy of baseline with 50 percentile performance:")
print(accuracy(predictions,hasPlayed_validation))


 Question 1 

Accuracy of baseline with 50 percentile performance:
0.6819


In [22]:
#2
print("\n Question 2 \n")

#baseline - using a threshold of the 50th percentile of popularity (totalPlayed/2).
#find a better threshold and report its performance on your validation set

predictions = baseline(ugPairs_training,.65)(ugPairs_validation) 
print("Accuracy with threshold of .65") 
print(accuracy(predictions,hasPlayed_validation))


 Question 2 

Accuracy with threshold of .65
0.70255


In [35]:
# 3


# Given a pair (u, g) in the validation set, consider all training items g′ that user u has played. 
# For each, compute the Jaccard similarity between g and g′, i.e., users (in the training set) who have played g and users who have played g′. 

# Predict as ‘played’ if the maximum of these Jaccard similarities exceeds a threshold
# Report the performance on your validation set
def JaccardSim(g1, g2):
    n = len(g1.intersection(g2))
    d = len(g1.union(g2))
    return n / d

def Q3predict(ugPairs_test, threshold = .031):
    pred = []
    for u,g in ugPairs_test:
        sims = []
        for g2 in gamesByUser[u]:
            if g2 == g: continue
            sims.append(JaccardSim(usersByGame[g], usersByGame[g2]))
        maximum = max(sims) if len(sims)>0 else None
        pred.append(1 if maximum != None and maximum>threshold else 0)
    return pred


print("\n Question 3 \n")

predictions = Q3predict(ugPairs_validation)
print("Accuracy with sim threshold of .16:")
print(accuracy(predictions,hasPlayed_validation))




 Question 3 

Accuracy with sim threshold of .16:
0.67315


In [38]:
# 4 

# Improve using both a Jaccard-based threshold and a popularity based threshold.
# Report the performance on your validation set


def Q4predict(ugPairs_test):
    popPredict = baseline(ugPairs_training,.70)(ugPairs_test) 
    simPredict = Q3predict(ugPairs_test,.021)
    predict = []
    for i in range(len(popPredict)):
        predict.append(1 if popPredict[i]==1 and simPredict[i]==1 else 0)
    return predict
        

print("\n Question 4 \n")

predictions = Q4predict(ugPairs_validation)
print("Accuracy with a similarity threshold of .7 and a popularity threshold of .021:")
print(accuracy(predictions,hasPlayed_validation))


 Question 4 

Accuracy with a similarity threshold of .7 and a popularity threshold of .021:
0.703


In [39]:
# 5 

# run our model on the test set, ‘pairs Played.txt’, run #4 model and upload your solution to Kaggle.
writeTestPredictions(Q4predict)

In [None]:
#
#
#
#
# Insert Kaggle Performance Here
#
#
#

In [40]:
################################################
######### Time Played Prediction ###############
################################################

In [49]:
# Time Played Prediction
print("\n Time Played Prediction \n")

# use part of the data for validation
# the time transformed field, which is computed as log2(time played + 1).
# This is the quantity we are trying to predict.




 Time Played Prediction 



In [50]:
# 9
print("\n Question 9 \n")

# Fit a predictor of the form: time(user,item) = mean + BiasTerm_User + BiasTerm_Item
# ... as described in the lecture notes.
# Use a regularization parameter of λ = 1.

# Report the MSE on the validation set

timePlayedByUserGame_Validation = defaultdict(lambda:0)
ugPairs_validation_time = ugPairs_validation[:1000]

for i,(user,game) in enumerate(ugPairs_validation_time):
    timePlayedByUserGame_Validation[(user,game)] = timePlayed_validation[i]



 Question 9 



In [51]:
#random initialization
alpha={}
alpha[1] = 1.0*sum(timePlayed_validation)/len(timePlayed_validation)
BiasUser = defaultdict(lambda:0)
BiasGame = defaultdict(lambda:0)


def updateAlpha(l):
    alpha[1] = sum([timePlayedByUserGame[(u,g)] - BiasUser[u]-BiasGame[g] for u,g in ugPairs_training])/len(ugPairs_training)
    
def updateBiasUser(u,l):
    BiasUser[u] = sum([timePlayedByUserGame[(u,g)] - alpha[1]-BiasGame[g] for g in gamesByUser[u]]) / (l + len(gamesByUser[u]))
    
def updateBiasGame(g,l):
    BiasGame[g] = sum([timePlayedByUserGame[(u,g)] - alpha[1]-BiasUser[u] for u in usersByGame[g]]) / (l + len(usersByGame[g]))
    
def printObjectiveFunc(l):
    print(sum([(alpha[1]+BiasUser[u]+BiasGame[i]-timePlayedByUserGame[(u,g)])**2 for u,g in ugPairs_training])+l*(sum([BiasUser[u]**2 for u in BiasUser])+sum([BiasGame[g2]**2 for g2 in BiasGame])**2))

def update(l,i):
    updateAlpha(l)
    [updateBiasUser(u,l) for u in gamesByUser]
    [updateBiasGame(g,l) for g in usersByGame]
    if i % 100 ==0:
        printObjectiveFunc(l)
    

for i in range(500):
    update(1,i)

10836021.48241849
13271921.94879321
12982270.950644014
12977661.500099069
12980167.140861055


In [52]:
MSE = sum([(timePlayedByUserGame_Validation[(u,g)]-(alpha[1]+BiasUser[u]+BiasGame[i]))**2 for u,g in ugPairs_validation_time])/len(ugPairs_validation_time)
print("MSE:")
print(MSE)

MSE:
5.596353081460636


In [53]:
# 10
print("\n Question 10 \n")
import operator
# Report the user and game IDs that have the largest and smallest values of β
print("The user with the max value of the Bias is:")
print(max(BiasUser.items(), key=operator.itemgetter(1))[0])
print("The game with the max value of the Bias is:")
print(max(BiasUser.items(), key=operator.itemgetter(1))[0])
print("The user with the min value of the Bias is:")
print(min(BiasGame.items(), key=operator.itemgetter(1))[0])
print("The game with the min value of the Bias is:")
print(min(BiasGame.items(), key=operator.itemgetter(1))[0])



 Question 10 

The user with the max value of the Bias is:
u38845867
The game with the max value of the Bias is:
u38845867
The user with the min value of the Bias is:
b05546112
The game with the min value of the Bias is:
b05546112


In [54]:
# 11
print("\n Question 11 \n")

# Find a better value of λ using your validation set.

#Report the value you chose, its MSE, and upload your solution to Kaggle by running it on the test data


 Question 11 



In [64]:
l = .5

#random initialization
alpha={}
alpha[1] = 1.0*sum(timePlayed_validation)/len(timePlayed_validation)
BiasUser = defaultdict(lambda:0)
BiasGame = defaultdict(lambda:0)

for i in range(500):
    update(l,i)
    
print("using a lambda of "+str(l))
MSE = sum([(timePlayedByUserGame_Validation[(u,g)]-(alpha[1]+BiasUser[u]+BiasGame[i]))**2 for u,g in ugPairs_validation_time])/len(ugPairs_validation_time)
print("MSE:")
print(MSE)

6044215.035878684
8045924.964042197
7435117.836161754
7307944.148052311
7284373.81738448
using a lambda of 0.5
MSE:
5.588154240117794


In [67]:
def predictHours(u,g):
    return alpha[1] + BiasUser[u] + BiasGame[g]

In [68]:
predFile = open("predictions_Hours.txt", 'w')
os.remove("predictions_Hours.txt")
predFile = open("predictions_Hours.txt", 'w')
for l in open("pairs_Hours.txt"):
    if l.startswith("userID"):
        #header
        predFile.write(l)
        continue
    u,g = l.strip().split('-')
    predFile.write(u+'-'+g+","+str(predictHours(u,g))+"\n")
predFile.close()