In [3]:
import pandas as pd
import numpy as np
import random
import yaml
import gzip
import string
from collections import defaultdict
from IPython.display import clear_output
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import scale
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.feature_selection import chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
import seaborn as sns

In [4]:
data_folder = "assignment1/"

In [5]:
#Load Data
f = gzip.open(data_folder+"train.json.gz", 'r+')
d = []
for line in f:
    value = eval(line)
    d.append(value)
data=pd.DataFrame(d)

# Tasks-Play Prediction

In [6]:
#Train-validation split
data = data.sample(frac=1).reset_index(drop=True)
train = data.iloc[:165000]
validation = data.iloc[165000:175000]

In [7]:
#Create dictionary set of user/game pair indicating which games has each player played or not played
played={}
not_played={}
user_id = train.userID.unique()
game_id = train.gameID.unique()

for user in user_id:
    played[user] = []
    not_played[user] = []

for x in train.itertuples():
    played[x.userID].append(x.gameID)

for user in user_id:
    not_played[user]= set(game_id) - set(played[user])

In [8]:
#Create prediction label for validation set
validation['played'] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation['played'] = True


In [9]:
#Create negative samples
neg=[]
for x in validation.itertuples():
    neg.append(random.sample(not_played[x.userID],1)[0])
    
validation_neg = pd.DataFrame()
validation_neg['userID'] = validation.userID
validation_neg['gameID'] = neg
validation_neg['played'] = False

validation = pd.concat([validation, validation_neg])

In [10]:
#Create dictionary set of user/game pair indicating which users played each game
game_user = {}
for g in game_id:
    game_user[g] = []

for x in train.itertuples():
    game_user[x.gameID].append(x.userID)

## First let's try using Jaccard Similarity

In [11]:
#Define Jaccard similarity
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer/denom

In [12]:
#Define algorithm using the Jaccard similarity
def jaccard_pred(u, g):
    if u not in played.keys():
        return 0
    g1 = played[u]
    if g in g1:
        g1.remove(g)
    s1 = set(game_user[g])
    jmax = 0
    for gi in g1:
        s2 = set(game_user[gi])
        j = Jaccard(s1, s2)
        if j > jmax:
            jmax = j
    return jmax

In [None]:
accuracy = {}

gameCount = defaultdict(int)
totalPlayed = 0

for x in train.itertuples():
    gameCount[x.gameID] += 1
    totalPlayed += 1

mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

counter = 0
for t1 in np.linspace(1,3,5):
    return1 = set()
    count = 0
    
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalPlayed/t1: break
    
    for t2 in np.linspace(0.01,0.05,5):
        clear_output(wait=True)
        pred = []
        for x in validation.itertuples():
            if (jaccard_pred(x.userID, x.gameID) > t2) and (x.gameID in return1):
                pred.append(True)
            else:
                pred.append(False)
        validation['pred'] = pred
        accuracy[(t1,t2)] = (validation.pred == validation.played).sum()/len(validation.played)
        print(np.round((counter/25)*100, 2), "%")
        counter += 1
accuracy

The best accuracy we got is when we set the threhold of popularity to __1.5__ and the threshold to similarity to __0.02__. This set up gives us an accuracy of __0.7054__.

In [25]:
gameCount = defaultdict(int)
totalPlayed = 0

for x in train.itertuples():
    gameCount[x.gameID] += 1
    totalPlayed += 1

mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

In [26]:
return1 = set()
count = 0
    
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed/1.5: break

In [27]:
pred = []
for x in df.itertuples():
    if (x.gameID in return1):
        pred.append(True)
    else:
        pred.append(False)
df['pred'] = pred
accuracy = (df.pred == df.played).sum()/len(df.played)
print(accuracy)

0.713


## Let's try Euclidean distance

In [None]:
def Euclidean(s1,s2):
    return len(s1.union(s2)) - len(s1.intersection(s2))

In [None]:
def Euclidean_pred(u, g):
    if u not in played.keys():
        return 0
    g1 = played[u]
    if g in g1:
        g1.remove(g)
    s1 = set(game_user[g])
    jmax = 0
    for gi in g1:
        s2 = set(game_user[gi])
        j = Euclidean(s1, s2)
        if j > jmax:
            jmax = j
    return jmax

In [None]:
pred = []
for x in df.itertuples():
    if (Euclidean_pred(x.userID, x.gameID) > 950):
        pred.append(True)
    else:
        pred.append(False)
df['pred'] = pred
accuracy = (df.pred == df.played).sum()/len(df.played)
print(accuracy)

## Next up let's try Cosine Similarity

In [13]:
# We will use an adjusted cosine similarity where we take into consideration of hours played
# First we construct a user-game pairwise metrics with hours_transformed as the value.
temp = data[['gameID', 'userID']]
data['played'] = 1
M = data.pivot_table(columns='gameID', index='userID',values ='played')

### Let's first try a user-based recommendation system

In [14]:
similarity_u = pd.DataFrame(cosine_similarity(
    scale(M.fillna(-10000))),
    index=M.index,
    columns=M.index)

In [None]:
played[user_id]

In [21]:
def cosine_pred_u(u,g):
    if u not in played.keys():
        return 0
    u1 = game_user[g]
    if u in u1:
        u1.remove(u)
    sim_max = 0
    for ui in u1:
        sim = similarity_u[u][ui]
        if sim > sim_max:
            sim_max = sim
    return sim_max

In [34]:
pred = []
for x in data.itertuples():
    if cosine_pred_u(x.userID, x.gameID) and (x.gameID in return1):
        pred.append(True)
    else:
        pred.append(False)
data['pred'] = pred
accuracy = (data.pred == data.played).sum()/len(data.played)
print(accuracy)

0.6664685714285714


### Next let's try item-based system

In [135]:
similarity_i = pd.DataFrame(cosine_similarity(
    scale(M.T.fillna(-10000))),
    index=M.columns,
    columns=M.columns)

In [None]:
def cosine_pred_i(u,g):
    if u not in played.keys():
        return 0
    g1 = played[u]
    if g in g1:
        g1.remove(g)
    sim_max = 0
    for gi in g1:
        sim = similarity_i[g][gi]
        if sim > sim_max:
            sim_max = sim
    return sim_max

In [None]:
pred = []
for x in df.itertuples():
    if (cosine_pred_i(x.userID, x.gameID) > 0.03):
        pred.append(True)
    else:
        pred.append(False)
df['pred'] = pred
accuracy = (df.pred == df.played).sum()/len(df.played)
print(accuracy)

In [None]:
pred = []
for x in df.itertuples():
    if (cosine_pred_u(x.userID, x.gameID) > 0.03) and (cosine_pred_i(x.userID, x.gameID) > 0) and (x.gameID in return1):
        pred.append(True)
    else:
        pred.append(False)
df['pred'] = pred
accuracy = (df.pred == df.played).sum()/len(df.played)
print(accuracy)

In [None]:
pred = []
for x in validation.itertuples():
    if (cosine_pred_u(x.userID, x.gameID) > 0.03) and (cosine_pred_i(x.userID, x.gameID) > 0) and (x.gameID in return1):
        pred.append(True)
    else:
        pred.append(False)
validation['pred'] = pred
accuracy = (validation.pred == validation.played).sum()/len(validation.played)
print(accuracy)

In [None]:
pred = []
for x in validation.itertuples():
    if (cosine_pred_u(x.userID, x.gameID) > 0.03) and (cosine_pred_i(x.userID, x.gameID) > 0.02) and (x.gameID in return1):
        pred.append(True)
    else:
        pred.append(False)
validation['pred'] = pred
accuracy = (validation.pred == validation.played).sum()/len(validation.played)
print(accuracy)

In [None]:
pred = []
for x in validation.itertuples():
    if (cosine_pred_u(x.userID, x.gameID) > 0.05) and (cosine_pred_i(x.userID, x.gameID) > 0.02) and (x.gameID in return1):
        pred.append(True)
    else:
        pred.append(False)
validation['pred'] = pred
accuracy = (validation.pred == validation.played).sum()/len(validation.played)
print(accuracy)

### Test the system

In [35]:
#Load Dataset
test = pd.read_csv(data_folder+"pairs_Played.txt", sep="-|,", engine='python')

In [36]:
#Create dictionary set of user/game pair indicating which games has each player played or not played
played={}
not_played={}
user_id = data.userID.unique()
game_id = data.gameID.unique()

for user in user_id:
    played[user] = []
    not_played[user] = []

for x in data.itertuples():
    played[x.userID].append(x.gameID)

for user in user_id:
    not_played[user]= set(game_id) - set(played[user])

In [129]:
def cosine_pred_u(u,g):
    if u not in played.keys():
        return 0
    u1 = game_user[g]
    if u in u1:
        u1.remove(u)
    sim_max = 0
    for ui in u1:
        sim = similarity_u[u][ui]
        if sim > sim_max:
            sim_max = sim
    return sim_max

In [131]:
def cosine_pred_i(u,g):
    if u not in played.keys():
        return 0
    g1 = played[u]
    if g in g1:
        g1.remove(g)
    sim_max = 0
    for gi in g1:
        sim = similarity_i[g][gi]
        if sim > sim_max:
            sim_max = sim
    return sim_max

In [177]:
df = validation.sample(frac=1)[:2000]

In [192]:
gameCount = defaultdict(int)
totalPlayed = 0

for x in train.itertuples():
    gameCount[x.gameID] += 1
    totalPlayed += 1

mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
    
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed/3: break
        
pred = []
for x in df.itertuples():
    if ((cosine_pred_u(x.userID, x.gameID)>0.05) or (cosine_pred_i(x.userID,x.gameID)>0.05)) and (x.gameID in return1):
        pred.append(1)
    else:
        pred.append(0)
df['pred'] = pred
np.sum(df.pred==df.played)/len(df.pred)

0.631

In [193]:
gameCount = defaultdict(int)
totalPlayed = 0

for x in train.itertuples():
    gameCount[x.gameID] += 1
    totalPlayed += 1

mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
    
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed/1.6: break
    
pred = []
for x in test.itertuples():
    if ((cosine_pred_u(x.userID, x.gameID)>0.05) or (cosine_pred_i(x.userID,x.gameID)>0.05)) and (x.gameID in return1):
        pred.append(1)
    else:
        pred.append(0)
test['prediction'] = pred

In [194]:
predictions = open("predictions_Played.txt", 'w')
predictions. truncate(0)
predictions.write("userID-gameID,prediction\n")
for x in test.itertuples():
    predictions.write(str(x.userID) + '-' + str(x.gameID) + ',' + str(x.prediction) + '\n')
predictions.close()

# Tasks-Category Prediction

In [5]:
#Load Data
f = gzip.open(data_folder+"train_Category.json.gz", 'r+')
f.readline()
d = []
for line in f:
    value = eval(line)
    d.append(value)
data=pd.DataFrame(d)

## Baseline Bag of Words Model

In [None]:
#Build set of all words appeared in the training data
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in train.itertuples():
    r = ''.join([c for c in d.text.lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

In [None]:
#Extract the 2500 most common words
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts[:5000]]

In [None]:
#Build Bag of Words
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum.lower() if not c in punctuation])
    for w in r.split():
        if w in words:
            feat[wordId[w]] += 1
    return feat

X = [feature(d) for d in train.text]
y = train['genreID']

In [None]:
#Build and fit logistic regression model
logreg = LogisticRegression(max_iter=100000, C=1.5)
logreg.fit(X,y)

In [None]:
#Make predictions and evaludate the performance
y_pred = logreg.predict([feature(d) for d in validation.text])
acc = (y_pred == validation.genreID).sum()/len(y_pred)
print("The accuracy of this model on the validation set is ", str(acc))

In [None]:
#Make predictions and evaludate the performance
y_pred = logreg.predict([feature(d) for d in train.text])
acc = (y_pred == train.genreID).sum()/len(y_pred)
print("The accuracy of this model on the validation set is ", str(acc))

## TFIDF Method

In [15]:
genre_dict = {0:"Action",1:"Strategy",2:"RPG",3:"Adventure",4:"Sports"}

In [17]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=20, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(train.text).toarray()
labels = train.genreID
features.shape

(165000, 33629)

## Naive Bayes

In [None]:
cv_counts = CountVectorizer(stop_words='english',binary=False,min_df=5)
X_counts=cv_counts.fit_transform(data.text).toarray()

In [8]:
clf_Multinomial=MultinomialNB()
clf_Multinomial.fit(X_counts,data.genreID)

MultinomialNB()

## Logistic Regression

In [None]:
logisReg=LogisticRegression(max_iter=100000, C=1.5)
logisReg.fit(X_counts,data.genreID)
# print('The train accuracy for Logistic Regression is {0}'.format(logisReg.score(X_train,y_train)))
# print('The test accuracy for Logistic Regression is {0}'.format(logisReg.score(X_test,y_test)))

## Improved BOW model

In [None]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = nltk.stem.porter.PorterStemmer()
for d in data:
    for w in d['review/text'].split():
        w = ''.join([c for c in w.lower() if not c in punctuation])
        w = stemmer.stem(w)
        wordCount[w] += 1

In [None]:
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words = [x[1] for x in counts[:3000]]

In [None]:
#Build Bag of Words
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum.lower() if not c in punctuation])
    for w in r.split():
        if w in words:
            feat[wordId[w]] += 1
    return feat

X = [feature(d) for d in train.text]
y = train['genreID']

In [None]:
#Build and fit logistic regression model
logreg = LogisticRegression(max_iter=100000, C=1.5)
logreg.fit(X,y)

In [None]:
#Make predictions and evaludate the performance
y_pred = logreg.predict([feature(d) for d in train.text])
acc = (y_pred == train.genreID).sum()/len(y_pred)
print("The accuracy of this model on the validation set is ", str(acc))

In [None]:
#Make predictions and evaludate the performance
y_pred = logreg.predict([feature(d) for d in validation.text])
acc = (y_pred == validation.genreID).sum()/len(y_pred)
print("The accuracy of this model on the validation set is ", str(acc))

## Test Data

In [9]:
#Load test data
f = gzip.open(data_folder+"test_Category.json.gz", 'r+')
d = []
for line in f:
    value = eval(line)
    d.append(value)
test=pd.DataFrame(d)

In [10]:
test_counts=cv_counts.transform(test.text).toarray()

In [11]:
y_pred = clf_Multinomial.predict(test_counts)
test['prediction'] = y_pred

In [12]:
predictions = open("predictions_Category.txt", 'w')
predictions. truncate(0)
predictions.write("userID-reviewID,prediction\n")
for x in test.itertuples():
    predictions.write(str(x.userID) + '-' + str(x.reviewID) + "," + str(x.prediction) + "\n")
predictions.close()

In [None]:
y_pred = logreg.predict(test_counts)
test['prediction'] = y_pred

In [None]:
predictions = open("predictions_Category.txt", 'w')
predictions. truncate(0)
predictions.write("userID-reviewID,prediction\n")
for x in test.itertuples():
    predictions.write(str(x.userID) + '-' + str(x.reviewID) + "," + str(x.prediction) + "\n")
predictions.close()