In [1280]:
import pandas as pd
import numpy as np
import random
import yaml
import gzip
import string
from collections import defaultdict
from IPython.display import clear_output
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import scale
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.feature_selection import chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
import seaborn as sns

In [7]:
data_folder = "assignment1/"

In [1037]:
#Load Data
f = gzip.open(data_folder+"train.json.gz", 'r+')
d = []
for line in f:
    value = eval(line)
    d.append(value)
data=pd.DataFrame(d)

# Tasks-Play Prediction

In [1042]:
#Train-validation split
data = data.sample(frac=1).reset_index(drop=True)
train = data.iloc[:165000]
validation = data.iloc[165000:175000]

In [1043]:
#Create dictionary set of user/game pair indicating which games has each player played or not played
played={}
not_played={}
user_id = train.userID.unique()
game_id = train.gameID.unique()

for user in user_id:
    played[user] = []
    not_played[user] = []

for x in train.itertuples():
    played[x.userID].append(x.gameID)

for user in user_id:
    not_played[user]= set(game_id) - set(played[user])

In [1044]:
#Create prediction label for validation set
validation['played'] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation['played'] = True


In [1047]:
#Create negative samples
neg=[]
for x in validation.itertuples():
    neg.append(random.sample(not_played[x.userID],1)[0])
    
validation_neg = pd.DataFrame()
validation_neg['userID'] = validation.userID
validation_neg['gameID'] = neg
validation_neg['played'] = False

validation = pd.concat([validation, validation_neg])

In [1048]:
#Create dictionary set of user/game pair indicating which users played each game
game_user = {}
for g in game_id:
    game_user[g] = []

for x in train.itertuples():
    game_user[x.gameID].append(x.userID)

## First let's try using Jaccard Similarity

In [247]:
#Define Jaccard similarity
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer/denom

In [1131]:
#Define algorithm using the Jaccard similarity
def jaccard_pred(u, g):
    if u not in played.keys():
        return 0
    g1 = played[u]
    if g in g1:
        g1.remove(g)
    s1 = set(game_user[g])
    jmax = 0
    for gi in g1:
        s2 = set(game_user[gi])
        j = Jaccard(s1, s2)
        if j > jmax:
            jmax = j
    return jmax

In [249]:
accuracy = {}

gameCount = defaultdict(int)
totalPlayed = 0

for x in train.itertuples():
    gameCount[x.gameID] += 1
    totalPlayed += 1

mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

counter = 0
for t1 in np.linspace(1,3,5):
    return1 = set()
    count = 0
    
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalPlayed/t1: break
    
    for t2 in np.linspace(0.01,0.05,5):
        clear_output(wait=True)
        pred = []
        for x in validation.itertuples():
            if (jaccard_pred(x.userID, x.gameID) > t2) and (x.gameID in return1):
                pred.append(True)
            else:
                pred.append(False)
        validation['pred'] = pred
        accuracy[(t1,t2)] = (validation.pred == validation.played).sum()/len(validation.played)
        print(np.round((counter/25)*100, 2), "%")
        counter += 1
accuracy

96.0 %


{(1.0, 0.01): 0.51935,
 (1.0, 0.02): 0.61215,
 (1.0, 0.03): 0.6747,
 (1.0, 0.04): 0.6463,
 (1.0, 0.05): 0.5885,
 (1.5, 0.01): 0.7035,
 (1.5, 0.02): 0.7061,
 (1.5, 0.03): 0.69735,
 (1.5, 0.04): 0.6475,
 (1.5, 0.05): 0.5872,
 (2.0, 0.01): 0.6817,
 (2.0, 0.02): 0.6818,
 (2.0, 0.03): 0.67735,
 (2.0, 0.04): 0.6412,
 (2.0, 0.05): 0.5849,
 (2.5, 0.01): 0.659,
 (2.5, 0.02): 0.65895,
 (2.5, 0.03): 0.6572,
 (2.5, 0.04): 0.63445,
 (2.5, 0.05): 0.58365,
 (3.0, 0.01): 0.6383,
 (3.0, 0.02): 0.6382,
 (3.0, 0.03): 0.6369,
 (3.0, 0.04): 0.6231,
 (3.0, 0.05): 0.58145}

The best accuracy we got is when we set the threhold of popularity to __1.5__ and the threshold to similarity to __0.02__. This set up gives us an accuracy of __0.7054__.

In [1115]:
return1 = set()
count = 0
    
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed/1.5: break

In [1117]:
pred = []
for x in df.itertuples():
    if (x.gameID in return1):
        pred.append(True)
    else:
        pred.append(False)
df['pred'] = pred
accuracy = (df.pred == df.played).sum()/len(df.played)
print(accuracy)

0.697


## Let's try Euclidean distance

In [1132]:
def Euclidean(s1,s2):
    return len(s1.union(s2)) - len(s1.intersection(s2))

In [1136]:
def Euclidean_pred(u, g):
    if u not in played.keys():
        return 0
    g1 = played[u]
    if g in g1:
        g1.remove(g)
    s1 = set(game_user[g])
    jmax = 0
    for gi in g1:
        s2 = set(game_user[gi])
        j = Euclidean(s1, s2)
        if j > jmax:
            jmax = j
    return jmax

In [1150]:
pred = []
for x in df.itertuples():
    if (Euclidean_pred(x.userID, x.gameID) > 950):
        pred.append(True)
    else:
        pred.append(False)
df['pred'] = pred
accuracy = (df.pred == df.played).sum()/len(df.played)
print(accuracy)

0.559


## Next up let's try Cosine Similarity

In [1164]:
# We will use an adjusted cosine similarity where we take into consideration of hours played
# First we construct a user-game pairwise metrics with hours_transformed as the value.
temp = train[['gameID', 'userID']]
temp['played'] = 1
M = temp.pivot_table(columns='gameID', index='userID',values ='played')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['played'] = 1


### Let's first try a user-based recommendation system

In [1187]:
similarity_u = pd.DataFrame(cosine_similarity(
    scale(M.fillna(-10000))),
    index=M.index,
    columns=M.index)

In [1190]:
def cosine_pred_u(u,g):
    if u not in played.keys():
        return 0
    u1 = game_user[g]
    if u in u1:
        u1.remove(u)
    sim_max = 0
    for ui in u1:
        sim = similarity_u[u][ui]
        if sim > sim_max:
            sim_max = sim
    return sim_max

In [1185]:
df = validation.sample(frac=1).iloc[:1000]

In [1186]:
pred = []
for x in df.itertuples():
    if (cosine_pred_u(x.userID, x.gameID) > 0.08) and (x.gameID in return1):
        pred.append(True)
    else:
        pred.append(False)
df['pred'] = pred
accuracy = (df.pred == df.played).sum()/len(df.played)
print(accuracy)

0.7


In [1183]:
pred = []
for x in validation.itertuples():
    if (cosine_pred(x.userID, x.gameID) > 0.08) and (x.gameID in return1):
        pred.append(True)
    else:
        pred.append(False)
validation['pred'] = pred
accuracy = (validation.pred == validation.played).sum()/len(validation.played)
print(accuracy)

0.6938


### Next let's try item-based system

In [1191]:
similarity_i = pd.DataFrame(cosine_similarity(
    scale(M.T.fillna(-10000))),
    index=M.columns,
    columns=M.columns)

In [1192]:
def cosine_pred_i(u,g):
    if u not in played.keys():
        return 0
    g1 = played[u]
    if g in g1:
        g1.remove(g)
    sim_max = 0
    for gi in g1:
        sim = similarity_i[g][gi]
        if sim > sim_max:
            sim_max = sim
    return sim_max

In [1200]:
pred = []
for x in df.itertuples():
    if (cosine_pred_i(x.userID, x.gameID) > 0.03):
        pred.append(True)
    else:
        pred.append(False)
df['pred'] = pred
accuracy = (df.pred == df.played).sum()/len(df.played)
print(accuracy)

0.627


In [1213]:
pred = []
for x in df.itertuples():
    if (cosine_pred_u(x.userID, x.gameID) > 0.03) and (cosine_pred_i(x.userID, x.gameID) > 0) and (x.gameID in return1):
        pred.append(True)
    else:
        pred.append(False)
df['pred'] = pred
accuracy = (df.pred == df.played).sum()/len(df.played)
print(accuracy)

0.722


In [1214]:
pred = []
for x in validation.itertuples():
    if (cosine_pred_u(x.userID, x.gameID) > 0.03) and (cosine_pred_i(x.userID, x.gameID) > 0) and (x.gameID in return1):
        pred.append(True)
    else:
        pred.append(False)
validation['pred'] = pred
accuracy = (validation.pred == validation.played).sum()/len(validation.played)
print(accuracy)

0.70715


In [1215]:
pred = []
for x in validation.itertuples():
    if (cosine_pred_u(x.userID, x.gameID) > 0.03) and (cosine_pred_i(x.userID, x.gameID) > 0.02) and (x.gameID in return1):
        pred.append(True)
    else:
        pred.append(False)
validation['pred'] = pred
accuracy = (validation.pred == validation.played).sum()/len(validation.played)
print(accuracy)

0.7087


In [1226]:
pred = []
for x in validation.itertuples():
    if (cosine_pred_u(x.userID, x.gameID) > 0.05) and (cosine_pred_i(x.userID, x.gameID) > 0.02) and (x.gameID in return1):
        pred.append(True)
    else:
        pred.append(False)
validation['pred'] = pred
accuracy = (validation.pred == validation.played).sum()/len(validation.played)
print(accuracy)

0.70535


### Test the system

In [1218]:
#Load Dataset
test = pd.read_csv(data_folder+"pairs_Played.txt", sep="-|,", engine='python')

In [1219]:
#Create dictionary set of user/game pair indicating which games has each player played or not played
played={}
not_played={}
user_id = data.userID.unique()
game_id = data.gameID.unique()

for user in user_id:
    played[user] = []
    not_played[user] = []

for x in data.itertuples():
    played[x.userID].append(x.gameID)

for user in user_id:
    not_played[user]= set(game_id) - set(played[user])

In [1251]:
gameCount = defaultdict(int)
totalPlayed = 0

for x in train.itertuples():
    gameCount[x.gameID] += 1
    totalPlayed += 1

mostPopular = [(gameCount[x], x) for x in gameCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
    
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPlayed/1.45: break
    
pred = []
for x in test.itertuples():
    if (cosine_pred_u(x.userID, x.gameID) > 0.03) and (cosine_pred_i(x.userID, x.gameID) > 0.02) and (x.gameID in return1):
        pred.append(1)
    else:
        pred.append(0)
test['prediction'] = pred

In [1252]:
predictions = open("predictions_Played.txt", 'w')
predictions. truncate(0)
predictions.write("userID-gameID,prediction\n")
for x in test.itertuples():
    predictions.write(str(x.userID) + '-' + str(x.gameID) + ',' + str(x.prediction) + '\n')
predictions.close()

# Tasks-Category Prediction

In [1253]:
#Load Data
f = gzip.open(data_folder+"train_Category.json.gz", 'r+')
f.readline()
d = []
for line in f:
    value = eval(line)
    d.append(value)
data=pd.DataFrame(d)

In [1254]:
data = data.sample(frac=1)
train = data.iloc[:165000]
validation = data.iloc[164999:]

In [1255]:
train.head()

Unnamed: 0,userID,genre,early_access,reviewID,hours,text,genreID,date,found_funny,user_id,compensation
10307,u41354608,Action,False,r53783629,8.5,This is a fun and addicting game i can play th...,0,2014-07-21,,,
17499,u11229669,Action,False,r85716389,30.0,Do you like serious stealth missions? How abou...,0,2016-10-24,,,
154519,u68749422,Adventure,True,r50479666,0.9,crashed instantly upon starting a new game on ...,3,2016-07-04,1.0,,
143605,u93175118,Action,False,r46983409,35.3,""" ♥‿♥ ""\n- Me, after seeing 88-4 is in there",0,2017-12-28,,,
36934,u33952685,Sports,True,r77471322,5.0,Game is really fun with friends.,4,2017-11-24,,7.656119812531402e+16,


## Baseline Bag of Words Model

In [1021]:
#Build set of all words appeared in the training data
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in train.itertuples():
    r = ''.join([c for c in d.text.lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

In [None]:
#Extract the 2500 most common words
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts[:3000]]

In [None]:
#Build Bag of Words
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum.lower() if not c in punctuation])
    for w in r.split():
        if w in words:
            feat[wordId[w]] += 1
    return feat

X = [feature(d) for d in train.text]
y = train['genreID']

In [None]:
#Build and fit logistic regression model
logreg = LogisticRegression(max_iter=100000, class_weight='balanced', C=2)
logreg.fit(X,y)

In [None]:
#Make predictions and evaludate the performance
y_pred = logreg.predict([feature(d) for d in validation.text])
acc = (y_pred == validation.genreID).sum()/len(y_pred)
print("The accuracy of this model on the validation set is ", str(acc))

## TFIDF Method

In [1256]:
train.head()

Unnamed: 0,userID,genre,early_access,reviewID,hours,text,genreID,date,found_funny,user_id,compensation
10307,u41354608,Action,False,r53783629,8.5,This is a fun and addicting game i can play th...,0,2014-07-21,,,
17499,u11229669,Action,False,r85716389,30.0,Do you like serious stealth missions? How abou...,0,2016-10-24,,,
154519,u68749422,Adventure,True,r50479666,0.9,crashed instantly upon starting a new game on ...,3,2016-07-04,1.0,,
143605,u93175118,Action,False,r46983409,35.3,""" ♥‿♥ ""\n- Me, after seeing 88-4 is in there",0,2017-12-28,,,
36934,u33952685,Sports,True,r77471322,5.0,Game is really fun with friends.,4,2017-11-24,,7.656119812531402e+16,


In [1268]:
genre_dict = {0:"Action",1:"Strategy",2:"RPG",3:"Adventure",4:"Sports"}

In [1261]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=20, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(train.text).toarray()
labels = train.genreID
features.shape

(165000, 33671)

In [None]:
N = 2
for genre, genreID in sorted(genre_dict.items()):
    features_chi2 = chi2(features, labels == genreID)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(genre))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

  res_values = method(rvalues)


# '0':
  . Most correlated unigrams:
. fur
. futuristic
  . Most correlated bigrams:
. game easiest
. ﾌﾌ ヽーく


  res_values = method(rvalues)


In [1272]:
X_train = train.text
y_train = train.genreID

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [1273]:
X_val = validation.text
y_val = validation.genreID

In [1283]:
pred = clf.predict(count_vect.transform(X_train))

In [1284]:
np.sum(pred == y_train)/len(y_train)

0.5974727272727273

In [None]:
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

In [1281]:
#Load test data
f = gzip.open(data_folder+"test_Category.json.gz", 'r+')
d = []
for line in f:
    value = eval(line)
    d.append(value)
test=pd.DataFrame(d)

In [1282]:
test.head()

Unnamed: 0,hours,early_access,date,reviewID,text,userID,user_id,found_funny,compensation
0,3.2,False,2016-03-17,r23357682,You shouldn't really go into this game looking...,u62606497,,,
1,10.0,False,2017-01-06,r31389322,The gameplay is nice and all but this game is ...,u14040096,,,
2,0.5,True,2017-02-25,r11214476,Bought the game yesterday played for about 30 ...,u82387637,7.656119807787946e+16,,
3,15.5,False,2014-08-29,r99452955,There's a very important lesson that MW2 teach...,u85959755,,151.0,
4,9.1,False,2013-12-02,r15626239,An extremely gorgeous and breathtaking experie...,u20275739,,,
