In [1]:
import pandas as pd
import numpy as np
from pandasql import sqldf
import random
from lightfm import LightFM
import json

rec_path = r'/Users/erik/Downloads/archive(1)/recommendations.csv'
games_path = r'/Users/erik/Downloads/archive(1)/games.csv'
users_path = r'/Users/erik/Downloads/archive(1)/users.csv'
games_metadata_path = r'/Users/erik/Downloads/archive(1)/games_metadata.json'




In [2]:
recommendations = pd.read_csv(rec_path)
games = pd.read_csv(games_path)
users = pd.read_csv(users_path)
games_metadata = pd.read_json(games_metadata_path, lines=True)

In [3]:
df_users = users.loc[users['reviews'] >= 20]
df_users

Unnamed: 0,user_id,products,reviews
33,1965432,702,32
339,4834290,433,36
541,6116691,262,24
555,20561,494,33
596,136427,172,22
...,...,...,...
6161915,93254,362,20
6163785,680714,304,21
6164214,807989,458,43
6164311,839326,185,24


In [4]:
df_games = pd.merge(games, games_metadata, how='left', on='app_id')
df_games

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck,description,tags
0,10090,Call of Duty: World at War,2008-11-18,True,False,False,Very Positive,92,37039,19.99,19.99,0.0,True,"Call of Duty is back, redefining war like you'...","[Zombies, World War II, FPS, Multiplayer, Acti..."
1,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre..."
2,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True,,[Action]
3,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St..."
4,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48313,1803600,Gone Rogue,2023-03-01,True,False,False,Very Positive,96,65,12.49,12.49,0.0,True,Gone Rogue is a tactical stealth game with RPG...,"[Stealth, Action, Real Time Tactics, Isometric..."
48314,1811990,Wildfrost,2023-04-12,True,False,False,Mixed,64,1819,19.99,19.99,0.0,True,"Take on the elements in Wildfrost, a tactical ...","[Card Battler, Difficult, Roguelike Deckbuilde..."
48315,1868120,BLACK ACE,2022-02-03,True,False,False,Mixed,44,76,0.00,0.00,0.0,True,Competitive multiplayer match with vehicles an...,"[FPS, Third-Person Shooter, Multiplayer, Shoot..."
48316,2206390,BandRoll,2023-04-14,True,False,False,Positive,96,26,1.59,1.99,20.0,True,Feel the Neon in Your Bones! How would you lik...,"[3D Platformer, Rhythm, Parkour, 3D, Atmospher..."


In [5]:
games_feature_list = df_games.explode('tags', ignore_index=True)['tags']
games_feature_list

0                  Zombies
1             World War II
2                      FPS
3              Multiplayer
4                   Action
                ...       
566945        Singleplayer
566946       Immersive Sim
566947          Story Rich
566948    Post-apocalyptic
566949       Transhumanism
Name: tags, Length: 566950, dtype: object

In [9]:
user_app_ratings = pd.merge(df_users, recommendations, how="inner", on=["user_id"])
user_app_ratings['app_id'].nunique()

In [10]:
user_app_ratings = user_app_ratings.drop(columns=['reviews', 'products', 'helpful', 'funny', 'date', 'hours', 'review_id'])
user_app_ratings['is_recommended'] = user_app_ratings['is_recommended'].map({False: -1, True: 1})
user_app_ratings = user_app_ratings.sort_values('user_id')
user_app_ratings

Unnamed: 0,user_id,app_id,is_recommended
83525,464,323320,1
83515,464,254700,1
83516,464,35450,1
83517,464,250400,1
83518,464,46510,1
...,...,...,...
88609,6167506,250900,1
88610,6167506,823500,1
88611,6167506,881100,1
88604,6167506,1229490,1


In [54]:
unique_users = user_app_ratings['user_id'].to_list()

In [72]:
from sklearn.model_selection import train_test_split

def generate_train_set(df, n):
    unique_users = df['user_id'].to_list()
    lst = list(dict.fromkeys(unique_users))
    train_set = pd.DataFrame()
    test_set = pd.DataFrame()

    for i in lst:
        train, test = train_test_split(df.query('user_id == ' + str(i)), test_size=n)
        train_set = pd.concat([train_set, train], axis=0)
        test_set = pd.concat([test_set, test], axis=0)


    return train_set, test_set

In [73]:
new_train_set, new_test_set = generate_train_set(user_app_ratings, 0.25)
new_train_set

Unnamed: 0,user_id,app_id,is_recommended
83521,464,314070,1
83530,464,22200,1
83522,464,50300,1
83508,464,204100,1
83518,464,46510,1
...,...,...,...
88604,6167506,1229490,1
88601,6167506,107410,1
88610,6167506,823500,1
88600,6167506,620980,1


In [29]:
#Create Dataset
from lightfm.data import Dataset

dataset = Dataset()
dataset.fit((user_app_ratings['user_id']), (user_app_ratings['app_id']))

In [31]:
dataset.interactions_shape()

(14956, 1799)

In [32]:
mapping = dataset.mapping()

In [33]:
game_tags = pd.merge(user_app_ratings, games_metadata, how='left', on='app_id')
game_tags

Unnamed: 0,user_id,app_id,is_recommended,description,tags
0,464,323320,1,Experience the freedom of unbounded climbing a...,"[Open World, Adventure, Singleplayer, Explorat..."
1,464,254700,1,(Release: 2014) Special agent Leon S. Kennedy ...,"[Inventory Management, Survival Horror, Action..."
2,464,35450,1,Contains full Rising Storm content as well!,"[Realistic, World War II, FPS, Multiplayer, Hi..."
3,464,250400,1,“A real Gem” – Destructoid at E3 “Offers a dif...,"[Survival, Zombies, Crafting, Open World Survi..."
4,464,46510,1,Having searched Eastern and Western Europe for...,"[Adventure, Point & Click, Puzzle, Female Prot..."
...,...,...,...,...,...
429428,6167506,250900,1,The Binding of Isaac: Rebirth is a randomly ge...,"[Action Roguelike, Roguelike, Indie, Replay Va..."
429429,6167506,823500,1,BONEWORKS is an Experimental Physics VR Advent...,"[VR, Physics, Action, Sandbox, Shooter, Single..."
429430,6167506,881100,1,Noita is a magical action roguelite set in a w...,"[Physics, Difficult, Roguelike, Pixel Graphics..."
429431,6167506,1229490,1,ULTRAKILL is a fast-paced ultraviolent retro F...,"[Early Access, FPS, Arena Shooter, Spectacle f..."


In [34]:
game_tags_exploded = game_tags.explode('tags', ignore_index=True)

In [35]:
dataset.fit_partial(items=game_tags_exploded['app_id'], item_features=game_tags_exploded['tags'])

In [20]:
games_exploded = df_games.explode('tags', ignore_index=True)
dataset.fit_partial(items=games_exploded['app_id'], item_features=games_exploded['tags'])

In [37]:
ls = list(zip(user_app_ratings.user_id, user_app_ratings.app_id, user_app_ratings.is_recommended))

In [38]:
(interactions1, weights1) = dataset.build_interactions(ls)
print(repr(interactions1))

<14956x1799 sparse matrix of type '<class 'numpy.int32'>'
	with 429433 stored elements in COOrdinate format>


In [39]:
item_feature_list = list(zip(game_tags.app_id, game_tags.tags))
item_features = dataset.build_item_features(item_feature_list)
print(repr(item_features))

<1799x2223 sparse matrix of type '<class 'numpy.float32'>'
	with 34169 stored elements in Compressed Sparse Row format>


In [42]:
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k, reciprocal_rank

model_t = LightFM(loss='warp', random_state=42, learning_rate=0.25, no_components=150)
model_t = model_t.fit(interactions1, item_features=item_features, # sample_weight=weights_train,  
                  epochs=50,
                  num_threads=2, verbose=True)

Epoch: 100%|██████████| 50/50 [03:04<00:00,  3.68s/it]


In [43]:
print("Train Precision MAP@5: %.2f" % precision_at_k(model_t, interactions1, item_features=item_features, k=5).mean())

Train Precision MAP@5: 0.88


# -----------------------------------------

Create new Dataset and interaction matrix with train and testsplit

In [80]:
#Create Dataset
from lightfm.data import Dataset

dataset2 = Dataset()
dataset2.fit((user_app_ratings['user_id']), (user_app_ratings['app_id']))

ls_new_train_set = list(zip(new_train_set.user_id, new_train_set.app_id, new_train_set.is_recommended))
(interactions_new_train, weights) = dataset2.build_interactions(ls_new_train_set)
print(repr(interactions_new_train))

<14956x1799 sparse matrix of type '<class 'numpy.int32'>'
	with 316598 stored elements in COOrdinate format>


In [82]:
dataset2.fit_partial(items=game_tags_exploded['app_id'], item_features=game_tags_exploded['tags'])
item_feature_list2 = list(zip(game_tags.app_id, game_tags.tags))
item_features2 = dataset2.build_item_features(item_feature_list2)
print(repr(item_features2))

<1799x2223 sparse matrix of type '<class 'numpy.float32'>'
	with 34169 stored elements in Compressed Sparse Row format>


In [83]:
# train model on train data
model_t3 = LightFM(loss='warp', random_state=42, learning_rate=0.25, no_components=150)
model_t3 = model_t.fit(interactions_new_train, item_features=item_features2, # sample_weight=weights_train,  
                  epochs=50,
                  num_threads=2, verbose=True)

Epoch: 100%|██████████| 50/50 [02:08<00:00,  2.58s/it]


In [104]:
print("Train Precision MAP@5: %.2f" % precision_at_k(model_t3, interactions_new_train, item_features=item_features2, k=5).mean())

Train Precision MAP@5: 0.89


In [85]:
ls_new_test_set = list(zip(new_test_set.user_id, new_test_set.app_id, new_test_set.is_recommended))
(interactions_new_test, weights) = dataset2.build_interactions(ls_new_test_set)

In [105]:
print("Test Precision MAP@5: %.2f" % precision_at_k(model_t3, interactions_new_test, train_interactions=interactions_new_train, item_features=item_features2, k=5).mean())

Train Precision MAP@5: 0.06


In [90]:
mapping2 = dataset2.mapping()

In [225]:
index_item_id_mapping =  {v: k for k, v in mapping2[2].items()}
index_user_id_mapping =  {v: k for k, v in mapping2[0].items()}

In [385]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def precision_recall_f1(model, dataset, interactions, item_features, threshold = 0):

    precisions = dict()
    recalls = dict()
    f1s = dict()
    accuracies = dict()

    mapping = dataset.mapping()         
    index_user_id_mapping =  {v: k for k, v in mapping[0].items()}
    index_item_id_mapping =  {v: k for k, v in mapping[2].items()}
    n_users, n_items = interactions.shape

    for user in range(n_users):

        relevant = new_test_set[new_test_set['user_id'] == index_user_id_mapping[user]]
        #print(relevant)
        scores = pd.Series(model.predict(user,np.arange(n_items), item_features=item_features))
        scores = (scores - scores.min()) / (scores.max() - scores.min()) 

        scores_df = pd.DataFrame({'index':scores.index, 'scores': scores.values}) 
        scores_df['app_id'] = scores_df['index'].map(index_item_id_mapping)
        merged = pd.merge(relevant, scores_df, how="left", on=["app_id"])

        y_true = [1 if s==1 else 0 for s in list(merged['is_recommended'])]
        y_pred = [1 if s>=threshold else 0 for s in list(merged['scores'])]    
        
        precisions[user] = precision_score(y_true, y_pred, zero_division=0)
        recalls[user] = recall_score(y_true, y_pred, zero_division=0)
        f1s[user] = f1_score(y_true, y_pred, zero_division=0)

        accuracies[user] = accuracy_score(y_true, y_pred)

    # average scores over all users 
    avg_precision = sum(list(precisions.values())) / len(list(precisions.values()))
    avg_recall = sum(list(recalls.values())) / len(list(recalls.values()))
    avg_f1 = sum(list(f1s.values())) / len(list(f1s.values()))
    
    avg_accuracy = sum(list(accuracies.values())) / len(list(accuracies.values()))
    
    return avg_precision, avg_recall, avg_f1, avg_accuracy

In [341]:
def calculate_precision_recall_at_k(model, dataset, interactions, item_features, k, threshold = 0):
    mapping = dataset.mapping()
    index_user_id_mapping =  {v: k for k, v in mapping[0].items()}
    n_users, n_items = interactions.shape

    precisions = dict()
    recalls = dict()
    for user in range(n_users):
        
        scores = pd.Series(model.predict(user,np.arange(n_items), item_features=item_features))
        scores = (scores - scores.min()) / (scores.max() - scores.min()) 

        scores = scores[scores >= threshold]
        scores = list(pd.Series(scores.sort_values(ascending=False).index))
        scores = [index_item_id_mapping[a] for a in scores]
        relevant = list(new_test_set[new_test_set['user_id'] == index_user_id_mapping[user]]['app_id'])
        top = [s for s in scores if s in relevant]

        n_rel = len(relevant)
        n_rec_k = len(scores[:k])
        n_rel_and_rec_k = len(top[:k])

        precisions[index_user_id_mapping[user]] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[index_user_id_mapping[user]] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    return precisions, recalls


In [382]:
avg_precision, avg_recall, avg_f1, avg_accuracy = precision_recall_f1(model_t3, dataset2, interactions_new_train, item_features2, threshold=0.5)
print(f'Avg. Precision: {avg_precision}')
print(f'Avg. Recall: {avg_recall}')
print(f'Avg. F1: {avg_f1}')
print(f'Avg. Accuracy: {avg_accuracy}')

Avg. Precision: 0.7895659196834278
Avg. Recall: 0.9524032235143123
Avg. F1: 0.842535669442419
Avg. Accuracy: 0.7670138201812119


In [384]:
precision_k, recall_k = calculate_precision_recall_at_k(model_t3, dataset2, interactions_new_train, item_features2, k=5, threshold=0.6)
print('MAP@5: '+ str(sum(prec for prec in precision_k.values()) / len(precision_k)))

MAP@5: 0.9548943567798677


## Recommend new games

In [395]:
def get_top_n_games(model, dataset, interactions, item_features, user, n):
    mapping = dataset.mapping()
    index_user_id_mapping =  {v: k for k, v in mapping[0].items()}
    n_users, n_items = interactions.shape
        
    scores = pd.Series(model.predict(user,np.arange(n_items), item_features=item_features))
    scores = (scores - scores.min()) / (scores.max() - scores.min()) 

    #scores = scores[scores >= threshold]
    scores = list(pd.Series(scores.sort_values(ascending=False).index))
    scores = [index_item_id_mapping[a] for a in scores]
    relevant = list(new_test_set[new_test_set['user_id'] == index_user_id_mapping[user]]['app_id'])
    top = [s for s in scores if s not in relevant]

    return top[:n]

def get_titles_from_ids(id_list):
    list = []
    for i in id_list:
        list.append(games[games['app_id'] == i]['title'].iloc[0])
    return list

In [397]:
games_list = get_titles_from_ids(get_top_n_games(model_t3, dataset2, interactions_new_train, item_features2, user=464, n=10))
games_list

['Gunpoint',
 "Yoku's Island Express",
 'Heat Signature',
 'Slay the Spire',
 'The Binding of Isaac: Rebirth',
 'Crash Bandicoot™ N. Sane Trilogy',
 'Max Payne 3',
 'Elite Dangerous',
 'Parkitect',
 'Little Nightmares']

### Hyperparameter tuning

In [398]:
import optuna
from optuna.samplers import TPESampler

def objective(trial):
    # Define the search space
    loss = trial.suggest_categorical('loss', ['warp'])
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.25)
    no_components = trial.suggest_int('no_components', 10, 150)
    no_epochs = trial.suggest_int('no_epochs', 10, 100)

    mm = LightFM(loss=loss, random_state=42, learning_rate=learning_rate, no_components=no_components)
    mm = mm.fit(interactions_new_train,
                item_features=item_features2,
                  epochs=no_epochs,
                  num_threads=2, verbose=True)
    
    score = precision_at_k(mm, interactions_new_train, item_features=item_features2, k=5).mean()
    return score

study = optuna.create_study(study_name="lightfm_optimization",
                            direction="maximize",
                            sampler=TPESampler())

study.optimize(objective, n_trials=15)
print(study.best_params)
print(study.best_value)

[32m[I 2023-05-24 12:34:33,541][0m A new study created in memory with name: lightfm_optimization[0m
Epoch: 100%|██████████| 87/87 [01:58<00:00,  1.36s/it]
[32m[I 2023-05-24 12:36:37,041][0m Trial 0 finished with value: 0.4432067573070526 and parameters: {'loss': 'warp', 'learning_rate': 0.18435034002299955, 'no_components': 47, 'no_epochs': 87}. Best is trial 0 with value: 0.4432067573070526.[0m
Epoch: 100%|██████████| 93/93 [03:38<00:00,  2.35s/it]
[32m[I 2023-05-24 12:40:23,760][0m Trial 1 finished with value: 0.8136935234069824 and parameters: {'loss': 'warp', 'learning_rate': 0.15566768027959457, 'no_components': 127, 'no_epochs': 93}. Best is trial 1 with value: 0.8136935234069824.[0m
Epoch: 100%|██████████| 70/70 [02:08<00:00,  1.84s/it]
[32m[I 2023-05-24 12:42:37,788][0m Trial 2 finished with value: 0.5908264517784119 and parameters: {'loss': 'warp', 'learning_rate': 0.1673852974618393, 'no_components': 74, 'no_epochs': 70}. Best is trial 1 with value: 0.8136935234069

{'loss': 'warp', 'learning_rate': 0.20623504794076078, 'no_components': 147, 'no_epochs': 99}
0.9315191507339478


In [399]:
# train model on train data
model_tuned = LightFM(loss='warp', random_state=42, learning_rate=0.20623504794076078, no_components=147)
model_tuned = model_t.fit(interactions_new_train, item_features=item_features2, # sample_weight=weights_train,  
                  epochs=99,
                  num_threads=2, verbose=True)

Epoch: 100%|██████████| 99/99 [03:22<00:00,  2.04s/it]


In [401]:
avg_precision, avg_recall, avg_f1, avg_accuracy = precision_recall_f1(model_tuned, dataset2, interactions_new_train, item_features2, threshold=0.5)
print(f'Avg. Precision: {avg_precision}')
print(f'Avg. Recall: {avg_recall}')
print(f'Avg. F1: {avg_f1}')
print(f'Avg. Accuracy: {avg_accuracy}')

Avg. Precision: 0.7895235667557801
Avg. Recall: 0.9535510796990294
Avg. F1: 0.8431424306238733
Avg. Accuracy: 0.7680161365193054


In [402]:
precision_k, recall_k = calculate_precision_recall_at_k(model_tuned, dataset2, interactions_new_train, item_features2, k=5, threshold=0.6)
print('MAP@5: '+ str(sum(prec for prec in precision_k.values()) / len(precision_k)))

MAP@5: 0.9582642417758584
