In [None]:
pip install nmslib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nmslib
  Downloading nmslib-2.1.1-cp37-cp37m-manylinux2010_x86_64.whl (13.5 MB)
[K     |████████████████████████████████| 13.5 MB 5.2 MB/s 
[?25hCollecting pybind11<2.6.2
  Downloading pybind11-2.6.1-py2.py3-none-any.whl (188 kB)
[K     |████████████████████████████████| 188 kB 45.3 MB/s 
Installing collected packages: pybind11, nmslib
Successfully installed nmslib-2.1.1 pybind11-2.6.1


In [None]:
pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Downloading implicit-0.5.2-cp37-cp37m-manylinux2014_x86_64.whl (18.5 MB)
[K     |████████████████████████████████| 18.5 MB 510 kB/s 
Installing collected packages: implicit
Successfully installed implicit-0.5.2


In [None]:
# Colab-only step: mount Google Drive so the paths under /content/gdrive
# used by the following cells are reachable.
from google.colab import drive 
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import sys

# Put the project folder on Drive at the front of the import path
# (presumably where the `resources` module imported below lives).
project_dir = '/content/gdrive/My Drive/YCNG235/Project'
sys.path.insert(0, project_dir)

In [None]:
import numpy as np
import time
from scipy.sparse import csr_matrix
from implicit.nearest_neighbours import bm25_weight
from implicit.als import AlternatingLeastSquares
from implicit.ann.nmslib import NMSLibModel
from implicit.evaluation import train_test_split, mean_average_precision_at_k
from resources import *

  f"CUDA extension is built, but disabling GPU support because of '{e}'",


## Data

In [None]:
# Load the Steam interaction log through the project's CSV helper.
steam_csv_path = '/content/gdrive/My Drive/YCNG235/Project/archive/steam-200k.csv'
df = read_csv_data(steam_csv_path)

In [None]:
# Column dtypes, non-null counts and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   user_id     200000 non-null  int64  
 1   game_title  200000 non-null  object 
 2   behavior    200000 non-null  object 
 3   value       200000 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 6.1+ MB


In [None]:
# Peek at the first three rows to see how purchase and play events are encoded.
df.head(3)

Unnamed: 0,user_id,game_title,behavior,value
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0
1,151603712,The Elder Scrolls V Skyrim,play,273.0
2,151603712,Fallout 4,purchase,1.0


In [None]:
# Report how many distinct users and games appear in the interaction log.
n_users = df["user_id"].nunique()
n_games = df["game_title"].nunique()
logger.info(f'The number of users: {n_users}')
logger.info(f'The number of games: {n_games}')

2022-06-14 16:17:28,295 : INFO : The number of users: 12393
2022-06-14 16:17:28,320 : INFO : The number of games: 5155


In [None]:
# Top games by mean `value`. The mean is taken over every row of a game, so
# purchase rows (value 1.0) are averaged together with play rows.
# numeric_only=True states explicitly that only the numeric columns are
# aggregated, instead of relying on pandas dropping the text `behavior`
# column implicitly (newer pandas raises in that case).
(
    df.groupby(['game_title'])
    .mean(numeric_only=True)
    .sort_values(by="value", ascending=False)
    .head()
)

Unnamed: 0_level_0,user_id,value
game_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Eastside Hockey Manager,213854300.0,648.0
FIFA Manager 09,46301760.0,206.0
Perpetuum,67231110.0,200.9875
Football Manager 2012,74587870.0,194.501887
Football Manager 2014,103265200.0,194.017722


In [None]:
# Top games by total `value` over all of their rows.
# numeric_only=True states explicitly that only the numeric columns are
# aggregated, instead of relying on pandas dropping the text `behavior`
# column implicitly (newer pandas raises in that case).
(
    df.groupby(['game_title'])
    .sum(numeric_only=True)
    .sort_values(by="value", ascending=False)
    .head()
)

Unnamed: 0_level_0,user_id,value
game_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Dota 2,1752489713804,986525.6
Counter-Strike Global Offensive,398056581703,324183.6
Team Fortress 2,635214434972,175996.3
Counter-Strike,87100259484,135117.1
Sid Meier's Civilization V,107677794485,100417.3


In [None]:
# Work on a copy so the raw frame `df` stays untouched.
steam_df = df.copy()

# Binary "like" flag: 1 when `value` exceeds 60, else 0. Purchase rows carry
# value 1.0, so only play rows can be flagged. Vectorised comparison replaces
# the per-row Python loop and yields the same 0/1 integers.
steam_df['like'] = (steam_df['value'] > 60).astype(int)

# Only the last expression of a cell is rendered, so the class balance must be
# displayed explicitly; previously its result was silently discarded.
display(steam_df['like'].value_counts())
steam_df.head()

Unnamed: 0,user_id,game_title,behavior,value,like
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,1
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,1
4,151603712,Spore,purchase,1.0,0


In [None]:
# Number of liked interactions per game, most-liked first. `like` is a 0/1
# flag, so summing it per game counts the rows where it equals 1.
sub_steam_df = (
    steam_df.groupby('game_title')['like']
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Move the group key out of the index so `game_title` becomes a regular column.
# NOTE(review): the name is rebound to its own result, so this cell is not
# idempotent; run it once per kernel session.
sub_steam_df = sub_steam_df.reset_index()

In [None]:
# Distribution of per-game like counts at the chosen percentiles.
like_percentiles = [.25, .5, .75, .95, 1]
sub_steam_df['like'].describe(percentiles=like_percentiles)

count    5155.000000
mean        1.604074
std        22.671837
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
95%         3.000000
100%     1261.000000
max      1261.000000
Name: like, dtype: float64

In [None]:
# Keep only the games whose like count is strictly above the 95th percentile.
# (`first_quantile` holds that 95th-percentile cut-off despite its name.)
like_counts = sub_steam_df['like']
first_quantile = np.percentile(like_counts, 95)
mask = like_counts.gt(first_quantile)
sub_steam_df_group = sub_steam_df.loc[mask]
sub_steam_df_group

Unnamed: 0,game_title,like
0,Dota 2,1261
1,Counter-Strike Global Offensive,689
2,Team Fortress 2,391
3,The Elder Scrolls V Skyrim,293
4,Sid Meier's Civilization V,211
...,...,...
225,L.A. Noire,4
226,Magic The Gathering Duels of the Planeswalker...,4
227,Magic Duels,4
228,Batman Arkham Asylum GOTY Edition,4


In [None]:
# Index the selected games by title so membership can be tested against the index.
# NOTE(review): the name is rebound to its own result, so this cell is not
# idempotent; run it once per kernel session.
sub_steam_df_group = sub_steam_df_group.set_index('game_title')

In [None]:
# Display the selected games, now indexed by title.
sub_steam_df_group

Unnamed: 0_level_0,like
game_title,Unnamed: 1_level_1
Dota 2,1261
Counter-Strike Global Offensive,689
Team Fortress 2,391
The Elder Scrolls V Skyrim,293
Sid Meier's Civilization V,211
...,...
L.A. Noire,4
Magic The Gathering Duels of the Planeswalkers 2012,4
Magic Duels,4
Batman Arkham Asylum GOTY Edition,4


## Train

In [None]:
# Mark every interaction row with whether its game belongs to the selected
# top-liked set: 1 = in the set, 0 = outside it.
in_selected_games = df['game_title'].isin(sub_steam_df_group.index)
df = df.assign(group_flag=np.where(in_selected_games, 1, 0))

In [None]:
# Display the interaction frame with the new `group_flag` column.
df

Unnamed: 0,user_id,game_title,behavior,value,group_flag
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,1
1,151603712,The Elder Scrolls V Skyrim,play,273.0,1
2,151603712,Fallout 4,purchase,1.0,1
3,151603712,Fallout 4,play,87.0,1
4,151603712,Spore,purchase,1.0,1
...,...,...,...,...,...
199995,128470551,Titan Souls,play,1.5,0
199996,128470551,Grand Theft Auto Vice City,purchase,1.0,0
199997,128470551,Grand Theft Auto Vice City,play,1.5,0
199998,128470551,RUSH,purchase,1.0,0


In [None]:
# Cast the id columns to categoricals; their integer category codes are used
# as the row/column positions of the sparse matrix built in the next cell.
df = df.astype({'user_id': 'category', 'game_title': 'category'})

In [None]:
# Build the users x items interaction matrix in CSR format.
# Rows are user category codes, columns are game category codes, and every
# interaction row contributes 1.0. Repeated (user, game) rows collapse into a
# single cell (the recorded run stores 128804 elements for 200000 rows).
user_codes = df['user_id'].cat.codes.copy()
item_codes = df['game_title'].cat.codes.copy()
interaction_weights = np.ones(len(df))

user_item_data = csr_matrix(
    (interaction_weights, (user_codes, item_codes)),
    dtype=np.float32,
)

user_item_data

<12393x5155 sparse matrix of type '<class 'numpy.float32'>'
	with 128804 stored elements in Compressed Sparse Row format>

In [None]:
# user_item_data[0].data

In [None]:
# Lookup tables between raw identifiers and matrix positions.

# position -> raw value (index arrays, usable with fancy indexing)
item_arr = df['game_title'].cat.categories
user_arr = df['user_id'].cat.categories

# raw user id -> matrix row
user_dict = {raw_user: row for row, raw_user in enumerate(user_arr)}

# matrix column -> game title
item_dict = dict(enumerate(item_arr))

In [None]:
# Games outside the selected set (group_flag == 0) are to be excluded from
# personalized lists; collect their matrix column indices.

# game title -> matrix column
rev_item_dict = {title: col for col, title in item_dict.items()}

item_to_filter = df.loc[df['group_flag'] == 0, 'game_title'].unique()
logger.info(f'The number of games to filter: {len(item_to_filter)}')

# Translate the titles into column indices.
item_to_filter = [rev_item_dict[title] for title in item_to_filter]

2022-06-14 16:21:08,495 : INFO : The number of games to filter: 4925


In [None]:
# BM25-weight the interactions (applied on the items x users orientation) to
# damp repeated interactions and the weight given to popular items.
# Per the original author's note, bm25_weight returns a COO matrix.
# NOTE(review): `user_item_data` is overwritten with its weighted version, so
# re-running this cell applies the weighting a second time.
item_user_data = bm25_weight(user_item_data.transpose(), K1=100, B=0.75)

# Back to users x items, in CSR format.
user_item_data = item_user_data.transpose().tocsr()

In [None]:
# Training configuration

# Registry: short model name -> estimator class.
models = {
    'als': AlternatingLeastSquares,
}

# Keyword arguments passed to the estimator class registered under the same name.
# `random_state` seeds the model's random initialisation so that the factors,
# the MAP@K score and the recommendations can be reproduced across runs; the
# train/test split in this notebook is already seeded the same way.
params = {
    'als': {
        'factors': 64,
        'iterations': 15,
        'regularization': 0.05,
        'random_state': 0,
    },
}

In [None]:
# ALS: look up the estimator class and its keyword arguments, then instantiate.
model_name = 'als'
model_cls = models.get(model_name)
model_kwargs = params.get(model_name)
als_model = model_cls(**model_kwargs)

  "OpenBLAS detected. Its highly recommend to set the environment variable "


In [None]:
# Evaluation: hold out 20% of the interactions with a fixed seed.
train, test = train_test_split(user_item_data, train_percentage=0.8, random_state=0)

# Sanity checks: both splits keep every user row, and together they account
# for every stored interaction.
n_user_rows = user_item_data.shape[0]
n_interactions = user_item_data.data.shape[0]
assert train.shape[0] == n_user_rows and test.shape[0] == n_user_rows
assert train.data.shape[0] + test.data.shape[0] == n_interactions

In [None]:
# Fit on the training split and report how long the fit took.
start = time.time()
als_model.fit(train, show_progress=True)
end = time.time()
logger.info(f'The training time: {(end - start) } seconds')

# Score the fitted model on the held-out interactions with MAP@10.
start = time.time()
map_at_k = mean_average_precision_at_k(
    model=als_model,
    train_user_items=train,
    test_user_items=test,
    K=10,
    show_progress=True,
    num_threads=1
)
end = time.time()
# This interval measures the evaluation, not the fit; the message previously
# repeated "training time", which made the two log lines indistinguishable.
logger.info(f'The evaluation time: {(end - start) } seconds')
logger.info(f'MAP@10: {map_at_k}')

  0%|          | 0/15 [00:00<?, ?it/s]

2022-06-14 16:21:51,270 : INFO : The training time: 11.096437454223633 seconds


  0%|          | 0/5848 [00:00<?, ?it/s]

2022-06-14 16:21:52,479 : INFO : The training time: 1.2020173072814941 seconds
2022-06-14 16:21:52,487 : INFO : MAP@10: 0.14641117816635696


In [None]:
# Release the model fitted on the training split, then fit a fresh one on all
# interactions; this is the model used by the recommendation cells below.
als_model = None
final_cls, final_kwargs = models.get(model_name), params.get(model_name)
als_model = final_cls(**final_kwargs)
als_model.fit(user_item_data, show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
# Model artifact: log the learned user and game factor matrices and their shapes.
user_factors = als_model.user_factors
item_factors = als_model.item_factors

logger.info(f'Users embedding: \n {user_factors}')
logger.info(f'Users embedding shape : {user_factors.shape}')
logger.info(f'\n ----------------------------------------------------------------------')
logger.info(f'Games embedding: \n {item_factors}')
logger.info(f'Games embedding shape : {item_factors.shape}')

2022-06-14 16:22:09,793 : INFO : Users embedding: 
 [[-1.4202957e-01 -2.8291908e-01  4.1940609e-01 ... -7.8938358e-02
  -3.6766675e-01  1.9821189e-01]
 [-6.8136072e-01 -3.1687063e-01  6.0322171e-01 ... -2.6513787e-02
  -8.9989193e-02  1.0891138e+00]
 [ 1.8649212e-01 -1.0359119e+00  4.0760294e-01 ...  4.2899275e-01
   2.1341239e-01 -4.5889693e-01]
 ...
 [-1.6455287e-03  3.3075230e-03 -1.4904965e-02 ...  1.4533101e-02
   1.8369769e-03 -8.9559788e-03]
 [ 8.2448078e-04  2.9390884e-04 -2.3586799e-04 ...  1.6029491e-04
   1.7934956e-04 -5.7461212e-04]
 [ 8.2448061e-04  2.9390864e-04 -2.3586804e-04 ...  1.6029492e-04
   1.7934956e-04 -5.7461212e-04]]
2022-06-14 16:22:09,795 : INFO : Users embedding shape : (12393, 64)
2022-06-14 16:22:09,798 : INFO : 
 ----------------------------------------------------------------------
2022-06-14 16:22:09,800 : INFO : Games embedding: 
 [[ 1.32182310e-03  2.96235248e-03 -1.69222243e-02 ... -1.80816427e-02
  -1.25124985e-02  2.67140626e-04]
 [ 4.13074717e-0

## Personalization 

In [None]:
# Single user prediction (static history)

user = 5250

# Both counts below are row counts of the interaction log for this user
# (purchase and play events are separate rows).
user_rows = df[df["user_id"] == user]
logger.info(f'The number of games by the user: {len(user_rows)}')
logger.info(f'The number of games played: {len(user_rows[user_rows["behavior"] == "play"])}')

# Raw id -> matrix row. Note that the log line prints the raw id (`user`).
user_id = user_dict[user]
logger.info(f'user_id: {user}')

# Top-20 recommendations from the stored user factors, skipping games the
# user already interacted with; no extra item filter is applied here.
start = time.time()
ids, scores = als_model.recommend(
    userid=user_id,
    user_items=user_item_data[user_id],  # this user's row of the interaction matrix
    N=20,
    filter_already_liked_items=True,     # "discovery": hide known games
    filter_items=None,
    recalculate_user=False,              # static: reuse the fitted user factors
    items=None,                          # no restriction to a candidate list
)
end = time.time()
logger.info(f'The elapsed time: {(end - start) * 1000} milliseconds')

# Attach the interaction-log columns to each recommended title.
# NOTE(review): the join is on `game_title` against the full log, so each
# recommendation is repeated once per matching log row (8737 rows for 20
# recommendations in the recorded run); the later cells de-duplicate again.
interaction_cols = df[['user_id', 'game_title', 'behavior', 'value', 'group_flag']]
single_user_pred = pd.DataFrame({'game_title': item_arr[ids], 'score': scores}).merge(
    interaction_cols,
    how='left',
    on='game_title',
)


2022-06-14 16:39:04,619 : INFO : The number of games by the user: 27
2022-06-14 16:39:04,636 : INFO : The number of games played: 6
2022-06-14 16:39:04,639 : INFO : user_id: 5250
2022-06-14 16:39:04,643 : INFO : The elapsed time: 1.1439323425292969 milliseconds


In [None]:
# Display the recommendations joined with the interaction-log columns.
single_user_pred

Unnamed: 0,game_title,score,user_id,behavior,value,group_flag
0,Counter-Strike Condition Zero,0.413108,30695285,purchase,1.0,1
1,Counter-Strike Condition Zero,0.413108,30695285,play,36.0,1
2,Counter-Strike Condition Zero,0.413108,48845802,purchase,1.0,1
3,Counter-Strike Condition Zero,0.413108,48845802,play,60.0,1
4,Counter-Strike Condition Zero,0.413108,54103616,purchase,1.0,1
...,...,...,...,...,...,...
8733,Black Mesa,0.073694,2753525,purchase,1.0,0
8734,Black Mesa,0.073694,2753525,play,2.9,0
8735,Black Mesa,0.073694,54310644,purchase,1.0,0
8736,Black Mesa,0.073694,208900216,purchase,1.0,0


In [None]:
# Keep only the recommended title and its score.
single_user_pred_score = single_user_pred.loc[:, ['game_title', 'score']]

In [None]:
# Collapse the rows repeated by the join: one row per recommended game.
single_user_pred_score.drop_duplicates()

Unnamed: 0,game_title,score
0,Counter-Strike Condition Zero,0.413108
904,Counter-Strike Condition Zero Deleted Scenes,0.404833
1677,Half-Life Deathmatch Source,0.346838
1959,Half-Life Source,0.336584
2186,Day of Defeat Source,0.279267
2761,Left 4 Dead,0.20366
3246,Left 4 Dead 2,0.110608
4998,Synergy,0.105079
5086,Call of Duty 4 Modern Warfare,0.093664
5213,The Stanley Parable,0.093244


In [None]:
# Non-personalized user-to-user recommendation

user = 5250

# Translate the raw user id into its matrix row.
user_id = user_dict[user]

# Ten nearest users according to the fitted ALS model.
start = time.time()
ids, scores = als_model.similar_users(userid=user_id, N=10)
end = time.time()
logger.info(f'The elapsed time: {(end - start) * 1000} milliseconds')

# Map the returned rows back to raw user ids for display.
user_sim_df = pd.DataFrame({'user_id': user_arr[ids], 'scores': scores})


2022-06-14 16:40:58,227 : INFO : The elapsed time: 11.28077507019043 milliseconds


In [None]:
# Display the most similar users; in the recorded run the query user comes first with score 1.0.
user_sim_df

Unnamed: 0,user_id,scores
0,5250,1.0
1,15841597,0.964817
2,49832324,0.96385
3,5949488,0.962971
4,9209946,0.962365
5,7163917,0.96012
6,18066817,0.959697
7,16080105,0.959692
8,21590667,0.956488
9,28453352,0.955783


In [None]:
# Non-personalized item-to-item recommendation using ALS

game = "Dota 2"

# Translate the query title into its matrix column. The `game` variable is
# used here so that changing the title above actually changes the query
# (the title used to be repeated as a second literal).
item_id = rev_item_dict[game]

# Ten nearest games according to the fitted ALS model.
start = time.time()
ids, scores = als_model.similar_items(
    itemid=item_id,
    N=10,
)
end = time.time()
logger.info(f'The elapsed time: {(end - start) * 1000} milliseconds')

# Map the returned columns back to game titles for display.
als_sim_df = pd.DataFrame({
    'game_title': item_arr[ids],
    'scores': scores
})

# Attach the interaction-log columns to each similar title.
# NOTE(review): the join is against the full log, so every title is repeated
# once per matching log row; the later cells de-duplicate again.
als_sim_df = pd.merge(als_sim_df,
                      df[['user_id', 'game_title', 'behavior', 'value', 'group_flag']],
                      how='left',
                      on='game_title')

2022-06-14 16:45:41,777 : INFO : The elapsed time: 2.8629302978515625 milliseconds


In [None]:
# Keep only the similar title and its score.
als_sim_df = als_sim_df.loc[:, ['game_title', 'scores']]

In [None]:
# Collapse the rows repeated by the join: one row per similar game.
als_sim_df.drop_duplicates()

Unnamed: 0,game_title,scores
0,Dota 2,1.0
9682,Warframe,0.779741
10953,Neverwinter,0.774421
11377,Nosgoth,0.694288
11902,GunZ 2 The Second Duel,0.679035
12192,Magicka Wizard Wars,0.66779
12545,TERA,0.666322
12887,Quake Live,0.665634
13178,Dead Island Epidemic,0.652218
13658,Archeblade,0.648354


In [None]:
# Approximate nearest neighbours

# A fresh ALS estimator, built from the same registry entry as before.
model_name = 'als'
als = models.get(model_name)(**params.get(model_name))

# Wrap it in the NMSLib model using the 'hnsw' method, with the approximate
# mode switched on for both similar_items and recommend.
nms_options = {
    'approximate_similar_items': True,
    'approximate_recommend': True,
    'method': 'hnsw',
}
nms_model = NMSLibModel(model=als, **nms_options)

nms_model.fit(user_item_data, show_progress=True)

  0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
# Single user prediction (static history) with filtered items

user = 5250

# Both counts below are row counts of the interaction log for this user
# (purchase and play events are separate rows).
user_rows = df[df["user_id"] == user]
logger.info(f'The number of games by the user: {len(user_rows)}')
logger.info(f'The number of games played: {len(user_rows[user_rows["behavior"] == "play"])}')

# Raw id -> matrix row. Note that the log line prints the raw id (`user`).
user_id = user_dict[user]
logger.info(f'user_id: {user}')

# Top-20 recommendations from the NMSLib-backed model, excluding games the
# user already has and every game listed in `item_to_filter`.
start = time.time()
ids, scores = nms_model.recommend(
    userid=user_id,
    user_items=user_item_data[user_id],  # this user's row of the interaction matrix
    N=20,
    filter_already_liked_items=True,
    filter_items=item_to_filter,         # column indices of games with group_flag == 0
    recalculate_user=False,
    items=None,
)
end = time.time()
logger.info(f'The elapsed time: {(end - start) * 1000} milliseconds')

# Attach the interaction-log columns to each recommended title.
# NOTE(review): the join is against the full log, so every title is repeated
# once per matching log row; the later cells de-duplicate again.
interaction_cols = df[['user_id', 'game_title', 'behavior', 'value', 'group_flag']]
personalization = pd.DataFrame({'game_title': item_arr[ids], 'scores': scores}).merge(
    interaction_cols,
    how='left',
    on='game_title',
)

2022-06-14 16:50:35,023 : INFO : The number of games by the user: 27
2022-06-14 16:50:35,044 : INFO : The number of games played: 6
2022-06-14 16:50:35,047 : INFO : user_id: 5250
2022-06-14 16:50:35,053 : INFO : The elapsed time: 3.4232139587402344 milliseconds


In [None]:
# Display the filtered recommendations joined with the interaction-log columns.
personalization

Unnamed: 0,game_title,scores,user_id,behavior,value,group_flag
0,Counter-Strike Condition Zero,0.408666,30695285,purchase,1.0,1
1,Counter-Strike Condition Zero,0.408666,30695285,play,36.0,1
2,Counter-Strike Condition Zero,0.408666,48845802,purchase,1.0,1
3,Counter-Strike Condition Zero,0.408666,48845802,play,60.0,1
4,Counter-Strike Condition Zero,0.408666,54103616,purchase,1.0,1
...,...,...,...,...,...,...
11435,Kerbal Space Program,0.034925,264626784,play,0.3,1
11436,Kerbal Space Program,0.034925,34901647,purchase,1.0,1
11437,Kerbal Space Program,0.034925,34901647,play,83.0,1
11438,Kerbal Space Program,0.034925,135400225,purchase,1.0,1


In [None]:
# Keep only the recommended title and its score.
personalization = personalization.loc[:, ['game_title', 'scores']]

In [None]:
# Collapse the rows repeated by the join: one row per recommended game.
personalization.drop_duplicates()

Unnamed: 0,game_title,scores
0,Counter-Strike Condition Zero,0.408666
904,Day of Defeat Source,0.27965
1479,Left 4 Dead,0.20991
1964,Left 4 Dead 2,0.113728
3716,Call of Duty 4 Modern Warfare,0.094897
3843,Zombie Panic Source,0.089076
4015,Counter-Strike Global Offensive,0.076806
6804,Mass Effect,0.068843
6969,Battlefield Bad Company 2,0.064033
7200,Prison Architect,0.063684


In [None]:
# Non-personalized item-to-item recommendation using NMS

# Query title. Named `game` to match the ALS item-to-item cell; it was
# previously assigned to an unused variable called `artist`, with the title
# repeated as a second literal in the lookup below.
game = "Dota 2"

# Translate the query title into its matrix column.
item_id = rev_item_dict[game]

# Ten nearest games according to the NMSLib-backed model.
start = time.time()
ids, scores = nms_model.similar_items(
    itemid=item_id,
    N=10,
)
end = time.time()
logger.info(f'The elapsed time: {(end - start) * 1000} milliseconds')

# Map the returned columns back to game titles for display.
nms_sim_df = pd.DataFrame({
    'game_title': item_arr[ids],
    'scores': scores
})

# Attach the interaction-log columns to each similar title.
# NOTE(review): the join is against the full log, so every title is repeated
# once per matching log row; the later cells de-duplicate again.
nms_sim_df = pd.merge(nms_sim_df,
                      df[['user_id', 'game_title', 'behavior', 'value', 'group_flag']],
                      how='left',
                      on='game_title')

2022-06-14 16:53:33,705 : INFO : The elapsed time: 0.7352828979492188 milliseconds


In [None]:
# Display the similar games joined with the interaction-log columns.
nms_sim_df

Unnamed: 0,game_title,scores,user_id,behavior,value,group_flag
0,Dota 2,1.000000,151603712,purchase,1.0,1
1,Dota 2,1.000000,151603712,play,0.5,1
2,Dota 2,1.000000,187131847,purchase,1.0,1
3,Dota 2,1.000000,187131847,play,2.3,1
4,Dota 2,1.000000,176410694,purchase,1.0,1
...,...,...,...,...,...,...
15335,Free to Play,0.657459,175941533,play,1.4,0
15336,Free to Play,0.657459,53097340,purchase,1.0,0
15337,Free to Play,0.657459,140086587,purchase,1.0,0
15338,Free to Play,0.657459,157314597,purchase,1.0,0


In [None]:
# Keep only the similar title and its score.
nms_sim_df = nms_sim_df.loc[:, ['game_title', 'scores']]

In [None]:
# Collapse the rows repeated by the join: one row per similar game.
nms_sim_df.drop_duplicates()

Unnamed: 0,game_title,scores
0,Dota 2,1.0
9682,Neverwinter,0.763877
10106,Nosgoth,0.739929
10631,Magicka Wizard Wars,0.736946
10984,Quake Live,0.684241
11275,Dead Island Epidemic,0.672018
11755,Counter-Strike Global Offensive,0.668353
14544,GunZ 2 The Second Duel,0.663822
14834,TERA,0.661036
15176,Free to Play,0.657459
