## Imports

In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

## Load dataset

In [9]:
# Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

csv_files = [
    'dataset/users_final_games1.csv',
    'dataset/users_final_games2.csv',
    'dataset/users_final_games3.csv'
]

dfs = []

for file in csv_files:
    df = pd.read_csv(file)
    # df = pd.read_csv(file, header=None)
    dfs.append(df)

# Concatenate all CSVs
combined_df = pd.concat(dfs, ignore_index=True)

combined_df.columns = [
    "ID",
    "PlayerID",
    "GameID",
    "GameName",
    "GameGenre",
    "RunID",
    "RunTime",
    "CategoryType",
    "PlayerCountry",
    "PlayerPronouns",
    "PlayerSignupDate"
]

combined_df.head()

Unnamed: 0,ID,PlayerID,GameID,GameName,GameGenre,RunID,RunTime,CategoryType,PlayerCountry,PlayerPronouns,PlayerSignupDate
0,1,xymr52yx\n,pd0qp0w1,Pawnbarian,,mkogd5lz,215.065,per-level,Brazil,"He/Him, They/Them",2022-03-30T17:13:27Z
1,2,xymr52yx\n,j1n59y1p,Beat Stomper,Arcade,y2wox4wy,0.225,per-game,Brazil,"He/Him, They/Them",2022-03-30T17:13:27Z
2,3,xymr52yx\n,w6j7gw46,Champion Island Games,RPG,zqqog91z,0.726,per-level,Brazil,"He/Him, They/Them",2022-03-30T17:13:27Z
3,4,xymr52yx\n,j1n59y1p,Beat Stomper,Arcade,yj9d1dgz,1.278,per-game,Brazil,"He/Him, They/Them",2022-03-30T17:13:27Z
4,5,xymr52yx\n,j1n59y1p,Beat Stomper,Arcade,znxqoq3y,1.225,per-game,Brazil,"He/Him, They/Them",2022-03-30T17:13:27Z


## Collaborative Filtering

### Player-Game Matrix

In [None]:
# Build player-game interaction matrix (show which players ran which games)
player_game_matrix = combined_df.pivot_table(
    index='PlayerID',
    columns='GameName',
    values='RunID',
    aggfunc='count',  # Count how many runs player did per game
    fill_value=0
)

# Turn into binary matrix (1 = played at least once, 0 = never)
player_game_matrix = player_game_matrix.applymap(lambda x: 1 if x > 0 else 0)

player_game_matrix.head()

  player_game_matrix = player_game_matrix.applymap(lambda x: 1 if x > 0 else 0)


GameName,!findseed the map,12 LOCKS: Plasticine room,12 Locks 3: Around the world,12 Locks II,12 Locks at FFGTV home,24 Killers,3D Maze,3D Pinball for Windows: Space Cadet,420BLAZEIT 2: GAME OF THE YEAR,44th Anniversary of the Birth of Hip Hop,...,get a snack at 4 am: SNACKCORE,hhGregg's Quest for Coupons,ivanzolo2004 horror,jumpNULL,lil gator game,shit3,singularium,Корейка Даша 2,Корейка Даша 3,ПОБЕГ ОТ ЛИЗОГУБА
PlayerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0jm5mrnx\n,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0jmqgkex\n,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18q2gyo8\n,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18v1rw58\n,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1xy4o3zx\n,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Train/Test Splitting

In [None]:
def train_test_split_player_game_matrix(player_game_matrix, test_fraction=0.2):
    train = player_game_matrix.copy()
    test = {}

    for player in player_game_matrix.index:
        played_games = player_game_matrix.columns[player_game_matrix.loc[player] == 1]
        if len(played_games) > 1:  # Only split players who played multiple games
            test_games = np.random.choice(played_games, size=max(1, int(len(played_games) * test_fraction)), replace=False)
            for game in test_games:
                train.at[player, game] = 0  # Mask this game in train
            test[player] = test_games

    return train, test

In [None]:
train_matrix, test_dict = train_test_split_player_game_matrix(player_game_matrix, test_fraction=0.2)

train_matrix_sparse = csr_matrix(train_matrix.values)

## Model Training

In [None]:
# Sample 2000 players randomly
# sampled_players = train_matrix.sample(n=2000, random_state=42)

player_sim_df_train = pd.DataFrame(
    # cosine_similarity(sampled_players), # Compute similarity for sampled players instead of entire train_matrix
    # index=sampled_players.index,
    # columns=sampled_players.index

    cosine_similarity(train_matrix_sparse),
    index=train_matrix.index,
    columns=train_matrix.index
)

player_sim_df_train.head()

PlayerID,0jm5mrnx\n,0jmqgkex\n,18q2gyo8\n,18v1rw58\n,1xy4o3zx\n,5j52zlgj\n,68wepql8\n,68wgv1zx\n,68wrdwzj\n,7j4n39mx\n,...,xzlr5698\n,xzy4779j\n,xzy4p5rj\n,xzy4wq9j\n,xzy995rj\n,xzykrnej\n,xzyw5m9j\n,zx7ody6j\n,zx7vvvvx\n,zx7zl0q8\n
PlayerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0jm5mrnx\n,1.0,0.0,0.447214,0.0,0.0,0.707107,0.57735,0.57735,0.218218,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.707107
0jmqgkex\n,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18q2gyo8\n,0.447214,0.0,1.0,0.0,0.0,0.316228,0.258199,0.258199,0.09759,0.447214,...,0.447214,0.0,0.447214,0.447214,0.0,0.447214,0.447214,0.447214,0.447214,0.316228
18v1rw58\n,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1xy4o3zx\n,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.089087,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Recommender Function

In [None]:
def recommend_games_for_player(player_id, player_game_matrix, player_sim_df, top_n=5):
    # 1. Get the similarity scores for the player
    similar_players = player_sim_df[player_id].sort_values(ascending=False)
    similar_players = similar_players.drop(player_id)  # Remove self

    # 2. Get games the player hasn't played yet
    player_games = player_game_matrix.loc[player_id]
    games_not_played = player_games[player_games == 0].index.tolist()

    # 3. Score unplayed games based on similar players
    game_scores = {}
    for sim_player_id, similarity_score in similar_players.items():
        sim_player_games = player_game_matrix.loc[sim_player_id]
        for game in games_not_played:
            if sim_player_games[game] > 0:
                game_scores[game] = game_scores.get(game, 0) + similarity_score

    # 4. Sort and recommend
    sorted_games = sorted(game_scores.items(), key=lambda x: x[1], reverse=True)
    recommended_games = [game for game, score in sorted_games[:top_n]]

    return recommended_games


In [None]:
# recommend_games_for_player(player_id='xymr52yx\n',
#                             player_game_matrix=player_game_matrix,
#                             player_sim_df=player_sim_df_train,
#                             top_n=5)


## Evaluation Metrics

In [None]:
import random

def dcg_at_k(ranked_list, true_items, k):
    """Compute Discounted Cumulative Gain at rank K"""
    dcg = 0.0
    for i in range(k):
        if ranked_list[i] in true_items:
            dcg += 1 / np.log2(i + 2)  # log2(rank+1)
    return dcg

def ndcg_at_k(ranked_list, true_items, k):
    """Compute Normalized DCG"""
    dcg = dcg_at_k(ranked_list, true_items, k)
    # Ideal DCG = best case where relevant items are ranked first
    ideal_dcg = sum(1 / np.log2(i + 2) for i in range(min(len(true_items), k)))
    if ideal_dcg == 0:
        return 0.0
    return dcg / ideal_dcg

# Sample 300 players from test_dict
# sampled_players = random.sample(list(test_dict.keys()), k=8)

def evaluate_recommender(player_game_matrix, player_sim_df, test_dict, top_n=5, sampled_players=None):
    hits = 0
    total = 0
    precision_total = 0
    ndcg_total = 0

    players = sampled_players if sampled_players is not None else test_dict.keys()

    for player in players:
        true_games = test_dict[player]
        if player not in player_sim_df.index:
            continue

        recs = recommend_games_for_player(player, player_game_matrix, player_sim_df, top_n)

        hit = any(game in recs for game in true_games)
        hits += int(hit)

        precision = len(set(recs) & set(true_games)) / len(recs)
        precision_total += precision

        ndcg = ndcg_at_k(recs, true_games, top_n)
        ndcg_total += ndcg

        total += 1

    hit_rate = hits / total
    avg_precision = precision_total / total
    avg_ndcg = ndcg_total / total

    return hit_rate, avg_precision, avg_ndcg

In [None]:
hit_rate, avg_precision, avg_ndcg = evaluate_recommender(
    player_game_matrix=train_matrix,
    player_sim_df=player_sim_df_train,
    test_dict=test_dict,
    top_n=5,
    # sampled_players=sampled_players
)

print(f"Hit Rate: {hit_rate:.6f}")
print(f"Average Precision: {avg_precision:.6f}")
print(f"Average NDCG: {avg_ndcg:.6f}")

Hit Rate: 0.481343
Average Precision: 0.096269
Average NDCG: 0.428349


Hit Rate@5: 0.52
Out of all players, 52% had at least one of their games correctly recommended in their top 5 suggestions.

Precision@5: 0.10
On average, 10% of the top 5 recommendations were actually correct.

NDCG
0.5–0.7 NDCG is good for recommenders.

Value	      Meaning
~1.0	      Perfect ranking (ideal)
~0.5	     	Some relevant games appear but not always at the top
~0.0		    Bad ranking (random guessing)

## Content-Based Filtering

### Game Feature Vectors

In [None]:
# 1. Calculate the average run time for each game
avg_run_time = combined_df.groupby('GameName')['RunTime'].mean()

# 2. One-hot encode game genres
genre_dummies = pd.get_dummies(combined_df[['GameName', 'GameGenre']].drop_duplicates().set_index('GameName'))

# 3. Merge into one game feature matrix
game_features = genre_dummies.join(avg_run_time)

# Fill missing values if any
game_features = game_features.fillna(0)

game_features.head()

Unnamed: 0_level_0,GameGenre_2D,GameGenre_2D Platformer,GameGenre_3D,GameGenre_3D Platformer,GameGenre_Action,GameGenre_Action Adventure,GameGenre_Action Platformer,GameGenre_Action Puzzler,GameGenre_Action RPG,GameGenre_Action-adventure,...,GameGenre_Top-down,GameGenre_Tower defense,GameGenre_Toys-to-life,GameGenre_Walking Simulator,GameGenre_Wholesome,GameGenre_horror game,GameGenre_movement shooter,GameGenre_point and click,GameGenre_retro,RunTime
GameName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ULTRAKILL,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,172.413789
ROBLOX: Tower Defense Simulator,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,1480.127821
Sonic Forces,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,84.82
Unbothered,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,229.817
Piano Tiles (Scratch),False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,10.967


### Compute game-game similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Game similarity matrix
game_similarity = pd.DataFrame(
    cosine_similarity(game_features),
    index=game_features.index,
    columns=game_features.index
)

game_similarity.head()

GameName,ULTRAKILL,ROBLOX: Tower Defense Simulator,Sonic Forces,Unbothered,Piano Tiles (Scratch),Red Ball,Bionicle,Duck Game,ULTRAKILL - Category Extensions,Escape the Backrooms,...,Sonic Mania,EA Sports UFC 4,EA Sports UFC 3,Accounting+,Hot Lava,Half-Life 2,Clustertruck Category Extensions,COASTLINE,Ghostrunner 2,Among Us Clicker
GameName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ULTRAKILL,1.0,0.999983,0.999914,0.999983,0.999983,0.999976,0.999983,0.996875,0.999992,0.999983,...,0.999491,0.707095,0.707095,0.999983,0.999929,0.999992,0.999982,0.999983,0.999983,0.998675
ROBLOX: Tower Defense Simulator,0.999983,1.0,0.99993,1.0,1.0,0.999993,1.0,0.996892,0.999998,0.999999,...,0.999508,0.707107,0.707107,1.0,0.999946,0.999998,0.999999,1.0,1.0,0.998692
Sonic Forces,0.999914,0.99993,1.0,0.999931,0.999931,0.999924,0.99993,0.996823,0.999929,0.99993,...,0.999808,0.707058,0.707058,0.999931,0.999876,0.999929,0.999929,0.999931,0.999931,0.998623
Unbothered,0.999983,1.0,0.999931,1.0,1.0,0.999993,1.0,0.996892,0.999998,0.999999,...,0.999508,0.707107,0.707107,1.0,0.999946,0.999998,0.999999,1.0,1.0,0.998692
Piano Tiles (Scratch),0.999983,1.0,0.999931,1.0,1.0,0.999993,1.0,0.996892,0.999998,0.999999,...,0.999508,0.707107,0.707107,1.0,0.999946,0.999998,0.999999,1.0,1.0,0.998692


### Content-Based Recommender Function

In [None]:
def recommend_games_content_based(player_id, player_game_matrix, game_similarity, top_n=5):
    if player_id not in player_game_matrix.index:
        return []

    # Games the player already played
    played_games = player_game_matrix.columns[player_game_matrix.loc[player_id] == 1]

    # Score all games by how similar they are to played games
    scores = game_similarity.loc[played_games].sum(axis=0)

    # Remove already played games from recommendations
    scores = scores.drop(played_games, errors='ignore')

    # Get top recommendations
    recommended_games = scores.sort_values(ascending=False).head(top_n).index.tolist()

    return recommended_games

In [None]:
recommend_games_content_based(
    player_id='18v0n0nx\n',
    player_game_matrix=player_game_matrix,
    game_similarity=game_similarity,
    top_n=5
)

[]