## Imports

In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

## Load dataset

In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

csv_files = [
    '/content/drive/Shareddrives/WE ARE SPEED/dataset/users_final_games3.csv'
]

dfs = []

for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate all CSVs
combined_df = pd.concat(dfs, ignore_index=True)

combined_df.columns = [
    "ID",
    "PlayerID",
    "GameID",
    "GameName",
    "GameGenre",
    "RunID",
    "RunTime",
    "CategoryType",
    "PlayerCountry",
    "PlayerPronouns",
    "PlayerSignupDate"
]

combined_df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,ID,PlayerID,GameID,GameName,GameGenre,RunID,RunTime,CategoryType,PlayerCountry,PlayerPronouns,PlayerSignupDate
0,2,j4rz0md8\n,369p3p81,ULTRAKILL,FPS,znqpq3lz,46.839,per-level,Canada,,2020-10-11T02:01:05Z
1,3,j4rz0md8\n,369p3p81,ULTRAKILL,FPS,y8x83wwm,133.236,per-game,Canada,,2020-10-11T02:01:05Z
2,4,j4rz0md8\n,369p3p81,ULTRAKILL,FPS,y4x825kz,12.379,per-level,Canada,,2020-10-11T02:01:05Z
3,5,j4rz0md8\n,369p3p81,ULTRAKILL,FPS,y48730qm,20.592,per-level,Canada,,2020-10-11T02:01:05Z
4,6,j4rz0md8\n,369p3p81,ULTRAKILL,FPS,y8je9wxz,13.092,per-level,Canada,,2020-10-11T02:01:05Z


## Collaborative Filtering

### Player-Game Matrix

In [3]:
# Build player-game interaction matrix (show which players ran which games)
player_game_matrix = combined_df.pivot_table(
    index='PlayerID',
    columns='GameName',
    values='RunID',
    aggfunc='count',  # Count how many runs player did per game
    fill_value=0
)

# Turn into binary matrix (1 = played at least once, 0 = never)
player_game_matrix = player_game_matrix.applymap(lambda x: 1 if x > 0 else 0)

player_game_matrix.head()

  player_game_matrix = player_game_matrix.applymap(lambda x: 1 if x > 0 else 0)


GameName,!findseed the map,12 LOCKS: Plasticine room,12 Locks 3: Around the world,12 Locks II,12 Locks at FFGTV home,24 Killers,3D Maze,3D Pinball for Windows: Space Cadet,420BLAZEIT 2: GAME OF THE YEAR,44th Anniversary of the Birth of Hip Hop,...,get a snack at 4 am: SNACKCORE,hhGregg's Quest for Coupons,ivanzolo2004 horror,jumpNULL,lil gator game,shit3,singularium,Корейка Даша 2,Корейка Даша 3,ПОБЕГ ОТ ЛИЗОГУБА
PlayerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0jm5mrnx\n,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0jmqgkex\n,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18q2gyo8\n,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18v1rw58\n,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1xy4o3zx\n,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Train/Test Splitting

In [4]:
def train_test_split_player_game_matrix(player_game_matrix, test_fraction=0.2):
    train = player_game_matrix.copy()
    test = {}

    for player in player_game_matrix.index:
        played_games = player_game_matrix.columns[player_game_matrix.loc[player] == 1]
        if len(played_games) > 1:  # Only split players who played multiple games
            test_games = np.random.choice(played_games, size=max(1, int(len(played_games) * test_fraction)), replace=False)
            for game in test_games:
                train.at[player, game] = 0  # Mask this game in train
            test[player] = test_games

    return train, test

In [5]:
train_matrix, test_dict = train_test_split_player_game_matrix(player_game_matrix, test_fraction=0.2)

# Convert to sparse matrix for efficiency
train_matrix_sparse = csr_matrix(train_matrix.values)

### Model Training

In [6]:
player_sim_sparse_df = pd.DataFrame(
    cosine_similarity(train_matrix_sparse),  # Compute similarity between players
    index=train_matrix.index,
    columns=train_matrix.index
)

player_sim_sparse_df.head()

PlayerID,0jm5mrnx\n,0jmqgkex\n,18q2gyo8\n,18v1rw58\n,1xy4o3zx\n,5j52zlgj\n,68wepql8\n,68wgv1zx\n,68wrdwzj\n,7j4n39mx\n,...,xzlr5698\n,xzy4779j\n,xzy4p5rj\n,xzy4wq9j\n,xzy995rj\n,xzykrnej\n,xzyw5m9j\n,zx7ody6j\n,zx7vvvvx\n,zx7zl0q8\n
PlayerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0jm5mrnx\n,1.0,0.0,0.0,1.0,0.408248,0.0,0.0,0.57735,0.218218,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.707107
0jmqgkex\n,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.57735,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0
18q2gyo8\n,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18v1rw58\n,1.0,0.0,0.0,1.0,0.408248,0.0,0.0,0.57735,0.218218,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.707107
1xy4o3zx\n,0.408248,0.0,0.0,0.408248,1.0,0.0,0.0,0.235702,0.178174,0.408248,...,0.408248,0.0,0.408248,0.408248,0.0,0.408248,0.408248,0.408248,0.0,0.288675


In [7]:
# Recommender function
def recommend_games_for_player(player_id, player_game_matrix, player_sim_df, top_n=5):
    # 1. Get the similarity scores for the player
    similar_players = player_sim_df[player_id].sort_values(ascending=False)
    similar_players = similar_players.drop(player_id)  # Remove self

    # 2. Get games the player hasn't played yet
    player_games = player_game_matrix.loc[player_id]
    games_not_played = player_games[player_games == 0].index.tolist()

    # 3. Score unplayed games based on similar players
    game_scores = {}
    for sim_player_id, similarity_score in similar_players.items():
        sim_player_games = player_game_matrix.loc[sim_player_id]
        for game in games_not_played:
            if sim_player_games[game] > 0:
                game_scores[game] = game_scores.get(game, 0) + similarity_score

    # 4. Sort and recommend
    sorted_games = sorted(game_scores.items(), key=lambda x: x[1], reverse=True)
    recommended_games = [game for game, score in sorted_games[:top_n]]

    return recommended_games


### Evaluation Metrics

In [8]:
import random

# Helper functions to compute NDCG
def dcg_at_k(ranked_list, true_items, k):
    # Compute Discounted Cumulative Gain at rank K
    dcg = 0.0
    for i in range(k):
        if ranked_list[i] in true_items:
            dcg += 1 / np.log2(i + 2)  # log2(rank+1)
    return dcg
def ndcg_at_k(ranked_list, true_items, k):
    # Compute Normalized DCG
    dcg = dcg_at_k(ranked_list, true_items, k)
    # Ideal DCG = best case where relevant items are ranked first
    ideal_dcg = sum(1 / np.log2(i + 2) for i in range(min(len(true_items), k)))
    if ideal_dcg == 0:
        return 0.0
    return dcg / ideal_dcg

# Define evaluation metrics
def evaluate_recommender(player_game_matrix, player_sim_df, test_dict, top_n=5, sampled_players=None):
    hits = 0
    total = 0
    precision_total = 0
    ndcg_total = 0

    players = sampled_players if sampled_players is not None else test_dict.keys()

    for player in players:
        true_games = test_dict[player]
        if player not in player_sim_df.index:
            continue

        recs = recommend_games_for_player(player, player_game_matrix, player_sim_df, top_n)

        hit = any(game in recs for game in true_games)
        hits += int(hit)

        precision = len(set(recs) & set(true_games)) / len(recs)
        precision_total += precision

        ndcg = ndcg_at_k(recs, true_games, top_n)
        ndcg_total += ndcg

        total += 1

    hit_rate = hits / total
    avg_precision = precision_total / total
    avg_ndcg = ndcg_total / total

    return hit_rate, avg_precision, avg_ndcg

In [9]:
# Run evaluation function
hit_rate, avg_precision, avg_ndcg = evaluate_recommender(
    player_game_matrix=train_matrix,
    player_sim_df=player_sim_sparse_df,
    test_dict=test_dict,
    top_n=5,
)

print(f"Hit Rate: {hit_rate:.6f}")
print(f"Average Precision: {avg_precision:.6f}")
print(f"Average NDCG: {avg_ndcg:.6f}")

Hit Rate: 0.514925
Average Precision: 0.102985
Average NDCG: 0.438266
