In [None]:
# from google.colab import drive

# drive.mount('/content/drive')

Mounted at /content/drive


# Speed Run Game Recommender - SVD Model

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split

from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from collections import defaultdict

### Import data

In [None]:
# csv_files = [
#     # '/content/drive/Shared drives/WE ARE SPEED/dataset/users_final_games1.csv',
#     '/content/drive/Shared drives/WE ARE SPEED/dataset/users_final_games2.csv',
#     '/content/drive/Shared drives/WE ARE SPEED/dataset/users_final_games3_withHeader.csv'
# ]

csv_files = [
    'dataset/users_final_games1.csv',
    'dataset/users_final_games2.csv',
    'dataset/users_final_games3.csv'
]

dfs = []

for file in csv_files:
    df = pd.read_csv(file)
    # df = pd.read_csv(file, header=None)
    dfs.append(df)

# Concatenate all CSVs
data = pd.concat(dfs, ignore_index=True)

data.columns = [
    "ID",
    "PlayerID",
    "GameID",
    "GameName",
    "GameGenre",
    "RunID",
    "RunTime",
    "CategoryType",
    "PlayerCountry",
    "PlayerPronouns",
    "PlayerSignupDate"
]

# Display basic information about the dataset
print("\nDataset Overview:")
print(f"Number of records: {len(data)}")
print(f"Columns: {data.columns.tolist()}")

# Check for missing values
print("\nMissing values per column:")
display(data.isnull().sum())

data.head()


Dataset Overview:
Number of records: 161840
Columns: ['ID', 'PlayerID', 'GameID', 'GameName', 'GameGenre', 'RunID', 'RunTime', 'CategoryType', 'PlayerCountry', 'PlayerPronouns', 'PlayerSignupDate']

Missing values per column:


Unnamed: 0,0
ID,0
PlayerID,0
GameID,0
GameName,0
GameGenre,56711
RunID,0
RunTime,0
CategoryType,0
PlayerCountry,8416
PlayerPronouns,65639


Unnamed: 0,ID,PlayerID,GameID,GameName,GameGenre,RunID,RunTime,CategoryType,PlayerCountry,PlayerPronouns,PlayerSignupDate
0,1,j52v6ozj\n,o1y9j9v6,Celeste,2D Platformer,mrx9177m,2605.743,per-game,Scotland,She/Her,2022-04-22T20:41:25Z
1,2,j52v6ozj\n,46w3xrq1,Tetris (Web),,y6xkwe6y,17.433,per-game,Scotland,She/Her,2022-04-22T20:41:25Z
2,3,j52v6ozj\n,46w3xrq1,Tetris (Web),,z0porq9y,33.0,per-game,Scotland,She/Her,2022-04-22T20:41:25Z
3,4,j52v6ozj\n,j1ne9me1,Celeste Category Extensions,2D Platformer,z52v70nz,29.818,per-game,Scotland,She/Her,2022-04-22T20:41:25Z
4,5,8ge97w1j\n,o1y9j9v6,Celeste,2D Platformer,yv2rn9em,2894.811,per-game,,,2020-06-24T18:09:48Z


### Data preprocessing

In [19]:
# Remove duplicates if any
data = data.drop_duplicates()
print(f"Dataset size after removing duplicates: {len(data)}")

# Create mappings
game_id_to_name = dict(zip(data['GameID'], data['GameName']))
print(f"Number of unique players: {data['PlayerID'].nunique()}")
print(f"Number of unique games: {data['GameID'].nunique()}")

# Add recency and frequency features to the model
def add_player_and_game_features(data):
    """Add player experience, run frequency, and game popularity features."""

    # Drop any conflicting columns first
    data = data.drop(columns=[
        'TotalRuns', 'GamePopularity',
        'TotalRuns_x', 'GamePopularity_x',
        'TotalRuns_y', 'GamePopularity_y'
    ], errors='ignore')

    # If we have timestamp information
    if 'PlayerSignUpDate' in data.columns:
        # Convert to datetime
        data['PlayerSignUpDate'] = pd.to_datetime(data['PlayerSignUpDate'], errors='coerce')

        # Calculate experience (days since sign up)
        data['PlayerExperience'] = (pd.Timestamp.now() - data['PlayerSignUpDate']).dt.days

        # Group players by experience level
        data['ExperienceGroup'] = pd.qcut(data['PlayerExperience'], 4, labels=['Novice', 'Intermediate', 'Experienced', 'Veteran'])

    # Add total run count per player
    player_run_counts = data.groupby('PlayerID')['RunID'].nunique().reset_index()
    player_run_counts.columns = ['PlayerID', 'TotalRuns']
    data = data.merge(player_run_counts, on='PlayerID', how='left')

    # Add game popularity as a feature
    game_popularity = data.groupby('GameID')['RunID'].nunique().reset_index()
    game_popularity.columns = ['GameID', 'GamePopularity']
    data = data.merge(game_popularity, on='GameID', how='left')

    return data

data = add_player_and_game_features(data)
print("Added extra features to the dataset.")

# print(data.columns.tolist())

# Keep only players with at least 2 interactions
player_counts = data['PlayerID'].value_counts()
valid_players = player_counts[player_counts >= 2].index
data = data[data['PlayerID'].isin(valid_players)]

print(f"Dataset size after filtering players with at least 2 interactions: {len(data)}")
print(f"Number of unique players after filtering: {data['PlayerID'].nunique()}")

# Split data into training (80%) and testing (20%)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

print(f"Training set size: {len(train_data)}")
print(f"Test set size: {len(test_data)}")

# Ensure all test players exist in training
test_data = test_data[test_data['PlayerID'].isin(train_data['PlayerID'])]

# Create player-game interaction matrices
train_player_game_matrix = pd.crosstab(train_data['PlayerID'], train_data['GameID'])
test_player_game_matrix = pd.crosstab(test_data['PlayerID'], test_data['GameID'])

# Normalize
train_player_game_matrix = train_player_game_matrix.div(train_player_game_matrix.sum(axis=1), axis=0).fillna(0)
test_player_game_matrix = test_player_game_matrix.div(test_player_game_matrix.sum(axis=1), axis=0).fillna(0)


Dataset size after removing duplicates: 161840
Number of unique players: 14524
Number of unique games: 9435
Added extra features to the dataset.
Dataset size after filtering players with at least 2 interactions: 155136
Number of unique players after filtering: 7820
Training set size: 124108
Test set size: 31028


### Build Recommendation Model

In [20]:
# Method 1: Collaborative Filtering - Game Similarity
game_similarity = cosine_similarity(train_player_game_matrix.T)
game_similarity_df = pd.DataFrame(
    game_similarity,
    index=train_player_game_matrix.columns,
    columns=train_player_game_matrix.columns
)

def get_similar_games(game_id, n=5):
    """Get the most similar games to a given game based on player overlap"""
    if game_id not in game_similarity_df.index:
        print(f"Game ID {game_id} not found in the dataset")
        return pd.Series()

    similar_games = game_similarity_df[game_id].sort_values(ascending=False)[1:n+1]
    return similar_games

# Method 2: Player-based recommendations
def recommend_games_for_player(player_id, n=5):
    """Recommend games for a player based on similar players' game choices"""
    if player_id not in train_player_game_matrix.index:
        print(f"Player ID {player_id} not found in the dataset")
        return pd.Series()

    # Get the games this player has already played
    player_games = set(train_player_game_matrix.loc[player_id][train_player_game_matrix.loc[player_id] > 0].index)

    # Calculate score for each game based on similarity of players who played it
    game_scores = defaultdict(float)

    for other_player in train_player_game_matrix.index:
        if other_player == player_id:
            continue

        # Get games played by the other player
        other_player_games = set(train_player_game_matrix.loc[other_player][train_player_game_matrix.loc[other_player] > 0].index)

        # Calculate Jaccard similarity (intersection over union)
        common_games = player_games.intersection(other_player_games)
        if not player_games or not other_player_games:
            similarity = 0
        else:
            similarity = len(common_games) / len(player_games.union(other_player_games))

        # Add score for each game the other player has played that our player hasn't
        for game in other_player_games - player_games:
            game_scores[game] += similarity

    # Sort games by score and return top n
    recommendations = pd.Series(game_scores).sort_values(ascending=False).head(n)

    # Convert GameIDs to game names for better readability
    if len(recommendations) > 0:
        recommendations.index = [game_id_to_name.get(game_id, f"Game {game_id}") for game_id in recommendations.index]

    return recommendations

def build_content_based_model():
    """Build a content-based recommendation model using game genres"""
    if 'GameGenre' not in data.columns or data['GameGenre'].isna().all():
        print("Genre data not available or empty. Skipping content-based model.")
        return None

    # Create a unique game-genre dataset (only include games with genre data)
    game_genres = data[['GameID', 'GameName', 'GameGenre']].dropna(subset=['GameGenre']).drop_duplicates()

    if len(game_genres) == 0:
        print("No valid genre data found. Skipping content-based model.")
        return None

    print(f"Building content-based model with {len(game_genres)} games that have genre information")

    # One-hot encode genres (assuming genres are comma-separated)
    game_genres['GenreList'] = game_genres['GameGenre'].str.split(',')

    # Ensure GenreList is properly formatted and handle any potential NaN values
    valid_genres = []
    for genres in game_genres['GenreList']:
        if isinstance(genres, list):
            valid_genres.extend([g.strip() for g in genres if isinstance(g, str)])

    unique_genres = list(set(valid_genres))

    if not unique_genres:
        print("No valid genres found. Skipping content-based model.")
        return None

    # Create genre feature matrix
    genre_matrix = pd.DataFrame(0, index=game_genres['GameID'], columns=unique_genres)

    for _, row in game_genres.iterrows():
        if isinstance(row['GenreList'], list):
            for genre in row['GenreList']:
                if isinstance(genre, str) and genre.strip() in unique_genres:
                    genre_matrix.loc[row['GameID'], genre.strip()] = 1

    # Remove any remaining NaN values
    genre_matrix.fillna(0, inplace=True)

    # Calculate genre-based similarity
    genre_similarity = cosine_similarity(genre_matrix)
    genre_sim_df = pd.DataFrame(
        genre_similarity,
        index=genre_matrix.index,
        columns=genre_matrix.index
    )

    return genre_sim_df

genre_sim_df = build_content_based_model()

Building content-based model with 3935 games that have genre information


### Upgraded recommendation function (with error handling)

In [21]:
def get_recommendations(player_id=None, game_id=None, n=5, method='hybrid'):
    """Get game recommendations using the specified method"""
    try:
        if method == 'collaborative' and game_id is not None:
            # Similar games recommendation
            similar_games = get_similar_games(game_id, n)
            if len(similar_games) > 0:
                similar_games.index = [game_id_to_name.get(g_id, f"Game {g_id}") for g_id in similar_games.index]
            return similar_games

        elif method == 'user_based' and player_id is not None:
            # User-based recommendation
            return recommend_games_for_player(player_id, n)

        elif method == 'content_based' and game_id is not None and genre_sim_df is not None:
            # Content-based recommendation
            if game_id not in genre_sim_df.index:
                print(f"Game ID {game_id} not found in the genre dataset")
                return pd.Series()

            similar_games = genre_sim_df[game_id].sort_values(ascending=False)[1:n+1]
            similar_games.index = [game_id_to_name.get(g_id, f"Game {g_id}") for g_id in similar_games.index]
            return similar_games

        elif method == 'hybrid' and player_id is not None:
            # Hybrid recommendation
            # Get player's game history
            if player_id not in train_player_game_matrix.index:
                print(f"Player ID {player_id} not found in the dataset")
                return pd.Series()

            player_history = train_player_game_matrix.loc[player_id]
            player_games = player_history[player_history > 0].index.tolist()

            # If player has no games, return most popular games
            if not player_games:
                print("Player has no game history. Recommending popular games.")
                popular_games = data['GameID'].value_counts().head(n)
                popular_games.index = [game_id_to_name.get(g_id, f"Game {g_id}") for g_id in popular_games.index]
                return popular_games

            # Get collaborative filtering scores
            cf_scores = defaultdict(float)
            for game in player_games:
                similar_games = get_similar_games(game, 20)
                for sim_game, score in similar_games.items():
                    if sim_game not in player_games:  # Don't recommend games already played
                        cf_scores[sim_game] += score

            # Add content-based scores if available
            if genre_sim_df is not None:
                cb_scores = defaultdict(float)
                for game in player_games:
                    if game in genre_sim_df.index:
                        similar_games = genre_sim_df[game].sort_values(ascending=False)[1:20]
                        for sim_game, score in similar_games.items():
                            if sim_game not in player_games:  # Don't recommend games already played
                                cb_scores[sim_game] += score

                # Normalize and combine scores (0.7 weight to CF, 0.3 to content)
                if cf_scores:
                    cf_max = max(cf_scores.values()) if cf_scores else 1
                    for game in cf_scores:
                        cf_scores[game] /= cf_max

                if cb_scores:
                    cb_max = max(cb_scores.values()) if cb_scores else 1
                    for game in cb_scores:
                        cb_scores[game] /= cb_max

                # Combine scores
                final_scores = defaultdict(float)
                for game in set(list(cf_scores.keys()) + list(cb_scores.keys())):
                    final_scores[game] = 0.7 * cf_scores.get(game, 0) + 0.3 * cb_scores.get(game, 0)

                recommendations = pd.Series(final_scores).sort_values(ascending=False).head(n)
            else:
                recommendations = pd.Series(cf_scores).sort_values(ascending=False).head(n)

            # Convert GameIDs to game names
            if len(recommendations) > 0:
                recommendations.index = [game_id_to_name.get(g_id, f"Game {g_id}") for g_id in recommendations.index]

            return recommendations

        else:
            print("Invalid method or missing required parameters")
            return pd.Series()

    except Exception as e:
        print(f"Error generating recommendations: {str(e)}")
        print("Falling back to collaborative filtering method...")

        # Fallback recommendation method
        if player_id is not None:
            player_games = []
            if player_id in train_player_game_matrix.index:
                player_history = train_player_game_matrix.loc[player_id]
                player_games = player_history[player_history > 0].index.tolist()

            if player_games:
                # Get recommendations based on game similarity
                cf_scores = defaultdict(float)
                for game in player_games:
                    if game in game_similarity_df.columns:
                        similar_games = game_similarity_df[game].sort_values(ascending=False)[1:20]
                        for sim_game, score in similar_games.items():
                            if sim_game not in player_games:
                                cf_scores[sim_game] += score

                recommendations = pd.Series(cf_scores).sort_values(ascending=False).head(n)

                if len(recommendations) > 0:
                    recommendations.index = [game_id_to_name.get(g_id, f"Game {g_id}") for g_id in recommendations.index]

                return recommendations
            else:
                # Return most popular games
                popular_games = data['GameID'].value_counts().head(n)
                popular_games.index = [game_id_to_name.get(g_id, f"Game {g_id}") for g_id in popular_games.index]
                return popular_games
        elif game_id is not None:
            # Return similar games
            if game_id in game_similarity_df.columns:
                similar_games = game_similarity_df[game_id].sort_values(ascending=False)[1:n+1]
                similar_games.index = [game_id_to_name.get(g_id, f"Game {g_id}") for g_id in similar_games.index]
                return similar_games
            else:
                return pd.Series()
        else:
            return pd.Series()

### Model Evaluation

In [22]:
def evaluate_model():
    """Evaluate the model using leave-one-out cross validation"""
    # Only perform this if we have enough data
    if len(train_player_game_matrix) < 10 or train_player_game_matrix.shape[1] < 10:
        print("Not enough data for meaningful evaluation")
        return

    print("Performing leave-one-out validation...")

    try:
        # Get players with at least 2 games
        valid_players = [p for p in train_player_game_matrix.index
                 if (train_player_game_matrix.loc[p] > 0).sum() >= 2]

        if len(valid_players) < 5:
            print("Not enough players with multiple games for evaluation")
            return

        # Sample players for evaluation
        num_eval_players = min(100, len(valid_players))
        eval_players = np.random.choice(valid_players, num_eval_players, replace=False)

        hit_rates = []
        ndcg_scores = []

        for player in eval_players:
            # Get games played by this player
            games_played = train_player_game_matrix.loc[player][train_player_game_matrix.loc[player] > 0].index.tolist()

            if len(games_played) <= 1:
                continue

            # Hide one game
            test_game = np.random.choice(games_played)

            # Create a copy of the matrix with the test game hidden
            temp_matrix = train_player_game_matrix.copy()
            temp_matrix.loc[player, test_game] = 0

            # Get recommendations
            player_games = temp_matrix.loc[player][temp_matrix.loc[player] > 0].index.tolist()
            game_scores = defaultdict(float)
            for game in player_games:
                if game in game_similarity_df.columns:
                    similar_games = game_similarity_df[game].sort_values(ascending=False)
                    for sim_game, score in similar_games.items():
                        if sim_game not in player_games:
                            game_scores[sim_game] += score

            # Get top recommendations
            if game_scores:
                recommendations = pd.Series(game_scores).sort_values(ascending=False).head(10)

                # Check if the hidden game is in recommendations
                if test_game in recommendations.index:
                    hit_rates.append(1)

                    # Get the rank position (0-indexed)
                    rank = recommendations.index.tolist().index(test_game)
                    ndcg = 1 / np.log2(rank + 2)  # rank+2 because of log2(1+rank)
                    ndcg_scores.append(ndcg)
                else:
                    hit_rates.append(0)
                    ndcg_scores.append(0)

        if hit_rates:
          hit_rate_at_10 = sum(hit_rates) / len(hit_rates)
          precision_at_10 = sum(hit_rates) / (len(hit_rates) * 10)
          ndcg_at_10 = np.mean(ndcg_scores)

          print(f"Hit rate@10: {hit_rate_at_10:.2f}")
          print(f"Precision@10: {precision_at_10:.2f}")
          print(f"NDCG@10: {ndcg_at_10:.4f}")
          print(f"Number of players evaluated: {len(hit_rates)}")
        else:
          print("No valid evaluation results obtained")

    except Exception as e:
        print(f"Error during evaluation: {str(e)}")

evaluate_model()

Performing leave-one-out validation...
Hit rate@10: 0.57
Precision@10: 0.06
NDCG@10: 0.4396
Number of players evaluated: 100


### Evaluate on test set

In [24]:
def evaluate_model_on_test():
    """Evaluate the recommendation model using the test set."""
    if len(test_player_game_matrix) == 0 or len(train_player_game_matrix) == 0:
        print("Training or testing matrix is empty. Cannot evaluate.")
        return

    print("Evaluating on unseen test data...")

    hit_rates = []
    precisions = []
    ndcgs = []

    num_eval_players = min(50, len(test_player_game_matrix.index))
    eval_players = np.random.choice(test_player_game_matrix.index, num_eval_players, replace=False)

    for player in eval_players:
        # Actual games the player interacted with in the test set
        actual_games = test_player_game_matrix.loc[player][test_player_game_matrix.loc[player] > 0].index.tolist()

        if not actual_games:
            continue

        # Get top N recommendations from the model (trained on training set)
        recommendations = get_recommendations(player_id=player, n=10, method='hybrid')

        if recommendations is None or len(recommendations) == 0:
            continue

        # Map recommendation names back to GameIDs
        recommended_game_ids = []
        for rec_name in recommendations.index:
            for gid, gname in game_id_to_name.items():
                if gname == rec_name:
                    recommended_game_ids.append(gid)
                    break

        if not recommended_game_ids:
            continue

        # Calculate Hits
        hits = len(set(actual_games) & set(recommended_game_ids))

        hit_rates.append(1 if hits > 0 else 0)
        precisions.append(hits / 10)  # because top-10 recommendations
        if hits > 0:
            # NDCG calculation: reward higher-ranked hits
            ndcg = 0
            for idx, rec_game in enumerate(recommended_game_ids):
                if rec_game in actual_games:
                    ndcg += 1 / np.log2(idx + 2)  # position is idx + 1, and formula uses log2(position + 1)
            ndcgs.append(ndcg)
        else:
            ndcgs.append(0)

    if hit_rates:
        print(f"Hit Rate@10: {np.mean(hit_rates):.4f}")
        print(f"Precision@10: {np.mean(precisions):.4f}")
        print(f"NDCG@10: {np.mean(ndcgs):.4f}")
        print(f"Number of players evaluated: {len(hit_rates)}")
    else:
        print("No valid evaluations were performed.")

evaluate_model_on_test()

Evaluating on unseen test data...
Hit Rate@10: 0.1800
Precision@10: 0.0180
NDCG@10: 0.1315
Number of players evaluated: 50
