In [None]:
# from google.colab import drive

# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
# import seaborn as sns
from collections import defaultdict

In [12]:
# csv_files = [
#     '/content/drive/Shared drives/WE ARE SPEED/dataset/users_final_games1.csv',
#     '/content/drive/Shared drives/WE ARE SPEED/dataset/users_final_games2.csv',
#     '/content/drive/Shared drives/WE ARE SPEED/dataset/users_final_games3_withHeader.csv'
# ]

csv_files = [
    'dataset/users_final_games1.csv',
    'dataset/users_final_games2.csv',
    'dataset/users_final_games3.csv'
]

dfs = []

for file in csv_files:
    df = pd.read_csv(file)
    # df = pd.read_csv(file, header=None)
    dfs.append(df)

# Concatenate all CSVs
data = pd.concat(dfs, ignore_index=True)

data.columns = [
    "ID",
    "PlayerID",
    "GameID",
    "GameName",
    "GameGenre",
    "RunID",
    "RunTime",
    "CategoryType",
    "PlayerCountry",
    "PlayerPronouns",
    "PlayerSignupDate"
]

data.head()

Unnamed: 0,ID,PlayerID,GameID,GameName,GameGenre,RunID,RunTime,CategoryType,PlayerCountry,PlayerPronouns,PlayerSignupDate
0,1,xymr52yx\n,pd0qp0w1,Pawnbarian,,mkogd5lz,215.065,per-level,Brazil,"He/Him, They/Them",2022-03-30T17:13:27Z
1,2,xymr52yx\n,j1n59y1p,Beat Stomper,Arcade,y2wox4wy,0.225,per-game,Brazil,"He/Him, They/Them",2022-03-30T17:13:27Z
2,3,xymr52yx\n,w6j7gw46,Champion Island Games,RPG,zqqog91z,0.726,per-level,Brazil,"He/Him, They/Them",2022-03-30T17:13:27Z
3,4,xymr52yx\n,j1n59y1p,Beat Stomper,Arcade,yj9d1dgz,1.278,per-game,Brazil,"He/Him, They/Them",2022-03-30T17:13:27Z
4,5,xymr52yx\n,j1n59y1p,Beat Stomper,Arcade,znxqoq3y,1.225,per-game,Brazil,"He/Him, They/Them",2022-03-30T17:13:27Z


In [13]:
# Display basic information about the dataset
print("\nDataset Overview:")
print(f"Number of records: {len(data)}")
print(f"Columns: {data.columns.tolist()}")
print("\nSample data:")
# display(data.head())

# Check for missing values
print("\nMissing values per column:")
display(data.isnull().sum())

# Step 2: Data Preprocessing
print("\n### Step 2: Data Preprocessing ###")

# Remove duplicates if any
data = data.drop_duplicates()
print(f"Dataset size after removing duplicates: {len(data)}")

# Create mappings
game_id_to_name = dict(zip(data['GameID'], data['GameName']))
# player_run_counts = data.groupby('PlayerID').size().reset_index(name='RunCount')
print(f"Number of unique players: {data['PlayerID'].nunique()}")
print(f"Number of unique games: {data['GameID'].nunique()}")

# Add recency and frequency features to the model
def add_player_and_game_features(data):
    """Add player experience, run frequency, and game popularity features."""

    # Drop any conflicting columns first
    data = data.drop(columns=[
        'TotalRuns', 'GamePopularity',
        'TotalRuns_x', 'GamePopularity_x',
        'TotalRuns_y', 'GamePopularity_y'
    ], errors='ignore')

    # If we have timestamp information
    if 'PlayerSignUpDate' in data.columns:
        # Convert to datetime
        data['PlayerSignUpDate'] = pd.to_datetime(data['PlayerSignUpDate'], errors='coerce')

        # Calculate experience (days since sign up)
        data['PlayerExperience'] = (pd.Timestamp.now() - data['PlayerSignUpDate']).dt.days

        # Group players by experience level
        data['ExperienceGroup'] = pd.qcut(data['PlayerExperience'], 4, labels=['Novice', 'Intermediate', 'Experienced', 'Veteran'])

    # Add total run count per player
    player_run_counts = data.groupby('PlayerID')['RunID'].nunique().reset_index()
    player_run_counts.columns = ['PlayerID', 'TotalRuns']
    data = data.merge(player_run_counts, on='PlayerID', how='left')

    # Add game popularity as a feature
    game_popularity = data.groupby('GameID')['RunID'].nunique().reset_index()
    game_popularity.columns = ['GameID', 'GamePopularity']
    data = data.merge(game_popularity, on='GameID', how='left')

    return data


# Add temporal features
data = add_player_and_game_features(data)
print("Added temporal features to the dataset.")

print(data.columns.tolist())

# Create player-game interaction matrix (1 if player has run the game, 0 otherwise)
player_game_matrix = pd.crosstab(data['PlayerID'], data['GameID'])
print(f"\nPlayer-Game Matrix Shape: {player_game_matrix.shape}")

# Normalize so each player's row sums to 1
player_game_matrix = player_game_matrix.div(player_game_matrix.sum(axis=1), axis=0).fillna(0)



Dataset Overview:
Number of records: 412938
Columns: ['ID', 'PlayerID', 'GameID', 'GameName', 'GameGenre', 'RunID', 'RunTime', 'CategoryType', 'PlayerCountry', 'PlayerPronouns', 'PlayerSignupDate']

Sample data:

Missing values per column:


ID                       0
PlayerID                 0
GameID                   0
GameName                 0
GameGenre           165770
RunID                    0
RunTime                  0
CategoryType             0
PlayerCountry        20868
PlayerPronouns      179657
PlayerSignupDate        65
dtype: int64


### Step 2: Data Preprocessing ###
Dataset size after removing duplicates: 412938
Number of unique players: 29791
Number of unique games: 16373
Added temporal features to the dataset.
['ID', 'PlayerID', 'GameID', 'GameName', 'GameGenre', 'RunID', 'RunTime', 'CategoryType', 'PlayerCountry', 'PlayerPronouns', 'PlayerSignupDate', 'TotalRuns', 'GamePopularity']

Player-Game Matrix Shape: (29791, 16373)


In [14]:
# Step 3: Build the Recommendation Model
print("\n### Step 3: Building Recommendation Models ###")

# Method 1: Collaborative Filtering - Game Similarity
game_similarity = cosine_similarity(player_game_matrix.T)
game_similarity_df = pd.DataFrame(
    game_similarity,
    index=player_game_matrix.columns,
    columns=player_game_matrix.columns
)

def get_similar_games(game_id, n=5):
    """Get the most similar games to a given game based on player overlap"""
    if game_id not in game_similarity_df.index:
        print(f"Game ID {game_id} not found in the dataset")
        return pd.Series()

    similar_games = game_similarity_df[game_id].sort_values(ascending=False)[1:n+1]
    return similar_games

# Method 2: Player-based recommendations
def recommend_games_for_player(player_id, n=5):
    """Recommend games for a player based on similar players' game choices"""
    if player_id not in player_game_matrix.index:
        print(f"Player ID {player_id} not found in the dataset")
        return pd.Series()

    # Get the games this player has already played
    player_games = set(player_game_matrix.loc[player_id][player_game_matrix.loc[player_id] > 0].index)

    # Calculate score for each game based on similarity of players who played it
    game_scores = defaultdict(float)

    for other_player in player_game_matrix.index:
        if other_player == player_id:
            continue

        # Get games played by the other player
        other_player_games = set(player_game_matrix.loc[other_player][player_game_matrix.loc[other_player] > 0].index)

        # Calculate Jaccard similarity (intersection over union)
        common_games = player_games.intersection(other_player_games)
        if not player_games or not other_player_games:
            similarity = 0
        else:
            similarity = len(common_games) / len(player_games.union(other_player_games))

        # Add score for each game the other player has played that our player hasn't
        for game in other_player_games - player_games:
            game_scores[game] += similarity

    # Sort games by score and return top n
    recommendations = pd.Series(game_scores).sort_values(ascending=False).head(n)

    # Convert GameIDs to game names for better readability
    if len(recommendations) > 0:
        recommendations.index = [game_id_to_name.get(game_id, f"Game {game_id}") for game_id in recommendations.index]

    return recommendations

# Modified content-based model function that handles NaN values
def build_content_based_model():
    """Build a content-based recommendation model using game genres"""
    if 'GameGenre' not in data.columns or data['GameGenre'].isna().all():
        print("Genre data not available or empty. Skipping content-based model.")
        return None

    # Create a unique game-genre dataset (only include games with genre data)
    game_genres = data[['GameID', 'GameName', 'GameGenre']].dropna(subset=['GameGenre']).drop_duplicates()

    if len(game_genres) == 0:
        print("No valid genre data found. Skipping content-based model.")
        return None

    print(f"Building content-based model with {len(game_genres)} games that have genre information")

    # One-hot encode genres (assuming genres are comma-separated)
    game_genres['GenreList'] = game_genres['GameGenre'].str.split(',')

    # Ensure GenreList is properly formatted and handle any potential NaN values
    valid_genres = []
    for genres in game_genres['GenreList']:
        if isinstance(genres, list):
            valid_genres.extend([g.strip() for g in genres if isinstance(g, str)])

    unique_genres = list(set(valid_genres))

    if not unique_genres:
        print("No valid genres found. Skipping content-based model.")
        return None

    # Create genre feature matrix
    genre_matrix = pd.DataFrame(0, index=game_genres['GameID'], columns=unique_genres)

    for _, row in game_genres.iterrows():
        if isinstance(row['GenreList'], list):
            for genre in row['GenreList']:
                if isinstance(genre, str) and genre.strip() in unique_genres:
                    genre_matrix.loc[row['GameID'], genre.strip()] = 1

    # Remove any remaining NaN values
    genre_matrix.fillna(0, inplace=True)

    # Calculate genre-based similarity
    genre_similarity = cosine_similarity(genre_matrix)
    genre_sim_df = pd.DataFrame(
        genre_similarity,
        index=genre_matrix.index,
        columns=genre_matrix.index
    )

    return genre_sim_df

genre_sim_df = build_content_based_model()

# Step 4: Hybrid Recommendation Function (with error handling)
print("\n### Step 4: Hybrid Recommendation Function (with error handling) ###")

def get_recommendations(player_id=None, game_id=None, n=5, method='hybrid'):
    """Get game recommendations using the specified method"""
    try:
        if method == 'collaborative' and game_id is not None:
            # Similar games recommendation
            similar_games = get_similar_games(game_id, n)
            if len(similar_games) > 0:
                similar_games.index = [game_id_to_name.get(g_id, f"Game {g_id}") for g_id in similar_games.index]
            return similar_games

        elif method == 'user_based' and player_id is not None:
            # User-based recommendation
            return recommend_games_for_player(player_id, n)

        elif method == 'content_based' and game_id is not None and genre_sim_df is not None:
            # Content-based recommendation
            if game_id not in genre_sim_df.index:
                print(f"Game ID {game_id} not found in the genre dataset")
                return pd.Series()

            similar_games = genre_sim_df[game_id].sort_values(ascending=False)[1:n+1]
            similar_games.index = [game_id_to_name.get(g_id, f"Game {g_id}") for g_id in similar_games.index]
            return similar_games

        elif method == 'hybrid' and player_id is not None:
            # Hybrid recommendation
            # Get player's game history
            if player_id not in player_game_matrix.index:
                print(f"Player ID {player_id} not found in the dataset")
                return pd.Series()

            player_history = player_game_matrix.loc[player_id]
            player_games = player_history[player_history > 0].index.tolist()

            # If player has no games, return most popular games
            if not player_games:
                print("Player has no game history. Recommending popular games.")
                popular_games = data['GameID'].value_counts().head(n)
                popular_games.index = [game_id_to_name.get(g_id, f"Game {g_id}") for g_id in popular_games.index]
                return popular_games

            # Get collaborative filtering scores
            cf_scores = defaultdict(float)
            for game in player_games:
                similar_games = get_similar_games(game, 20)
                for sim_game, score in similar_games.items():
                    if sim_game not in player_games:  # Don't recommend games already played
                        cf_scores[sim_game] += score

            # Add content-based scores if available
            if genre_sim_df is not None:
                cb_scores = defaultdict(float)
                for game in player_games:
                    if game in genre_sim_df.index:
                        similar_games = genre_sim_df[game].sort_values(ascending=False)[1:20]
                        for sim_game, score in similar_games.items():
                            if sim_game not in player_games:  # Don't recommend games already played
                                cb_scores[sim_game] += score

                # Normalize and combine scores (0.7 weight to CF, 0.3 to content)
                if cf_scores:
                    cf_max = max(cf_scores.values()) if cf_scores else 1
                    for game in cf_scores:
                        cf_scores[game] /= cf_max

                if cb_scores:
                    cb_max = max(cb_scores.values()) if cb_scores else 1
                    for game in cb_scores:
                        cb_scores[game] /= cb_max

                # Combine scores
                final_scores = defaultdict(float)
                for game in set(list(cf_scores.keys()) + list(cb_scores.keys())):
                    final_scores[game] = 0.7 * cf_scores.get(game, 0) + 0.3 * cb_scores.get(game, 0)

                recommendations = pd.Series(final_scores).sort_values(ascending=False).head(n)
            else:
                recommendations = pd.Series(cf_scores).sort_values(ascending=False).head(n)

            # Convert GameIDs to game names
            if len(recommendations) > 0:
                recommendations.index = [game_id_to_name.get(g_id, f"Game {g_id}") for g_id in recommendations.index]

            return recommendations

        else:
            print("Invalid method or missing required parameters")
            return pd.Series()

    except Exception as e:
        print(f"Error generating recommendations: {str(e)}")
        print("Falling back to collaborative filtering method...")

        # Fallback recommendation method
        if player_id is not None:
            player_games = []
            if player_id in player_game_matrix.index:
                player_history = player_game_matrix.loc[player_id]
                player_games = player_history[player_history > 0].index.tolist()

            if player_games:
                # Get recommendations based on game similarity
                cf_scores = defaultdict(float)
                for game in player_games:
                    if game in game_similarity_df.columns:
                        similar_games = game_similarity_df[game].sort_values(ascending=False)[1:20]
                        for sim_game, score in similar_games.items():
                            if sim_game not in player_games:
                                cf_scores[sim_game] += score

                recommendations = pd.Series(cf_scores).sort_values(ascending=False).head(n)

                if len(recommendations) > 0:
                    recommendations.index = [game_id_to_name.get(g_id, f"Game {g_id}") for g_id in recommendations.index]

                return recommendations
            else:
                # Return most popular games
                popular_games = data['GameID'].value_counts().head(n)
                popular_games.index = [game_id_to_name.get(g_id, f"Game {g_id}") for g_id in popular_games.index]
                return popular_games
        elif game_id is not None:
            # Return similar games
            if game_id in game_similarity_df.columns:
                similar_games = game_similarity_df[game_id].sort_values(ascending=False)[1:n+1]
                similar_games.index = [game_id_to_name.get(g_id, f"Game {g_id}") for g_id in similar_games.index]
                return similar_games
            else:
                return pd.Series()
        else:
            return pd.Series()

# Step 6: Interactive Testing
print("\n### Step 6: Test the Recommendation System ###")
print("Example recommendations:")

# Find valid players with at least 3 games
valid_players = [player for player in player_game_matrix.index if (player_game_matrix.loc[player] > 0).sum() >= 3]

if len(valid_players) >= 1:
    num_players_to_sample = min(5, len(valid_players))  # sample up to 5 but not more than available
    sample_players = np.random.choice(valid_players, size=num_players_to_sample, replace=False)

    for player_id in sample_players:
        print(f"\nRecommendations for Player {player_id}:")
        display(get_recommendations(player_id=player_id, n=5))
else:
    print("Not enough valid players with sufficient games for random sampling.")

# Randomly pick a sample game
sample_game = np.random.choice(player_game_matrix.columns)
sample_game_name = game_id_to_name.get(sample_game, f"Game {sample_game}")
print(f"\nSimilar games to {sample_game_name}:")
display(get_recommendations(game_id=sample_game, method='collaborative', n=5))

# Function to make it easy to get recommendations for any player or game
def get_player_recommendations(player_id):
    """Show recommendations for a specific player"""
    print(f"Recommendations for Player {player_id}:")
    recs = get_recommendations(player_id=player_id, n=5)
    display(recs)
    return recs

def get_game_recommendations(game_id):
    """Show similar games to a specific game"""
    game_name = game_id_to_name.get(game_id, f"Game {game_id}")
    print(f"Similar games to {game_name}:")
    recs = get_recommendations(game_id=game_id, method='collaborative', n=5)
    display(recs)
    return recs

# Get list of player IDs and game IDs for easy reference
print("\nTo get recommendations, use these functions:")
print("get_player_recommendations(player_id) - Get recommendations for a specific player")
print("get_game_recommendations(game_id) - Get games similar to a specific game")
print("\nAvailable player IDs (sample):", player_game_matrix.index[:5].tolist())

games_sample = {game_id: game_id_to_name.get(game_id, f"Game {game_id}")
               for game_id in player_game_matrix.columns[:5]}
print("Available games (sample):", games_sample)

# Step 7: Model Evaluation
print("\n### Step 7: Model Evaluation ###")

# Updated Model Evaluation
def evaluate_model():
    """Evaluate the model using leave-one-out cross validation"""
    # Only perform this if we have enough data
    if len(player_game_matrix) < 10 or player_game_matrix.shape[1] < 10:
        print("Not enough data for meaningful evaluation")
        return

    print("Performing leave-one-out validation...")

    try:
        # Get players with at least 2 games
        valid_players = [p for p in player_game_matrix.index
                 if (player_game_matrix.loc[p] > 0).sum() >= 2]

        if len(valid_players) < 5:
            print("Not enough players with multiple games for evaluation")
            return

        # Sample players for evaluation
        num_eval_players = min(50, len(valid_players))
        eval_players = np.random.choice(valid_players, num_eval_players, replace=False)

        hit_rates = []
        ndcg_scores = []

        for player in eval_players:
            # Get games played by this player
            games_played = player_game_matrix.loc[player][player_game_matrix.loc[player] > 0].index.tolist()

            if len(games_played) <= 1:
                continue

            # Hide one game
            test_game = np.random.choice(games_played)

            # Create a copy of the matrix with the test game hidden
            temp_matrix = player_game_matrix.copy()
            temp_matrix.loc[player, test_game] = 0

            # Get recommendations
            player_games = temp_matrix.loc[player][temp_matrix.loc[player] > 0].index.tolist()
            game_scores = defaultdict(float)
            for game in player_games:
                if game in game_similarity_df.columns:
                    similar_games = game_similarity_df[game].sort_values(ascending=False)
                    for sim_game, score in similar_games.items():
                        if sim_game not in player_games:
                            game_scores[sim_game] += score

            # Get top recommendations
            if game_scores:
                recommendations = pd.Series(game_scores).sort_values(ascending=False).head(10)

                # Check if the hidden game is in recommendations
                if test_game in recommendations.index:
                    hit_rates.append(1)

                    # Get the rank position (0-indexed)
                    rank = recommendations.index.tolist().index(test_game)
                    ndcg = 1 / np.log2(rank + 2)  # rank+2 because of log2(1+rank)
                    ndcg_scores.append(ndcg)
                else:
                    hit_rates.append(0)
                    ndcg_scores.append(0)

        if hit_rates:
          hit_rate_at_10 = sum(hit_rates) / len(hit_rates)
          precision_at_10 = sum(hit_rates) / (len(hit_rates) * 10)
          # recall_at_10 = hit_rate_at_10
          ndcg_at_10 = np.mean(ndcg_scores)

          print(f"Hit rate@10: {hit_rate_at_10:.2f}")
          print(f"Precision@10: {precision_at_10:.2f}")
          # print(f"Recall@10: {recall_at_10:.4f}")
          print(f"NDCG@10: {ndcg_at_10:.4f}")
          print(f"Number of players evaluated: {len(hit_rates)}")
        else:
          print("No valid evaluation results obtained")

    except Exception as e:
        print(f"Error during evaluation: {str(e)}")

evaluate_model()


### Step 3: Building Recommendation Models ###
Building content-based model with 6382 games that have genre information

### Step 4: Hybrid Recommendation Function (with error handling) ###

### Step 6: Test the Recommendation System ###
Example recommendations:

Recommendations for Player v813w9qx
:


The Simpsons: Hit & Run Randomizer    0.700000
Alien Storm (Genesis/Mega Drive)      0.656351
Road Rash II                          0.656351
Winter Challenge                      0.656351
James Bond 007                        0.648533
dtype: float64


Recommendations for Player xkpwm17j
:


Multiple Mario Games                                        0.700000
Super Mario 3D World + Bowser's Fury Category Extensions    0.666943
Mario One Day Escapade                                      0.636331
Card Shark                                                  0.617144
Mahjong Soul                                                0.581608
dtype: float64


Recommendations for Player 8geqed1j
:


ROBLOX: FAXSTORY                0.700000
Super Cube Cavern               0.700000
Filtered Seed Glitchless        0.615543
Snow Drift                      0.569491
Among Us Category Extensions    0.538334
dtype: float64


Recommendations for Player 8667dw08
:


Hollow Knight Category Extensions     1.000000
Pizza Tower (Demos)                   0.575529
CELESTE Classic                       0.401856
Castlevania: Harmony of Dissonance    0.300000
Islets                                0.300000
dtype: float64


Recommendations for Player y8dm0y58
:


Powerpool Frenzy               0.700000
Enemy Front                    0.700000
Powerpool 2                    0.700000
Aim Time Trial by uLLeticaL    0.668545
Aim Course 2 by uLLeticaL      0.368448
dtype: float64


Similar games to Disneyland Adventures:


Disney Illusion Island                        1.0
New Super Lucky's Tale Category Extensions    1.0
Celeste 64 Custom Maps                        1.0
Disneyland Adventures                         1.0
Runner3                                       1.0
Name: m1zjop06, dtype: float64


To get recommendations, use these functions:
get_player_recommendations(player_id) - Get recommendations for a specific player
get_game_recommendations(game_id) - Get games similar to a specific game

Available player IDs (sample): ['0jm002y8\n', '0jm00348\n', '0jm01yo8\n', '0jm02o81\n', '0jm04481\n']
Available games (sample): {'26803k1p': 'SpongeBob SquarePants: Lights, Camera, Pants!', '2680571p': 'Fortified Zone', '2680751p': 'Connect Four, Perfection, Trouble!', '2680ek1p': 'VeggieTales: LarryBoy and the Bad Apple (PS2)', '2680g51p': 'Amazing Tater'}

### Step 7: Model Evaluation ###
Performing leave-one-out validation...
Hit rate@10: 0.50
Precision@10: 0.05
NDCG@10: 0.3786
Number of players evaluated: 50
