Installs

In [None]:
# Install the notebook's dependencies into the kernel's environment.
# NOTE(review): versions are unpinned, so a fresh run may pick up newer, incompatible releases — consider pinning.
%pip install opendatasets
%pip install pandas
%pip install scikit-learn
# NOTE(review): matplotlib is installed and enabled inline below, but no plotting code is visible in this notebook — confirm it is still needed.
%pip install matplotlib
%matplotlib inline

Imports

In [8]:
import os
from typing import List, Optional

import numpy as np
import opendatasets as od
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.neighbors import KDTree

Model 1

In [9]:
class Model_1:
    """Popularity-based recommender.

    Ranks games by the ``ranking`` column (ties broken by average playtime and
    release date), takes the top ``TOP_POOL_SIZE`` games, draws a random sample
    of ``SAMPLE_SIZE`` from that pool, and returns up to ``N_RECOMMENDATIONS``
    sampled games the user does not already own.

    Parameters
    ----------
    game_stats:
        One row per game. Must contain ``Positive`` and ``ranking`` plus either
        the raw column names (``AppID``, ``Name``, ``Average playtime forever``,
        ``Release date``) or their renamed forms (``game_id``, ``game_name``,
        ``hours_played``, ``release_date``).
    user_data:
        User x game matrix indexed by user id whose columns carry a level
        named ``Game``; a non-zero cell means the user owns/played the game.
    """

    TOP_POOL_SIZE = 100      # how many top-ranked games form the candidate pool
    SAMPLE_SIZE = 50         # how many pool entries are randomly sampled per call
    N_RECOMMENDATIONS = 10   # maximum number of games returned

    _COLUMN_ALIASES = {
        'AppID': 'game_id',
        'Name': 'game_name',
        'Average playtime forever': 'hours_played',
        'Release date': 'release_date',
    }

    def __init__(self, game_stats: pd.DataFrame, user_data: pd.DataFrame) -> None:
        self.game_stats = game_stats
        self.user_data = user_data

    def _ranked_games(self) -> pd.DataFrame:
        """Return ``game_id``/``game_name`` of games with positive reviews, best first."""
        # rename() returns a new frame, so the DataFrame handed in by the caller
        # is never modified; keys that are already renamed are simply ignored.
        stats = self.game_stats.rename(columns=self._COLUMN_ALIASES)
        stats = stats[stats['Positive'] != 0]
        # NOTE(review): release_date is sorted as stored; if it is a string column
        # the tie-break is lexicographic rather than chronological — confirm.
        stats = stats.sort_values(
            by=['ranking', 'hours_played', 'release_date'],
            ascending=[False, False, False],
        )
        return stats[['game_id', 'game_name']]

    def _owned_games(self, user_id: int) -> set:
        """Return the set of game names with a non-zero entry for ``user_id``."""
        # Unknown users own nothing, so the popularity model can serve cold starts.
        if user_id not in self.user_data.index:
            return set()
        user_row = self.user_data.loc[user_id]
        return set(user_row[user_row != 0.0].index.get_level_values('Game'))

    def predict(self, user_id: int) -> List[str]:
        """Recommend popular games that ``user_id`` does not own.

        Returns
        -------
        List[str]
            Up to ``N_RECOMMENDATIONS`` distinct game names in random order.
            The list is shorter when the sampled pool does not contain enough
            unowned games, and empty when there are no candidates at all.
        """
        pool = self._ranked_games().head(self.TOP_POOL_SIZE)
        # Never ask for more rows than exist: sample() raises ValueError otherwise.
        candidates = pool.sample(n=min(self.SAMPLE_SIZE, len(pool)))
        owned = self._owned_games(user_id)

        recommendations: List[str] = []
        # Iterating the sampled names bounds the loop by the sample size, so it
        # can no longer index past the end when candidates run out.
        for game_name in candidates['game_name']:
            if game_name in owned or game_name in recommendations:
                continue
            recommendations.append(game_name)
            if len(recommendations) >= self.N_RECOMMENDATIONS:
                break

        return recommendations

Model 2

In [4]:
class Model_2:
    """Placeholder for the mid-tier recommender; not implemented yet."""

    def __init__(self) -> None:
        pass

    def predict(self, user_id: Optional[int] = None) -> List[str]:
        """Recommend games for ``user_id`` (not implemented).

        ``user_id`` is accepted, and optional, so this placeholder has the same
        call shape as the other models' ``predict(user_id)`` while existing
        zero-argument calls keep working.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError('Model 2 predict not implemented')

Model 3

In [11]:
class Model_3:
    """Neighbour-based recommender.

    Projects the user x game matrix with PCA, indexes the projected users in a
    KD-tree, and recommends the games with the largest summed values among a
    user's nearest neighbours that the user does not already own.

    Parameters
    ----------
    user_data:
        User x game matrix indexed by user id whose columns carry a level
        named ``Game``; a non-zero cell means the user owns/played the game.
    """

    N_COMPONENTS = 200       # upper bound on PCA components
    N_NEIGHBORS = 100        # upper bound on neighbours queried per prediction
    N_RECOMMENDATIONS = 10   # maximum number of games returned
    RANDOM_STATE = 0         # seeds PCA so repeated fits give the same projection

    def __init__(self, user_data: pd.DataFrame) -> None:
        self.user_data = user_data
        # PCA cannot extract more components than there are users or games,
        # so clamp the requested count to the matrix dimensions.
        n_components = min(self.N_COMPONENTS, *user_data.shape)
        self.pca = PCA(n_components=n_components, random_state=self.RANDOM_STATE)
        self.pca_data = self.pca.fit_transform(user_data.values)
        self.tree = KDTree(self.pca_data)

    def predict(self, user_id: int) -> List[str]:
        """Recommend games that are popular among the users closest to ``user_id``.

        Returns
        -------
        List[str]
            Up to ``N_RECOMMENDATIONS`` game names, highest neighbour total
            first. Shorter when the neighbours have too few unowned games with
            a positive total.

        Raises
        ------
        KeyError
            If ``user_id`` is not in the index of ``user_data``.
        """
        # Find neighbors (the query point is one of the indexed users, so the
        # user is normally among the results; owned games are filtered below).
        user_index = self.user_data.index.get_loc(user_id)
        # A KD-tree cannot return more neighbours than it holds points.
        k = min(self.N_NEIGHBORS, len(self.user_data))
        neighbors = self.tree.query(
            self.pca_data[user_index:user_index + 1], k=k, return_distance=False
        )[0]
        neighbor_totals = self.user_data.iloc[neighbors].sum().sort_values(ascending=False)

        # Get the set of the user's currently owned games
        user_row = self.user_data.loc[user_id]
        owned = set(user_row[user_row != 0.0].index.get_level_values('Game'))

        # Build recommendation list
        recommendations: List[str] = []
        game_names = neighbor_totals.index.get_level_values('Game')
        for game_name, total in zip(game_names, neighbor_totals.to_numpy()):
            # Totals are sorted descending: once one is not positive, no
            # remaining game is relevant to these neighbours.
            if total <= 0.0:
                break
            if game_name in owned:
                continue
            recommendations.append(game_name)
            if len(recommendations) >= self.N_RECOMMENDATIONS:
                break

        return recommendations

Main Model

In [14]:
class Model:
    """Combined recommender that picks a sub-model by how much data a user has.

    Construction downloads the Kaggle datasets if their directories are
    missing, loads the game statistics and the user x game matrix, and builds
    the three sub-models.

    Attributes
    ----------
    NUMBER_OF_RECOMMENDATIONS:
        Maximum number of games returned by ``predict``.
    USE_MODEL_1:
        Users with fewer active games than this are served by Model 1 only;
        at or above it Model 2 is tried.
    USE_MODEL_2:
        Users with at least this many active games are served by Model 3 first.
    game_stats:
        Per-game statistics with an added ``ranking`` column.
    pivoted_users:
        User x game matrix (rows: ``UserID``; columns: ``('Quantity', Game)``).
    """

    # (local directory created by the download, Kaggle dataset URL)
    _DATASETS = (
        ('steam-games-dataset',
         "https://www.kaggle.com/datasets/fronkongames/steam-games-dataset/data"),
        ('steam-video-games',
         "https://www.kaggle.com/datasets/tamber/steam-video-games/data"),
        # NOTE(review): this dataset is downloaded but nothing in this class reads it — confirm it is still needed.
        ('popularity-of-games-on-steam',
         "https://www.kaggle.com/datasets/michau96/popularity-of-games-on-steam"),
    )

    def __init__(self) -> None:
        self.NUMBER_OF_RECOMMENDATIONS = 10
        self.USE_MODEL_1 = 20
        self.USE_MODEL_2 = 50

        # Download data as required
        self._download_datasets()

        # Transform Game Data
        self.game_stats = self._load_game_stats()

        # Transform User Data
        user_data = pd.read_csv(
            'steam-video-games/steam-200k.csv',
            encoding='ISO-8859-1',
            usecols=[0, 1, 2, 3],
            names=['UserID', 'Game', 'Behavior', 'Quantity'],
        )
        self.pivoted_users = self._build_user_matrix(user_data)

        # Instantiate Models
        self.model_1 = Model_1(self.game_stats, self.pivoted_users)
        self.model_2 = Model_2()
        self.model_3 = Model_3(self.pivoted_users)

    @classmethod
    def _download_datasets(cls) -> None:
        """Download each dataset whose local directory does not exist yet."""
        for directory, url in cls._DATASETS:
            if not os.path.exists(directory):
                od.download(url)

    @staticmethod
    def _load_game_stats() -> pd.DataFrame:
        """Read the game statistics and add a ``ranking`` column.

        ``ranking`` is ``sqrt(Positive / total Positive) * 100``.
        """
        # Columns are selected by position; Model_1 reads 'AppID', 'Name',
        # 'Release date', 'Positive' and 'Average playtime forever' from the result.
        game_stats = pd.read_csv(
            'steam-games-dataset/games.csv',
            encoding='ISO-8859-1',
            usecols=[0, 1, 2, 22, 23, 28],
        )
        total_positive = game_stats['Positive'].sum()
        game_stats['ranking'] = np.sqrt(game_stats['Positive'] / total_positive) * 100.0
        return game_stats

    @staticmethod
    def _build_user_matrix(user_data: pd.DataFrame) -> pd.DataFrame:
        """Pivot raw ``UserID/Game/Behavior/Quantity`` records into a user x game matrix.

        'play' quantities are divided by the game's mean 'play' quantity,
        'purchase' quantities are kept as-is, and both are summed per
        (user, game). Games nobody played are dropped; missing cells are 0.
        """
        # Average only the numeric column: averaging the whole frame would also
        # try to average the string 'Behavior' column.
        is_play = user_data['Behavior'] == 'play'
        mean_hours = user_data.loc[is_play].groupby('Game')['Quantity'].mean()

        activity = user_data[user_data['Game'].isin(mean_hours.index)].copy()
        # Vectorised form of the per-row normalisation: keep purchase rows,
        # divide every other row by its game's mean play quantity.
        is_purchase = activity['Behavior'] == 'purchase'
        normalized = activity['Quantity'] / activity['Game'].map(mean_hours)
        activity['Quantity'] = activity['Quantity'].where(is_purchase, normalized)

        totals = activity.groupby(['UserID', 'Game']).aggregate({'Quantity': 'sum'})
        return totals.unstack(fill_value=0)

    def _extend_unique(self, recommendations: List[str], candidates: List[str]) -> None:
        """Append new candidates until ``NUMBER_OF_RECOMMENDATIONS`` is reached."""
        for game_name in candidates:
            if len(recommendations) >= self.NUMBER_OF_RECOMMENDATIONS:
                return
            if game_name not in recommendations:
                recommendations.append(game_name)

    def predict(self, user_id: int) -> List[str]:
        """Recommend up to ``NUMBER_OF_RECOMMENDATIONS`` games for ``user_id``.

        Model 3 is tried for users with at least ``USE_MODEL_2`` active games,
        then Model 2 for users with at least ``USE_MODEL_1``; any slots still
        free are filled from Model 1. A list is always returned; it may be
        shorter than ``NUMBER_OF_RECOMMENDATIONS`` when the models run out of
        candidates.
        """
        # Users missing from the matrix are treated as having no activity.
        if user_id in self.pivoted_users.index:
            n_active_games = int((self.pivoted_users.loc[user_id] != 0.0).sum())
        else:
            n_active_games = 0

        recommendations: List[str] = []
        if n_active_games >= self.USE_MODEL_2:
            self._extend_unique(recommendations, self.model_3.predict(user_id))

        if (len(recommendations) < self.NUMBER_OF_RECOMMENDATIONS
                and n_active_games >= self.USE_MODEL_1):
            try:
                self._extend_unique(recommendations, self.model_2.predict(user_id))
            except NotImplementedError:
                # Model 2 is still a placeholder; fall through to Model 1
                # instead of failing the whole prediction.
                pass

        if len(recommendations) < self.NUMBER_OF_RECOMMENDATIONS:
            self._extend_unique(recommendations, self.model_1.predict(user_id))

        return recommendations

Main

In [15]:
# Build the combined recommender: downloads any missing datasets, loads them, and constructs the three sub-models.
model = Model()

In [16]:
# Predict with model 3
# NOTE(review): this only exercises Model 3 if user 11403772 has at least USE_MODEL_2 (50) games with non-zero activity — confirm.
model.predict(11403772)

['Grand Theft Auto IV',
 'War Thunder',
 'Counter-Strike Condition Zero',
 'The Witcher 2 Assassins of Kings Enhanced Edition',
 'Just Cause 2',
 'Counter-Strike Condition Zero Deleted Scenes',
 'Castle Crashers',
 'Free to Play',
 'Dishonored',
 'Neverwinter']