Installs

In [None]:
%pip install opendatasets
%pip install pandas
%pip install scikit-learn
%pip install matplotlib
%matplotlib inline

Imports

In [45]:
import pandas as pd
import os
import gc
import numpy as np
import opendatasets as od 
import random
from typing import List
from sklearn.neighbors import KDTree
from sklearn.decomposition import PCA

Model 1

In [46]:
class Model_1:
    def __init__(self, game_stats: pd.DataFrame, user_data: pd.DataFrame) -> None:
        self.game_stats = game_stats
        self.user_data = user_data
        
    def predict(self, user_id: int) -> List[str]:    
        self.game_stats.rename(columns={'AppID': 'game_id','Name':'game_name','Average playtime forever':'hours_played', 'Release date': "release_date"}, inplace=True)
        self.game_stats = self.game_stats.drop(self.game_stats[self.game_stats['Positive'] == 0].index)
        self.game_stats = self.game_stats.sort_values(by=['ranking','hours_played', 'release_date'], ascending=[False, False, False])
        game_stats2 = self.game_stats[['game_id', 'game_name']]
        hund_recs = game_stats2.head(100)
        rand_ten = hund_recs.sample(n=50)
        
        user_games = self.user_data.loc[user_id].reset_index().set_index('Game').drop('level_0', axis='columns').rename(columns={user_id: 'Quantity'})
        user_games = list(user_games[user_games['Quantity'] != 0.0].index)
        
        recommendations = []
        i = 0
        while len(recommendations) < 20:
            game_name = rand_ten.iloc[i].game_name
            if game_name not in user_games:
                recommendations.append(game_name)            
            i += 1
        
        return recommendations

In [60]:
class Model_2:
    def __init__(self, game_stats: pd.DataFrame, user_data: pd.DataFrame) -> None:
        self.game_stats = game_stats
        self.user_data = user_data

    def predict(self, user_id: int) -> List[str]: 
        self.game_stats.rename(columns={'AppID': 'game_id','Name':'game_name','Average playtime forever':'hours_played', 'Release date': "release_date", 'Genres': "genre"}, inplace=True)
        self.game_stats = self.game_stats.drop(self.game_stats[self.game_stats['Positive'] == 0].index)
        self.game_stats = self.game_stats.sort_values(by=['genre_counts','hours_played', 'release_date'], ascending=[False, False, False])

Model 3

In [47]:
class Model_3:
    def __init__(self, user_data: pd.DataFrame) -> None:
        self.user_data = user_data
        self.pca = PCA(n_components=200)
        self.pca_data = self.pca.fit_transform(user_data.values)
        self.tree = KDTree(self.pca_data)

    def predict(self, user_id: int) -> List[str]:
        # Find neighbors
        N_NEIGHBORS = 100
        user_index = self.user_data.index.get_loc(user_id)
        neighbors = self.tree.query(self.pca_data[user_index:user_index+1], k=N_NEIGHBORS, return_distance=False)[0]
        combined_neighbors = self.user_data.iloc[neighbors].sum().T.sort_values(ascending=False).reset_index().set_index('Game').drop('level_0', axis='columns').rename(columns={0: 'Quantity'})

        # Get list of user's currently owned games
        user_games = self.user_data.loc[user_id].reset_index().set_index('Game').drop('level_0', axis='columns').rename(columns={user_id: 'Quantity'})
        user_games = list(user_games[user_games['Quantity'] != 0.0].index)
        
        # Build recommendation list
        recommendations = []
        i = 0
        while len(recommendations) < 20:
            game_name = combined_neighbors.iloc[i].name
            if game_name not in user_games:
                recommendations.append(game_name)
            i += 1
            
            # Break if out of relevant games
            if combined_neighbors.iloc[i].Quantity == 0.0:
                break

        return recommendations

In [56]:
class Testing:
    def __init__(self, selection: str, k: int) -> None:  
        self.NUMBER_OF_RECOMMENDATIONS = 20
        self.USE_MODEL_1 = 15
        self.USE_MODEL_2 = 20
                
        # Transform Game Data
        game_stats = pd.read_csv('steam-games-dataset/games.csv', encoding='ISO-8859-1', usecols = [0, 1, 2, 22, 23, 28]) 
        tot_pos_reviews = (game_stats['Positive'].sum())
        game_stats['ranking'] = np.sqrt(game_stats['Positive']/tot_pos_reviews) * 100.0
        self.game_stats = game_stats
        
        # Transform User Data
        user_data = pd.read_csv('steam-video-games/steam-200k.csv', encoding='ISO-8859-1', usecols=[0, 1, 2, 3], names=['UserID', 'Game', 'Behavior', 'Quantity'])
        hours_played = user_data[user_data['Behavior'] == 'play'].groupby('Game').agg(np.mean).sort_values(by='Quantity', ascending=False).drop('UserID', axis='columns')
        normalized_user_data = user_data.copy()[user_data['Game'].isin(hours_played.index)]
        normalized_user_data['Quantity'] = normalized_user_data.apply(lambda x: x['Quantity'] if x['Behavior'] == 'purchase' else x['Quantity'] / hours_played.loc[x['Game'], :]['Quantity'], axis=1)
        grouped_users = normalized_user_data.groupby(['UserID', 'Game']).aggregate({'Quantity': 'sum'})
        grouped_users = grouped_users.reset_index(level='Game')
        pivoted_users = grouped_users.reset_index()
        pivoted_users_og = pivoted_users.set_index(['UserID', 'Game'])
        pivoted_users_og = pivoted_users_og.unstack(fill_value=0)
        
        ####################################TESTING#########################################
        
        # SELECTION
            #for all rows, aggregate the sum of columns and count of 
            #     columns [1:n] that are not 0.0, create new two new columns
            #     "total_quantity" and "game_count"
        pivoted_users_og['total_quantity'] = pivoted_users_og.sum(axis=1)
        
        #Note for REMOVAL step: game_count + 1 (includes the userID)
        pivoted_users_og['game_count'] = pivoted_users_og.apply(lambda row: (row != 0.0).sum(), axis=1)
        
        # selection of top k and rand k users with their quantity 
        #   (for visualization/placement on chart) and game_count to avoid large index removal error
        pivoted_users_select = pivoted_users_og.sort_values(by=['total_quantity','game_count'], ascending=[False, False])
        new_df = pivoted_users_select.iloc[:, -2:].reset_index()
        del pivoted_users_select
        del pivoted_users_og
        # ERROR FIX on lossing the dataframe format in the LOCATE & RENAME step
        new_df = new_df.drop(new_df[(new_df['game_count'] == 2)].index)
        new_df['UserID'] = new_df['UserID'].astype(str)
        self.top_k_users = new_df.head(k).values.tolist()
        #echo print (can access elements as a 2d array)
        self.rand_k_users = new_df.sample(k).values.tolist()
        del new_df
        #echo print (can access elements as a 2d array)
        # LOCATE AND RENAME AND REMOVE
        self.k_users_testid = []
        self.k_users_dropped = []

        if selection == 'top':
            users = self.top_k_users
        elif selection == 'rand':
            users = self.rand_k_users
        
        for i in range(k):
            # LOCATE
            ACTUAL_user = grouped_users.loc[int(users[i][0])].reset_index()
            
            # RENAME
            ACTUAL_user.loc[ACTUAL_user['UserID'] == int(users[i][0]), 'UserID'] = 'TEST_' +str(users[i][0])
            TEST_user = ACTUAL_user.copy()
            TEST_userid = TEST_user['UserID'].values[0]
            self.k_users_testid.append(TEST_userid)

            # REMOVE
            if selection == 'rand' and users[i][2] <= self.NUMBER_OF_RECOMMENDATIONS:
                drop_rows = TEST_user.sample(n=users[i][2]-1)            
            else: 
                drop_rows = TEST_user.sample(n=self.NUMBER_OF_RECOMMENDATIONS)

            self.k_users_dropped.append(drop_rows['Game'].values.tolist())
            
            TEST_user_dropped = TEST_user.drop(drop_rows.index)
            
            del ACTUAL_user
            del TEST_user
            #concat this dataframe to the pivoted_users dataframe
            pivoted_users = pd.concat([pivoted_users, TEST_user_dropped], axis=0)

        #TESTING SETUP ENDS 
        pivoted_users = pivoted_users.set_index(['UserID', 'Game'])
        self.pivoted_users = pivoted_users.unstack(fill_value=0)
        
        del pivoted_users
        
        # Instaniate Models
        self.model_1 = Model_1(self.game_stats, self.pivoted_users)
        #self.model_2 = Model_2()
        self.model_3 = Model_3(self.pivoted_users)
        
        gc.collect()
    
    def evaluate(self, select: int) -> List[str]:
        print(self.k_users_dropped[select])
        
    def predict(self, k: int) -> List[str]:
        
        user_id = str(self.k_users_testid[0])
        
        number_non_zero_columns = self.pivoted_users.shape[1] - self.pivoted_users.loc[user_id].isin([0.0]).sum()

        recommendations = []
        if number_non_zero_columns >= self.USE_MODEL_2:
            print("Applying model 3")
            recommendations.extend(self.model_3.predict(user_id))
            if len(recommendations) >= self.NUMBER_OF_RECOMMENDATIONS:
                return recommendations
        if number_non_zero_columns >= self.USE_MODEL_1:
            print("Applying model 2")
            recommendations.extend(self.model_2.predict(user_id)[:self.NUMBER_OF_RECOMMENDATIONS - len(recommendations)])
            if len(recommendations) >= self.NUMBER_OF_RECOMMENDATIONS:
                return recommendations
        if number_non_zero_columns < self.USE_MODEL_1:
            print("Applying model 1")
            recommendations.extend(self.model_1.predict(user_id)[:self.NUMBER_OF_RECOMMENDATIONS - len(recommendations)])
            if len(recommendations) >= self.NUMBER_OF_RECOMMENDATIONS:
                return recommendations

In [57]:
k = 5
test = Testing('top', k)

In [59]:
k = 5

recomm = []
recomm = test.predict(k)
dropped = []
dropped = test.evaluate(0)
accuracy = []
tot_acc = 0


res = np.in1d(recomm, dropped)
hit_count = sum(res)
acc = hit_count/(len(recomm[0]))
accuracy.append(acc)
tot_acc += acc

print(tot_acc)


Applying model 3
["Penny Arcade's On the Rain-Slick Precipice of Darkness 3", 'Command and Conquer Red Alert 3', 'Afterfall InSanity Extended Edition', 'Gratuitous Space Battles', 'Dungeon Defenders', 'Cossacks II Napoleonic Wars', 'Aliens vs. Predator', 'Star Wars - Battlefront II', 'Primordia', 'SpellForce 2 Gold Edition', 'Paper Sorcerer', 'Scourge Outbreak', 'Red Orchestra 2 Heroes of Stalingrad - Single Player', 'Hearts of Iron II Complete', 'Arma 2', 'Pirates of Black Cove Gold', '16bit Trader', 'Sacred Gold', 'Particula', 'KnightShift']
0.0


In [7]:
class Model:
    def __init__(self) -> None:
        self.NUMBER_OF_RECOMMENDATIONS = 10
        self.USE_MODEL_1 = 20
        self.USE_MODEL_2 = 50

        # Download data as required

        if not os.path.exists('steam-games-dataset'):
            od.download( 
                "https://www.kaggle.com/datasets/fronkongames/steam-games-dataset/data") 
    
        if not os.path.exists('steam-video-games'):
            od.download( 
                "https://www.kaggle.com/datasets/tamber/steam-video-games/data") 
    
        if not os.path.exists('popularity-of-games-on-steam'):
            od.download( 
                "https://www.kaggle.com/datasets/michau96/popularity-of-games-on-steam")       
        
        # Transform Game Data
        game_stats = pd.read_csv('steam-games-dataset/games.csv', encoding='ISO-8859-1', usecols = [0, 1, 2, 22, 23, 28]) 
        tot_pos_reviews = (game_stats['Positive'].sum())
        game_stats['ranking'] = np.sqrt(game_stats['Positive']/tot_pos_reviews) * 100.0
        self.game_stats = game_stats
        
        # Transform User Data
        user_data = pd.read_csv('steam-video-games/steam-200k.csv', encoding='ISO-8859-1', usecols=[0, 1, 2, 3], names=['UserID', 'Game', 'Behavior', 'Quantity'])
        hours_played = user_data[user_data['Behavior'] == 'play'].groupby('Game').agg(np.mean).sort_values(by='Quantity', ascending=False).drop('UserID', axis='columns')
        normalized_user_data = user_data.copy()[user_data['Game'].isin(hours_played.index)]
        normalized_user_data['Quantity'] = normalized_user_data.apply(lambda x: x['Quantity'] if x['Behavior'] == 'purchase' else x['Quantity'] / hours_played.loc[x['Game'], :]['Quantity'], axis=1)
        grouped_users = normalized_user_data.groupby(['UserID', 'Game']).aggregate({'Quantity': 'sum'})
        grouped_users = grouped_users.reset_index(level='Game')
        pivoted_users = grouped_users.reset_index()
        pivoted_users = pivoted_users.set_index(['UserID', 'Game'])
        self.pivoted_users = pivoted_users.unstack(fill_value=0)
    
        # Instaniate Models
        self.model_1 = Model_1(self.game_stats, self.pivoted_users)
        self.model_2 = Model_2()
        self.model_3 = Model_3(self.pivoted_users)

    def predict(self, user_id: int) -> List[str]:
        number_non_zero_columns = self.pivoted_users.shape[1] - self.pivoted_users.loc[user_id].isin([0.0]).sum()

        recommendations = []
        if number_non_zero_columns >= self.USE_MODEL_2:
            recommendations.extend(self.model_3.predict(user_id))
            if len(recommendations) >= self.NUMBER_OF_RECOMMENDATIONS:
                return recommendations
        if number_non_zero_columns >= self.USE_MODEL_1:
            recommendations.extend(self.model_2.predict(user_id)[:self.NUMBER_OF_RECOMMENDATIONS - len(recommendations)])
            if len(recommendations) >= self.NUMBER_OF_RECOMMENDATIONS:
                return recommendations
        if number_non_zero_columns < self.USE_MODEL_1:
            recommendations.extend(self.model_1.predict(user_id)[:self.NUMBER_OF_RECOMMENDATIONS - len(recommendations)])
            if len(recommendations) >= self.NUMBER_OF_RECOMMENDATIONS:
                return recommendations

In [8]:
model = Model()

In [9]:
model.predict(185914106)

['Dead by Daylight',
 'Euro Truck Simulator 2',
 'New World',
 'RimWorld',
 'Destiny 2',
 'Hollow Knight',
 'ARK: Survival Evolved',
 'The Binding of Isaac: Rebirth',
 'Brawlhalla',
 'Life is Strange - Episode 1',
 'Mount & Blade II: Bannerlord',
 'Human: Fall Flat',
 'Hearts of Iron IV',
 'Risk of Rain 2',
 'Raft',
 'Red Dead Redemption 2',
 'Fallout 4',
 'Cyberpunk 2077',
 'DOOM Eternal',
 'Arma 3']