In [None]:
# Imports
import pandas as pd
import dask.dataframe as dd
import numpy as np
from surprise import Dataset
from surprise.reader import Reader
from surprise.prediction_algorithms.matrix_factorization import SVD as FunkSVD
import scipy as sp

In [None]:
# Loads reviews dataframe.
# Only the three columns used by the rest of the notebook are read, so the remaining
# review columns are never materialised in memory by .compute().
# NOTE(review): dask is expected to restore the stored index (recommendationid) alongside
# the selected columns -- confirm against the parquet metadata.
df_reviews = dd.read_parquet(
    "../../data/reviews.parquet",
    columns=['author_steamid', 'steam_appid', 'voted_up'],
).compute()

In [None]:
# Keeps only the reviewer id, the app id and the recommendation flag
df_reviews = df_reviews.loc[:, ['author_steamid', 'steam_appid', 'voted_up']]
df_reviews.head()

Unnamed: 0_level_0,author_steamid,steam_appid,voted_up
recommendationid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
92426316,76561197991126058,1000000,True
92738312,76561198043369921,1000000,True
95149549,76561198308633526,1000000,False
116584366,76561197990036156,1000000,False
129441214,76561198281676677,1000000,False


In [None]:
# Reads the store metadata for the games
df_games = pd.read_parquet("../../data/store_info.parquet")

# Attaches each game's name to its reviews as `app_name`.
# merge() would otherwise discard the review index, so recommendationid is moved
# into a column before the merge and restored as the index afterwards.
df_reviews = (
    df_reviews
    .reset_index()
    .merge(df_games[['name']], left_on='steam_appid', right_on='steam_appid')
    .set_index('recommendationid')
    .rename(columns={"name": "app_name"})
)

# Shows the first rows of the merged result
df_reviews.head()

Unnamed: 0_level_0,author_steamid,steam_appid,voted_up,app_name
recommendationid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
92426316,76561197991126058,1000000,True,ASCENXION
92738312,76561198043369921,1000000,True,ASCENXION
95149549,76561198308633526,1000000,False,ASCENXION
116584366,76561197990036156,1000000,False,ASCENXION
129441214,76561198281676677,1000000,False,ASCENXION


In [None]:
# Loads the graph-derived clusters (positive and negative review graphs)

# Lazy dask reads of both cluster files
ddf_clusters_postive = dd.read_parquet("../../models/clusters/positive_clusters.parquet")
ddf_clusters_negative = dd.read_parquet("../../models/clusters/negative_clusters.parquet")

# Suffixes the cluster columns with the graph they come from, so both sets can sit side by side
ddf_clusters_postive = ddf_clusters_postive.rename(
    columns=dict(author_cluster='author_cluster_positive', app_cluster='app_cluster_positive')
)
ddf_clusters_negative = ddf_clusters_negative.rename(
    columns=dict(author_cluster='author_cluster_negative', app_cluster='app_cluster_negative')
)

# Executes the delayed dask instructions, yielding in-memory pandas dataframes.
# (Dask is probably unnecessary here; the negative clusters are not currently in use.)
df_clusters_positive = ddf_clusters_postive.compute()
df_clusters_negative = ddf_clusters_negative.compute()

In [None]:
# Adds the positive- and negative-graph cluster labels to the reviews (joined on the index)
df_reviews = (
    df_reviews
    .join(df_clusters_positive[['author_cluster_positive', 'app_cluster_positive']])
    .join(df_clusters_negative[['author_cluster_negative', 'app_cluster_negative']])
)

# Drops the standalone cluster dataframes now that their columns live in df_reviews
del df_clusters_positive
del df_clusters_negative
df_reviews.head()

Unnamed: 0_level_0,author_steamid,steam_appid,voted_up,app_name,author_cluster_positive,app_cluster_positive,author_cluster_negative,app_cluster_negative
recommendationid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
92426316,76561197991126058,1000000,True,ASCENXION,0.0,3.0,,
92738312,76561198043369921,1000000,True,ASCENXION,0.0,3.0,,
95149549,76561198308633526,1000000,False,ASCENXION,1.0,3.0,0.0,0.0
116584366,76561197990036156,1000000,False,ASCENXION,0.0,3.0,0.0,0.0
129441214,76561198281676677,1000000,False,ASCENXION,0.0,3.0,0.0,0.0


In [None]:
def get_recs(df_reviews:pd.DataFrame, current_author: int) -> pd.DataFrame:
    """Generates game recommendations for one author via collaborative filtering.

    The reviews are restricted to the author's positive-graph cluster (or left whole
    when the author has no cluster), a Funk SVD is fitted on the boolean `voted_up`
    ratings, and every game the author has not reviewed is scored by the estimated
    rating multiplied by a popularity metric.

    Args:
        df_reviews (pd.DataFrame): Reviews dataframe. Must contain the columns
            'author_steamid', 'app_name', 'voted_up' and 'author_cluster_positive'.
        current_author (int): Steam id of current author (as found in reviews dataframe)

    Returns:
        pd.DataFrame: One row per candidate game, sorted by 'score' descending, with columns
            'app' (game name), 'probability' (estimated rating from the SVD),
            'count' (number of reviews of the game in the subset),
            'voted_up' (fraction of those reviews that are positive),
            'popularity' and 'score' (probability * popularity).

    Raises:
        ValueError: If `current_author` has no reviews in `df_reviews`.
    """
    # Rows written by the requested author
    is_current_author = df_reviews['author_steamid'] == current_author
    if not is_current_author.any():
        # Fail before the (expensive) fit rather than at the inner-id lookup afterwards
        raise ValueError(f"Author {current_author} has no reviews in the reviews dataframe.")

    # Gets cluster of current author. The most frequent label is used instead of the mean:
    # averaging labels could produce a value that is not a cluster id at all.
    author_clusters = df_reviews.loc[is_current_author, 'author_cluster_positive'].dropna()

    if author_clusters.empty:
        # If author has no cluster, does not subset reviews dataframe
        df_subset = df_reviews
    else:
        # If author has a cluster, reviews dataframe is subset to reviews within author's cluster
        current_author_cluster = author_clusters.mode().iloc[0]
        df_subset = df_reviews[df_reviews['author_cluster_positive'] == current_author_cluster]

    # Trims subset to only relevant columns (user, item, rating -- the order surprise expects)
    df_subset = df_subset[['author_steamid','app_name','voted_up']]

    # Loads data for funk SVD
    svd_data = Dataset.load_from_df(df_subset, Reader(rating_scale=(0,1)))
    svd_data_train = svd_data.build_full_trainset()

    # Calculates number of latent factors: truncated square root of the harmonic mean of the
    # author and app counts (maybe not the best approach)
    n_authors = df_subset['author_steamid'].nunique()
    n_apps = df_subset['app_name'].nunique()
    n_latents = int((2/(n_authors**-1+n_apps**-1))**0.5)

    # Instantiates SVD
    my_algorithm = FunkSVD(n_factors=n_latents,
                        n_epochs=100,
                        biased=False,  # This forces the algorithm to store all latent information in the matrices
                        verbose=0,
                        random_state=42)

    # Fits
    my_algorithm.fit(svd_data_train)

    # Inner (trainset) ids of the current author and of every app in the subset
    uid = svd_data_train.to_inner_uid(current_author)
    apps = df_subset['app_name'].unique()
    inner_iids = [svd_data_train.to_inner_iid(app) for app in apps]

    # Estimated rating of every app for this author: item factors times the author's factor vector.
    # Ratings in the training data are boolean, so these estimates can be loosely read as the
    # probability of recommendation (they are raw dot products and are not clipped to [0, 1]).
    scores = my_algorithm.qi[inner_iids] @ my_algorithm.pu[uid]

    # Combines probability/app arrays into dataframe.
    df_recs = pd.DataFrame({'app':apps,
                        'probability':scores})

    # Removes games author has reviewed from recommendations (though these might be useful for scoring the model).
    # All of the author's reviews are used, not only the ones that fall inside the cluster subset.
    reviewed_apps = df_reviews.loc[is_current_author, 'app_name']
    df_recs = df_recs[~df_recs['app'].isin(reviewed_apps)]

    # Adds, per game in the current subset, the number of reviews ('count') and the
    # fraction of them that are positive ('voted_up')
    app_stats = df_subset.groupby('app_name').agg(count=('voted_up', 'size'),
                                                  voted_up=('voted_up', 'mean'))
    df_recs = df_recs.merge(app_stats, left_on='app', right_index=True)

    # Removes games with only negative reviews and games with fewer than 10 reviews.
    # The 10-review floor also keeps log10(log10(count)) below from going negative.
    # .copy() so the new columns below are written to an independent frame, not a slice.
    df_recs = df_recs[(df_recs['voted_up'] > 0) & (df_recs['count'] >= 10)].copy()

    # Calculates a "popularity" metric derived from review count and the percent of reviews that are positive.
    # I should add an appendix figure or something that shows what this metric evaluates to at different recommendation percents and review counts.
    log_count = np.log10(df_recs['count'])
    df_recs['popularity'] = df_recs['voted_up']**log_count*np.log10(log_count)

    # Calculates a final score by multiplying the recommendation probability with the popularity metric.
    df_recs['score'] = df_recs['probability'] * df_recs['popularity']

    return df_recs.sort_values("score", ascending=False)

In [None]:
# Draws one random author (seeded, so the pick is repeatable) and shows their top
# recommendations followed by the games they have actually reviewed.
for current_author in np.random.default_rng(seed=42).choice(
    df_reviews['author_steamid'].unique(),
    size=1,
    replace=False,
):
    df_recs = get_recs(df_reviews, current_author)
    display(df_recs.head())
    display(df_reviews.loc[df_reviews['author_steamid'] == current_author, ['app_name', 'voted_up']])

Unnamed: 0,app,probability,count,voted_up,popularity,score
13831,Noita,0.979946,18903,0.983124,0.586789,0.575022
1791,Gunfire Reborn,0.939093,19563,0.989981,0.605849,0.568948
1233,Hades,0.983846,16817,0.980734,0.576507,0.567194
4311,Vampire Survivors,0.838942,60120,0.995542,0.664988,0.557886
9305,Factorio,0.996368,9862,0.977388,0.548901,0.546908


Unnamed: 0_level_0,app_name,voted_up
recommendationid,Unnamed: 1_level_1,Unnamed: 2_level_1
37182673,Terraria,True
104756108,Mindustry,True
27095346,FTL: Faster Than Light,True
46458926,SYNTHETIK: Legion Rising,True
87578290,Valheim,True
