In [1]:
import pandas as pd
import time
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession, types
import pyspark.sql.functions as F
from  pyspark.sql.functions import col

# Locations of the cleaned Parquet datasets, relative to this notebook.
# For quicker experiments the reviews path can be narrowed to a few partitions, e.g.
#   "../data/cleaned_steam_reviews/game_id={70,240,420,620}"
steam_games_filepath = "../data/cleaned_steam_games"
steam_reviews_filepath = "../data/cleaned_steam_reviews"

# Create (or reuse) the Spark session shared by every cell below.
# maxToStringFields is raised so plans over the wide reviews schema are not truncated in debug output.
spark = (
    SparkSession.builder
    .appName("reviews")
    .config("spark.sql.debug.maxToStringFields", 100)
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/24 18:07:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/11/24 18:07:51 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/11/24 18:07:51 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/11/24 18:07:51 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
23/11/24 18:07:51 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
23/11/24 18:07:51 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.


## Average Recommendation Rate and Votes per Game

In [2]:
# Load the cleaned Steam reviews dataset (Parquet) as a Spark DataFrame.
steam_reviews = spark.read.parquet(steam_reviews_filepath)

                                                                                

In [3]:
# Print a sample of rows to inspect which columns are available.
steam_reviews.show()

+------+--------------------+---------+--------+-------------------------------------+-----------------+-----------------+-----------+-------------+-----------+-------------------+-------------+--------------+-----------------+---------------------------+-----------------+----------------------+------------------+-----------------------+------------------------------+-------------------------+------------------+-------+
|app_id|            app_name|review_id|language|                               review|timestamp_created|timestamp_updated|recommended|votes_helpful|votes_funny|weighted_vote_score|comment_count|steam_purchase|received_for_free|written_during_early_access|   author_steamid|author_num_games_owned|author_num_reviews|author_playtime_forever|author_playtime_last_two_weeks|author_playtime_at_review|author_last_played|game_id|
+------+--------------------+---------+--------+-------------------------------------+-----------------+-----------------+-----------+-------------+----

In [6]:
# Per-game averages for four selected games: keep only their reviews, then
# average the recommendation flag and the three vote metrics for each app_id.
# `recommended` is boolean, so it is cast to int first; its mean is then the
# share of reviews that recommend the game.
# NOTE(review): the displayed averages for votes_helpful, votes_funny and
# weighted_vote_score are in the hundreds of thousands - confirm the upstream
# cleaning handled outliers in these columns.
steam_reviews_votes = (
    steam_reviews
    .where(F.col("app_id").isin(582010, 240, 271590, 578080))
    .groupBy("app_id")
    .agg(
        F.avg(F.col("recommended").cast("int")).alias("recommended"),
        F.avg("votes_helpful").alias("votes_helpful"),
        F.avg("votes_funny").alias("votes_funny"),
        F.avg("weighted_vote_score").alias("weighted_vote_score"),
    )
)

In [8]:
# Run the aggregation and print one row of averages per selected game.
steam_reviews_votes.show()



+------+------------------+------------------+------------------+-------------------+
|app_id|       recommended|     votes_helpful|       votes_funny|weighted_vote_score|
+------+------------------+------------------+------------------+-------------------+
|578080|0.5443536358078113| 1418442.570487019| 896619.3629818892|  598288.9780154979|
|271590|0.8300795797561645|1664430.3329698893|1056727.3309614416|  699633.3845441695|
|582010|0.8535994544134058|1674042.2674139808|1120098.7417649452|  810485.6233716555|
|   240|0.9627381537628044|1496687.3917553672| 903456.5318264667|  564580.8643406603|
+------+------------------+------------------+------------------+-------------------+



                                                                                

## Total Recommendations per Genre

In [5]:
# Reload the reviews dataset for this section. This reads the same path as the
# load in the first section, so it is redundant when the notebook runs top to bottom.
steam_reviews = spark.read.parquet(steam_reviews_filepath)

                                                                                

In [2]:
# Load the cleaned Steam games dataset (Parquet); its `game_id` and
# `popular_tags` columns are used below to map games to tags.
steam_games = spark.read.parquet(steam_games_filepath)

                                                                                

In [7]:
# Count the positive reviews of every game (no app_id filter in this section).
# `recommended` is boolean; casting it to int lets the sum count the True values.
steam_reviews_recommended = (
    steam_reviews
    .groupBy("app_id")
    .agg(F.sum(F.col("recommended").cast("int")).alias("recommended"))
)

In [3]:
# Build a (game_id, tags) lookup with one row per entry of the comma-separated
# `popular_tags` column.
# - `explode` (rather than `explode_outer`) is enough: a game without a tag list
#   would only contribute a NULL tag, and the comparison below drops NULLs anyway.
# - The select keeps only `game_id` and the exploded tag, so `popular_tags`
#   needs no separate drop.
# - Rows whose tag is the literal string "null" are removed; presumably a
#   placeholder written by the upstream cleaning step - confirm.
# - Assumes tags are separated by bare commas (no surrounding spaces) - TODO
#   confirm, otherwise trim each tag before it is matched against genre names.
tags_map = (
    steam_games
    .select("game_id", F.explode(F.split(F.col("popular_tags"), ",")).alias("tags"))
    .where(F.col("tags") != "null")
)

In [12]:
# Tags that are treated as genres in the final table.
genres = [
    "Indie", "Action", "Adventure", "Casual", "Simulation",
    "Strategy", "RPG", "Singleplayer", "Early Access", "Great Soundtrack",
    "Atmospheric", "Multiplayer", "2D", "Free to Play", "Puzzle",
    "VR", "Violent", "Story Rich", "Difficult", "Fantasy",
]

# Attach every tag of a game to that game's positive-review count, add the
# counts up per tag, and keep only the tags listed in `genres`.
# A game contributes its full count to each of its tags, so the per-tag totals overlap.
genre_recommended = (
    steam_reviews_recommended
    .join(tags_map, steam_reviews_recommended["app_id"] == tags_map["game_id"])
    .groupBy("tags")
    .agg(F.sum("recommended").alias("recommended"))
    .filter(F.col("tags").isin(genres))
)

In [13]:
# Run the join and aggregation, then print the total per selected genre tag.
genre_recommended.show()

                                                                                

+----------------+-----------+
|            tags|recommended|
+----------------+-----------+
|       Difficult|    3386698|
|       Adventure|    8519864|
|              VR|     257194|
|     Atmospheric|    7832112|
|         Fantasy|    1748833|
|     Multiplayer|   10257550|
|              2D|    2784555|
|         Violent|    1356898|
|      Story Rich|    3223304|
|    Singleplayer|   11661784|
|Great Soundtrack|    6284463|
|          Puzzle|     961659|
|    Early Access|    3457825|
|          Casual|    3688050|
|        Strategy|    4953679|
|          Action|   11481408|
|           Indie|    6750653|
|    Free to Play|     557902|
|             RPG|    4447267|
|      Simulation|    5845971|
+----------------+-----------+

