In [15]:
import pandas as pd
import time
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession, types
from pyspark.sql import functions
from  pyspark.sql.functions import col
data_filepath = "../data/steam_reviews.csv"

In [None]:
steam_reviews_schema = types.StructType([
    types.StructField('_c0', types.StringType()),
    types.StructField('app_id', types.StringType()),
    types.StructField('app_name', types.StringType()),
    types.StructField('review_id', types.StringType()),
    types.StructField('language', types.StringType()),
    types.StructField('review', types.StringType()),
    types.StructField('timestamp_created', types.StringType()),
    types.StructField('timestamp_updated', types.StringType()),
    types.StructField('recommended', types.BooleanType()),
    types.StructField('votes_helpful', types.IntegerType()),
    types.StructField('votes_funny', types.IntegerType()),
    types.StructField('weighted_vote_score', types.FloatType()),
    types.StructField('comment_count', types.IntegerType()),
    types.StructField('steam_purchase', types.BooleanType()),
    types.StructField('received_for_free', types.BooleanType()),
    types.StructField('written_during_early_access', types.BooleanType()),
    types.StructField('author.steamid', types.StringType()),
    types.StructField('author.num_games_owned', types.IntegerType()),
    types.StructField('author.num_reviews', types.IntegerType()),
    types.StructField('author.playtime_forever', types.FloatType()),
    types.StructField('author.playtime_last_two_weeks', types.FloatType()),
    types.StructField('author.playtime_at_review', types.FloatType()),
    types.StructField('author.last_played', types.FloatType()),
])


In [2]:
#spark = SparkSession.builder.appName('games').getOrCreate()
spark = (
    SparkSession.builder.config("spark.sql.debug.maxToStringFields", 100)
    .appName("reviews")
    .getOrCreate()
)

steam_reviews = spark.read.format('csv') \
                    .schema(steam_reviews_schema) \
                    .option('header', 'true') \
                    .load(data_filepath)

steam_reviews.printSchema()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/26 13:23:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


root
 |-- _c0: string (nullable = true)
 |-- app_id: string (nullable = true)
 |-- app_name: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- language: string (nullable = true)
 |-- review: string (nullable = true)
 |-- timestamp_created: string (nullable = true)
 |-- timestamp_updated: string (nullable = true)
 |-- recommended: string (nullable = true)
 |-- votes_helpful: string (nullable = true)
 |-- votes_funny: string (nullable = true)
 |-- weighted_vote_score: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- steam_purchase: string (nullable = true)
 |-- received_for_free: string (nullable = true)
 |-- written_during_early_access: string (nullable = true)
 |-- author.steamid: string (nullable = true)
 |-- author.num_games_owned: string (nullable = true)
 |-- author.num_reviews: string (nullable = true)
 |-- author.playtime_forever: string (nullable = true)
 |-- author.playtime_last_two_weeks: string (nullable = true)
 |-- author.playt

## Clean Data

1. 

In [23]:
steam_reviews.select([functions.count(functions.when(functions.isnan("`" + c + "`"), "`" + c + "`")).alias("`" + c + "`") for c in steam_reviews.columns]).show()

23/10/26 13:52:33 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , app_id, app_name, review_id, language, review, timestamp_created, timestamp_updated, recommended, votes_helpful, votes_funny, weighted_vote_score, comment_count, steam_purchase, received_for_free, written_during_early_access, author.steamid, author.num_games_owned, author.num_reviews, author.playtime_forever, author.playtime_last_two_weeks, author.playtime_at_review, author.last_played
 Schema: _c0, app_id, app_name, review_id, language, review, timestamp_created, timestamp_updated, recommended, votes_helpful, votes_funny, weighted_vote_score, comment_count, steam_purchase, received_for_free, written_during_early_access, author.steamid, author.num_games_owned, author.num_reviews, author.playtime_forever, author.playtime_last_two_weeks, author.playtime_at_review, author.last_played
Expected: _c0 but found: 
CSV file: file:///Users/alexto/Documents/ProgrammingProjects/Tech-Job-Market-Trends/dat

+-----+--------+----------+-----------+----------+--------+-------------------+-------------------+-------------+---------------+-------------+---------------------+---------------+----------------+-------------------+-----------------------------+----------------+------------------------+--------------------+-------------------------+--------------------------------+---------------------------+--------------------+
|`_c0`|`app_id`|`app_name`|`review_id`|`language`|`review`|`timestamp_created`|`timestamp_updated`|`recommended`|`votes_helpful`|`votes_funny`|`weighted_vote_score`|`comment_count`|`steam_purchase`|`received_for_free`|`written_during_early_access`|`author.steamid`|`author.num_games_owned`|`author.num_reviews`|`author.playtime_forever`|`author.playtime_last_two_weeks`|`author.playtime_at_review`|`author.last_played`|
+-----+--------+----------+-----------+----------+--------+-------------------+-------------------+-------------+---------------+-------------+-----------------

                                                                                

In [24]:
steam_reviews.select([functions.count(functions.when(functions.isnull("`" + c + "`"), "`" + c + "`")).alias("`" + c + "`") for c in steam_reviews.columns]).show()

23/10/26 13:58:21 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , app_id, app_name, review_id, language, review, timestamp_created, timestamp_updated, recommended, votes_helpful, votes_funny, weighted_vote_score, comment_count, steam_purchase, received_for_free, written_during_early_access, author.steamid, author.num_games_owned, author.num_reviews, author.playtime_forever, author.playtime_last_two_weeks, author.playtime_at_review, author.last_played
 Schema: _c0, app_id, app_name, review_id, language, review, timestamp_created, timestamp_updated, recommended, votes_helpful, votes_funny, weighted_vote_score, comment_count, steam_purchase, received_for_free, written_during_early_access, author.steamid, author.num_games_owned, author.num_reviews, author.playtime_forever, author.playtime_last_two_weeks, author.playtime_at_review, author.last_played
Expected: _c0 but found: 
CSV file: file:///Users/alexto/Documents/ProgrammingProjects/Tech-Job-Market-Trends/dat

+-----+--------+----------+-----------+----------+--------+-------------------+-------------------+-------------+---------------+-------------+---------------------+---------------+----------------+-------------------+-----------------------------+----------------+------------------------+--------------------+-------------------------+--------------------------------+---------------------------+--------------------+
|`_c0`|`app_id`|`app_name`|`review_id`|`language`|`review`|`timestamp_created`|`timestamp_updated`|`recommended`|`votes_helpful`|`votes_funny`|`weighted_vote_score`|`comment_count`|`steam_purchase`|`received_for_free`|`written_during_early_access`|`author.steamid`|`author.num_games_owned`|`author.num_reviews`|`author.playtime_forever`|`author.playtime_last_two_weeks`|`author.playtime_at_review`|`author.last_played`|
+-----+--------+----------+-----------+----------+--------+-------------------+-------------------+-------------+---------------+-------------+-----------------

                                                                                

In [4]:
steam_reviews.show(10)

+--------------------+------+--------------------+---------+--------+----------------------------------+-----------------+-----------------+-----------+-------------+-----------+-------------------+-------------+--------------+-----------------+---------------------------+-----------------+----------------------+------------------+-----------------------+------------------------------+-------------------------+------------------+
|                 _c0|app_id|            app_name|review_id|language|                            review|timestamp_created|timestamp_updated|recommended|votes_helpful|votes_funny|weighted_vote_score|comment_count|steam_purchase|received_for_free|written_during_early_access|   author.steamid|author.num_games_owned|author.num_reviews|author.playtime_forever|author.playtime_last_two_weeks|author.playtime_at_review|author.last_played|
+--------------------+------+--------------------+---------+--------+----------------------------------+-----------------+----------

23/10/26 13:27:39 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , app_id, app_name, review_id, language, review, timestamp_created, timestamp_updated, recommended, votes_helpful, votes_funny, weighted_vote_score, comment_count, steam_purchase, received_for_free, written_during_early_access, author.steamid, author.num_games_owned, author.num_reviews, author.playtime_forever, author.playtime_last_two_weeks, author.playtime_at_review, author.last_played
 Schema: _c0, app_id, app_name, review_id, language, review, timestamp_created, timestamp_updated, recommended, votes_helpful, votes_funny, weighted_vote_score, comment_count, steam_purchase, received_for_free, written_during_early_access, author.steamid, author.num_games_owned, author.num_reviews, author.playtime_forever, author.playtime_last_two_weeks, author.playtime_at_review, author.last_played
Expected: _c0 but found: 
CSV file: file:///Users/alexto/Documents/ProgrammingProjects/Tech-Job-Market-Trends/dat

In [10]:
playerTypeData = steam_reviews.select("app_id", "app_name", "recommended", "comment_count", "`author.steamid`",
                                      "`author.num_games_owned`", "`author.num_reviews`", "`author.playtime_forever`",
                                      "`author.playtime_last_two_weeks`", "`author.playtime_at_review`", "`author.last_played`").cache()

In [11]:
playerTypeData.show(10)

[Stage 3:>                                                          (0 + 1) / 1]

+------+--------------------+-----------+-------------+-----------------+----------------------+------------------+-----------------------+------------------------------+-------------------------+------------------+
|app_id|            app_name|recommended|comment_count|   author.steamid|author.num_games_owned|author.num_reviews|author.playtime_forever|author.playtime_last_two_weeks|author.playtime_at_review|author.last_played|
+------+--------------------+-----------+-------------+-----------------+----------------------+------------------+-----------------------+------------------------------+-------------------------+------------------+
|292030|The Witcher 3: Wi...|       True|            0|76561199095369542|                     6|                 2|                 1909.0|                        1448.0|                   1909.0|      1611343383.0|
|292030|The Witcher 3: Wi...|       True|            0|76561198949504115|                    30|                10|                 2764

                                                                                

In [20]:
review_count = playerTypeData.where(col("app_name").isNotNull()) \
                              .groupBy(playerTypeData['app_name']).agg(functions.count("*").alias("count")) \
                              .where(col("count") > 100) \
                              .select(col("app_name"), col("count")) \
                              .orderBy(functions.desc(col("count")))

In [21]:
review_count.show()

23/10/26 13:47:39 WARN MemoryStore: Not enough space to cache rdd_19_5 in memory! (computed 25.5 MiB so far)
23/10/26 13:47:39 WARN MemoryStore: Not enough space to cache rdd_19_1 in memory! (computed 25.9 MiB so far)
23/10/26 13:47:39 WARN MemoryStore: Not enough space to cache rdd_19_0 in memory! (computed 22.5 MiB so far)
23/10/26 13:47:39 WARN MemoryStore: Not enough space to cache rdd_19_3 in memory! (computed 31.0 MiB so far)
23/10/26 13:47:39 WARN MemoryStore: Not enough space to cache rdd_19_7 in memory! (computed 33.4 MiB so far)
23/10/26 13:47:39 WARN MemoryStore: Not enough space to cache rdd_19_4 in memory! (computed 47.4 MiB so far)
23/10/26 13:47:39 WARN MemoryStore: Not enough space to cache rdd_19_6 in memory! (computed 47.1 MiB so far)
23/10/26 13:47:41 WARN MemoryStore: Not enough space to cache rdd_19_14 in memory! (computed 25.5 MiB so far)
23/10/26 13:47:41 WARN MemoryStore: Not enough space to cache rdd_19_8 in memory! (computed 28.3 MiB so far)
23/10/26 13:47:41 

+--------------------+-------+
|            app_name|  count|
+--------------------+-------+
|PLAYERUNKNOWN'S B...|1644255|
|  Grand Theft Auto V|1019116|
|Tom Clancy's Rain...| 841918|
|            Terraria| 672817|
|         Garry's Mod| 655524|
|                Rust| 549075|
|       Rocket League| 498565|
|            PAYDAY 2| 487747|
|            Among Us| 485293|
|The Witcher 3: Wi...| 469395|
|    Dead by Daylight| 418897|
|ARK: Survival Evo...| 400009|
|Euro Truck Simula...| 387553|
|      Stardew Valley| 315717|
|The Elder Scrolls...| 294966|
|    Wallpaper Engine| 292790|
|Monster Hunter: W...| 290946|
|       Hollow Knight| 269854|
|          The Forest| 239734|
|Don't Starve Toge...| 238636|
+--------------------+-------+
only showing top 20 rows



                                                                                