**Set-up (the following installations are adapted for Google Colab)** ⛳

In [None]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget https://downloads.apache.org/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar xf spark-3.4.1-bin-hadoop3.tgz
!pip install -q findspark

--2023-11-29 00:55:46--  https://downloads.apache.org/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
Resolving downloads.apache.org (downloads.apache.org)... 88.99.95.219, 135.181.214.104, 2a01:4f8:10a:201a::2, ...
Connecting to downloads.apache.org (downloads.apache.org)|88.99.95.219|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 388341449 (370M) [application/x-gzip]
Saving to: ‘spark-3.4.1-bin-hadoop3.tgz’


2023-11-29 00:55:59 (29.8 MB/s) - ‘spark-3.4.1-bin-hadoop3.tgz’ saved [388341449/388341449]



In [None]:
# Tell the Java and Spark launchers where the Colab installs live.
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.4.1-bin-hadoop3",
})

In [None]:
!pip install pyspark



In [None]:
# findspark.init() has to run first: the pyspark imports below must only
# happen after it has been called.
import findspark
findspark.init()

import sys
from pyspark.sql import SparkSession, functions, types

assert sys.version_info >= (3, 5)  # require Python 3.5 or newer

# Create (or reuse) the session, then keep a handle on its SparkContext and
# limit Spark's log output to warnings and above.
spark = (
    SparkSession.builder
    .appName('Steam review analysis')
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel('WARN')

**Data Cleaning & Preparation:** ⛳





> *Read the train and test datasets*








In [None]:
# Explicit schema shared by both CSV files: every column is a string except
# the integer sentiment label.
review_fields = [
    ('Review', types.StringType()),
    ('Translated_Review', types.StringType()),
    ('Cleaned_Review', types.StringType()),
    ('Sentiment_Score', types.IntegerType()),
    ('Game_Name', types.StringType()),
]
game_reviews_schema = types.StructType(
    [types.StructField(field_name, field_type) for field_name, field_type in review_fields]
)

# Training split (first line of the file is a header row).
game_reviews_train = (
    spark.read
    .schema(game_reviews_schema)
    .option('header', True)
    .csv('Train.csv')
)
game_reviews_train.show(5)

# Test split, read with the same schema and header handling.
game_reviews_test = (
    spark.read
    .schema(game_reviews_schema)
    .option('header', True)
    .csv('Test.csv')
)
game_reviews_test.show(5)

+--------------------+--------------------+--------------------+---------------+--------------+
|              Review|   Translated_Review|      Cleaned_Review|Sentiment_Score|     Game_Name|
+--------------------+--------------------+--------------------+---------------+--------------+
|It's full of bots...|It's full of bots...|s bots years grea...|              1|Team Fortress2|
|                 Yes|                 Yes|                 yes|              1|         GTA V|
|             Moogus.|             Moogus.|              moogus|              1|Team Fortress2|
|the worst game i ...|the worst game i ...|worst game played...|              0|          RUST|
|Pretty nice game....|Pretty nice game....|pretty nice game ...|              1|  Apex Legends|
+--------------------+--------------------+--------------------+---------------+--------------+
only showing top 5 rows

+--------------------+--------------------+--------------------+---------------+--------------------+
|        



> *Drop rows with NULL values*



In [None]:
# Keep only rows in which every column is populated, then preview each split.
train_no_null = game_reviews_train.na.drop()
test_no_null = game_reviews_test.na.drop()

train_no_null.show(5)
test_no_null.show(5)

+--------------------+--------------------+--------------------+---------------+--------------+
|              Review|   Translated_Review|      Cleaned_Review|Sentiment_Score|     Game_Name|
+--------------------+--------------------+--------------------+---------------+--------------+
|It's full of bots...|It's full of bots...|s bots years grea...|              1|Team Fortress2|
|                 Yes|                 Yes|                 yes|              1|         GTA V|
|             Moogus.|             Moogus.|              moogus|              1|Team Fortress2|
|the worst game i ...|the worst game i ...|worst game played...|              0|          RUST|
|Pretty nice game....|Pretty nice game....|pretty nice game ...|              1|  Apex Legends|
+--------------------+--------------------+--------------------+---------------+--------------+
only showing top 5 rows

+--------------------+--------------------+--------------------+---------------+--------------------+
|        



> *Drop unwanted columns*



In [None]:
# The translated and pre-cleaned review texts are dropped from both splits.
unused_columns = ("Translated_Review", "Cleaned_Review")
truncate_train = train_no_null.drop(*unused_columns)
truncate_test = test_no_null.drop(*unused_columns)



> *Combine train and test datasets*



In [None]:
# Stack the two splits into one DataFrame. union() matches columns by position
# and keeps duplicate rows; both inputs come from the same schema with the
# same columns dropped, so their column order is identical.
all_reviews = truncate_train.union(truncate_test)



> *Save the combined dataset to Parquet and read it back*



In [None]:
# Persist the combined reviews as Parquet (replacing any previous run's
# output), then continue from the Parquet copy.
all_reviews.write.mode('overwrite').parquet('reviews_parquet')

clean_review = spark.read.parquet('reviews_parquet')
clean_review.show(5)

record_count = clean_review.count()
print(f"This dataset contains {record_count} records.")

+--------------------+---------------+--------------+
|              Review|Sentiment_Score|     Game_Name|
+--------------------+---------------+--------------+
|It's full of bots...|              1|Team Fortress2|
|                 Yes|              1|         GTA V|
|             Moogus.|              1|Team Fortress2|
|the worst game i ...|              0|          RUST|
|Pretty nice game....|              1|  Apex Legends|
+--------------------+---------------+--------------+
only showing top 5 rows

This dataset contains 113094 records.




> *Normalize Game_Name: strip punctuation and whitespace, then lowercase*



In [None]:
# Matches any run of punctuation and/or whitespace characters.
pattern = "[\\p{Punct}\\s]+"

# Delete every match from Game_Name and lowercase what is left, storing the
# result in a new Clean_Game_Name column (Game_Name itself is kept).
stripped_name = functions.regexp_replace(clean_review["Game_Name"], pattern, "")
clean_review = clean_review.withColumn("Clean_Game_Name", functions.lower(stripped_name))
clean_review.show(5)

+--------------------+---------------+--------------+---------------+
|              Review|Sentiment_Score|     Game_Name|Clean_Game_Name|
+--------------------+---------------+--------------+---------------+
|It's full of bots...|              1|Team Fortress2|  teamfortress2|
|                 Yes|              1|         GTA V|           gtav|
|             Moogus.|              1|Team Fortress2|  teamfortress2|
|the worst game i ...|              0|          RUST|           rust|
|Pretty nice game....|              1|  Apex Legends|    apexlegends|
+--------------------+---------------+--------------+---------------+
only showing top 5 rows





> *Remove rows with review length < 2*



In [None]:
# Keep only reviews that are at least 2 characters long. The result is cached
# because it is used by more than one action below.
long_review = clean_review.filter(functions.length(clean_review['Review']) >= 2).cache()
long_review.show(5)

# count() is a Spark action; run it once and reuse the stored value for the
# message instead of triggering a second job.
num_row_after = long_review.count()
print(f"After filtering, this dataset contains {num_row_after} records.")

+--------------------+---------------+--------------+---------------+
|              Review|Sentiment_Score|     Game_Name|Clean_Game_Name|
+--------------------+---------------+--------------+---------------+
|It's full of bots...|              1|Team Fortress2|  teamfortress2|
|                 Yes|              1|         GTA V|           gtav|
|             Moogus.|              1|Team Fortress2|  teamfortress2|
|the worst game i ...|              0|          RUST|           rust|
|Pretty nice game....|              1|  Apex Legends|    apexlegends|
+--------------------+---------------+--------------+---------------+
only showing top 5 rows

After filtering, this dataset contains 112202 records.
