In [24]:
import sys
assert sys.version_info >= (3, 5) # make sure we have Python 3.5+
from pyspark.sql import SparkSession, functions, types

import os

In [25]:
# Create (or reuse) the Spark session for this notebook and keep a handle on its context.
spark = (
    SparkSession.builder
    .appName('top10 game review')
    .getOrCreate()
)
sc = spark.sparkContext
# Limit Spark's log output to warnings and errors.
sc.setLogLevel('WARN')

## Data Cleaning & Preparation

    Read training dataset

In [26]:
# Explicit schema for the review CSV files: four string columns and an
# integer sentiment score.
game_reviews_schema = (
    types.StructType()
    .add('Review', types.StringType())
    .add('Translated_Review', types.StringType())
    .add('Cleaned_Review', types.StringType())
    .add('Sentiment_Score', types.IntegerType())
    .add('Game_Name', types.StringType())
)

# Read the training split (the file has a header row) with the schema above,
# cache it, and preview the first 20 rows.
game_reviews_train = spark.read.csv(
    '../Top10_Game_Review/Train.csv',
    schema=game_reviews_schema,
    header=True,
).cache()
game_reviews_train.show(20)

+--------------------+--------------------+--------------------+---------------+--------------------+
|              Review|   Translated_Review|      Cleaned_Review|Sentiment_Score|           Game_Name|
+--------------------+--------------------+--------------------+---------------+--------------------+
|It's full of bots...|It's full of bots...|s bots years grea...|              1|      Team Fortress2|
|                 Yes|                 Yes|                 yes|              1|               GTA V|
|             Moogus.|             Moogus.|              moogus|              1|      Team Fortress2|
|the worst game i ...|the worst game i ...|worst game played...|              0|                RUST|
|Pretty nice game....|Pretty nice game....|pretty nice game ...|              1|        Apex Legends|
|It's about drive,...|It's about drive,...|     s drive s power|              1|        Apex Legends|
|           best game|           best game|           best game|              1|  

23/10/29 14:16:56 WARN CacheManager: Asked to cache already cached data.


    Read testing dataset

In [27]:
# Read the testing split with the same schema as the training split,
# cache it, and preview the first 20 rows.
game_reviews_test = spark.read.csv(
    '../Top10_Game_Review/Test.csv',
    schema=game_reviews_schema,
    header=True,
).cache()
game_reviews_test.show(20)

+--------------------+--------------------+--------------------+---------------+--------------------+
|              Review|   Translated_Review|      Cleaned_Review|Sentiment_Score|           Game_Name|
+--------------------+--------------------+--------------------+---------------+--------------------+
|              peepee|               Pepee|               pepee|              1|        Apex Legends|
|Painfully horrend...|Painfully horrend...|painfully horrend...|              0|PlayerUnknown's B...|
|              its ok|              its ok|                  ok|              1|        Apex Legends|
|           Nice game|           Nice game|           nice game|              1|      Team Fortress2|
|they copied many ...|they copied many ...|copied things gmo...|              0|      Team Fortress2|
|                 yes|                 yes|                 yes|              1|      Team Fortress2|
|                  ツ|                 TSU|                 tsu|              1|   

23/10/29 14:16:56 WARN CacheManager: Asked to cache already cached data.


    Drop rows with NULL values

In [28]:
# Keep only the training rows in which no column is NULL.
train_no_null = game_reviews_train.na.drop()
train_no_null.show(20)

+--------------------+--------------------+--------------------+---------------+--------------------+
|              Review|   Translated_Review|      Cleaned_Review|Sentiment_Score|           Game_Name|
+--------------------+--------------------+--------------------+---------------+--------------------+
|It's full of bots...|It's full of bots...|s bots years grea...|              1|      Team Fortress2|
|                 Yes|                 Yes|                 yes|              1|               GTA V|
|             Moogus.|             Moogus.|              moogus|              1|      Team Fortress2|
|the worst game i ...|the worst game i ...|worst game played...|              0|                RUST|
|Pretty nice game....|Pretty nice game....|pretty nice game ...|              1|        Apex Legends|
|It's about drive,...|It's about drive,...|     s drive s power|              1|        Apex Legends|
|           best game|           best game|           best game|              1|  

In [29]:
# Keep only the testing rows in which no column is NULL.
test_no_null = game_reviews_test.na.drop()
test_no_null.show(20)

+--------------------+--------------------+--------------------+---------------+--------------------+
|              Review|   Translated_Review|      Cleaned_Review|Sentiment_Score|           Game_Name|
+--------------------+--------------------+--------------------+---------------+--------------------+
|              peepee|               Pepee|               pepee|              1|        Apex Legends|
|Painfully horrend...|Painfully horrend...|painfully horrend...|              0|PlayerUnknown's B...|
|              its ok|              its ok|                  ok|              1|        Apex Legends|
|           Nice game|           Nice game|           nice game|              1|      Team Fortress2|
|they copied many ...|they copied many ...|copied things gmo...|              0|      Team Fortress2|
|                 yes|                 yes|                 yes|              1|      Team Fortress2|
|                  ツ|                 TSU|                 tsu|              1|   

    Check whether the two datasets contain any other missing values (literal "null"/"NULL" strings)

In [30]:
def is_contain_none(df):
    """Report which columns of ``df`` hold the literal strings "null" or "NULL".

    For every column, the rows whose value equals "null" or "NULL" are counted.
    When a column has such rows, their share of all rows is printed. The
    all-clear message is printed only when no column had any such rows.

    Args:
        df: DataFrame to inspect; only ``columns``, ``filter`` and ``count``
            are used.

    Returns:
        None. The findings are printed.
    """
    total_rows = None  # computed lazily, and only once, when a column has matches
    found_missing = False
    for col in df.columns:
        column = functions.col(col)
        # A single count() replaces the former isEmpty() + count() pair.
        null_count = df.filter((column == "null") | (column == "NULL")).count()
        if null_count == 0:
            continue
        if total_rows is None:
            total_rows = df.count()
        found_missing = True
        print(f'Column "{col}" CONTAINS {null_count / total_rows:.2%} null values')
    # Only claim the dataset is clean when nothing was reported above.
    if not found_missing:
        print("Good! There is NO missing value")

# Run the literal "null"/"NULL" check on both datasets, each under its own banner.
# NOTE(review): the check runs on the frames as read from CSV (game_reviews_train /
# game_reviews_test), not on the NULL-dropped train_no_null / test_no_null frames
# used downstream — confirm this is intended.
print("----------Summary of game_review_train dataset----------")
is_contain_none(game_reviews_train)
print(" ") 
print("----------Summary of game_review_test dataset------------")
is_contain_none(game_reviews_test) 

----------Summary of game_review_train dataset----------
Good! There is NO missing value
 
----------Summary of game_review_test dataset------------
Good! There is NO missing value


    Drop some columns

In [34]:
# Remove the dataset's own Translated_Review / Cleaned_Review columns from both
# NULL-free splits and cache the results.
clean_review_train = (
    train_no_null
    .drop("Translated_Review", "Cleaned_Review")
    .cache()
)
clean_review_test = (
    test_no_null
    .drop("Translated_Review", "Cleaned_Review")
    .cache()
)

# Preview the first 10 rows of each reduced frame.
clean_review_train.show(10)
clean_review_test.show(10)

+--------------------+---------------+--------------------+
|              Review|Sentiment_Score|           Game_Name|
+--------------------+---------------+--------------------+
|It's full of bots...|              1|      Team Fortress2|
|                 Yes|              1|               GTA V|
|             Moogus.|              1|      Team Fortress2|
|the worst game i ...|              0|                RUST|
|Pretty nice game....|              1|        Apex Legends|
|It's about drive,...|              1|        Apex Legends|
|           best game|              1|                RUST|
|damn give me my m...|              0|PlayerUnknown's B...|
|it is a fun time ...|              1|        Apex Legends|
|Only 9 heroes. An...|              0|  NARAKA: BLADEPOINT|
+--------------------+---------------+--------------------+
only showing top 10 rows

+--------------------+---------------+--------------------+
|              Review|Sentiment_Score|           Game_Name|
+-------------

    Add a normalized Game_Name column (lowercase, punctuation and whitespace removed) that is ready for joins

In [45]:
# Matches one or more punctuation or whitespace characters.
pattern = "[\\p{Punct}\\s]+"

# Join key: Game_Name with every pattern match removed, then lower-cased.
clean_game_name_expr = functions.lower(
    functions.regexp_replace(functions.col("Game_Name"), pattern, "")
)

clean_review_train = (
    clean_review_train
    .withColumn("Clean_Game_Name", clean_game_name_expr)
    .cache()
)

clean_review_test = (
    clean_review_test
    .withColumn("Clean_Game_Name", clean_game_name_expr)
    .cache()
)

    Translate game reviews to English using the "googletrans" library
        pip install googletrans==4.0.0-rc1

In [53]:
# RUN this to import translator
from googletrans import Translator

@functions.udf(returnType=types.StringType())
def translate_to_english(text):
    """Translate ``text`` to English with googletrans, auto-detecting the source language.

    Args:
        text: The review text to translate; may be None.

    Returns:
        The translated string, or None when ``text`` is None or the
        translation attempt raises an exception (best-effort).
    """
    if text is None:
        return None  # nothing to translate; skip the translator call entirely
    try:
        # Built inside the try so that a failure while creating the client is
        # treated like any other failed translation.
        translator = Translator()
        return translator.translate(text, src='auto', dest='en').text
    except Exception:
        # Deliberately best-effort. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt / SystemExit propagate.
        return None  # Return None if translation fails
# Add the English translation of each review. Rows for which the UDF returned
# None (or that have a NULL in any other column) are removed by the NULL drop.
clean_review_train = (
    clean_review_train
    .withColumn("Translated_Review", translate_to_english(functions.col("Review")))
    .na.drop()
)
clean_review_test = (
    clean_review_test
    .withColumn("Translated_Review", translate_to_english(functions.col("Review")))
    .na.drop()
)

# Combine train and test datasets

In [56]:
# print(f'The number of records in clean_review_train are {str(clean_review_train.count())}')
# print(f'The number of records in clean_review_test are {str(clean_review_test.count())}')

# Stack the train and test reviews into one cached frame. Columns are matched by
# name rather than by position, so a difference in column order between the two
# frames cannot silently misalign the data.
all_game_reviews = clean_review_train.unionByName(clean_review_test).cache()
# print("-----------------------------------------------------------------------------------")
#print(f'The number of records in all_game_reviews are {str(all_game_reviews.count())}')

23/10/29 14:47:45 WARN CacheManager: Asked to cache already cached data.


[Stage 178:>  (0 + 7) / 8][Stage 179:>  (0 + 0) / 8][Stage 180:>  (0 + 0) / 6]

Exploratory Data Analysis