In [17]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("AddLabel")
    .getOrCreate()
)

# Read the English reviews export; the first line is a header and column
# types are inferred from the data
review = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("gs://msca-bdp-student-gcs/Group5/english_reviews/part-00000-dbebe656-10b1-42c2-93d1-bf55b07e437d-c000.csv")
)

# Read the game metadata export with the same reader settings
game = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("gs://msca-bdp-student-gcs/Group5/game_info/part-00000-4c409236-3e37-4b2d-8927-474fa4977f7b-c000.csv")
)

                                                                                

In [14]:
# NOTE: every line in this cell is commented out, so nothing here executes.
# The sketch below pairs each row of `game` with a positional index via
# zipWithIndex(), keeps only rows whose index is greater than 1, and previews
# the result. No later cell in this notebook uses `games`.

# Add an index column to game DataFrame
#game_with_index = game.rdd.zipWithIndex().toDF(["row", "index"])

# Filter out the first two rows
#games = game_with_index.filter(game_with_index["index"] > 1).select("row.*")

# Show the resulting game DataFrame after deleting the first two rows
#games.show(3)

In [18]:
# Peek at the first two review rows to check the inferred schema
review.take(2)

[Row(recommendationid=148912575, appid=10, game='Counter-Strike', author_steamid=76561198363716821, author_num_games_owned=0, author_num_reviews=2, author_playtime_forever=197, author_playtime_last_two_weeks=41, author_playtime_at_review=197, author_last_played=1698329401, language='english', review='GOAT Game !!', timestamp_created='1698329419', timestamp_updated='1698329419', voted_up='1', votes_up='0', votes_funny='0', weighted_vote_score='0.0', comment_count='0', steam_purchase='1', received_for_free='0', written_during_early_access='0', hidden_in_steam_china='1', steam_china_location=None),
 Row(recommendationid=148895540, appid=10, game='Counter-Strike', author_steamid=76561198134752176, author_num_games_owned=69, author_num_reviews=1, author_playtime_forever=12107, author_playtime_last_two_weeks=0, author_playtime_at_review=12107, author_last_played=1670487891, language='english', review='First crush. Always in my heart.', timestamp_created='1698306096', timestamp_updated='16983

In [19]:
# Peek at the first two game rows to check the inferred schema
game.take(2)

                                                                                

[Row(appid=10, name='Counter-Strike', developer='Valve', publisher='Valve', positive='235682', negative='6218', owners='10,000,000 .. 20,000,000', average_forever='8920', average_2weeks='7', median_forever=174, median_2weeks=10, price=999, initialprice=999, discount=0, ccu=14002, Afrikaans=0, Arabic=0, Armenian=0, Basque=0, Belarusian=0, Bulgarian=0, Catalan=0, Croatian=0, Czech=0, Danish=0, Dutch=0, English=1, Estonian=0, Filipino=0, Finnish=0, French=1, Galician=0, Georgian=0, German=1, Greek=0, Hebrew=0, Hindi=0, Hungarian=0, Icelandic=0, Indonesian=0, Irish=0, Italian=1, Japanese=0, Kannada=0, Kazakh=0, Korean=1, Latvian=0, Lithuanian=0, Malay=0, Maori=0, Mongolian=0, Norwegian=0, Not supported=0, Persian=0, Polish=0, Portuguese=0, Portuguese - Brazil=0, Portuguese - Portugal=0, Romanian=0, Russian=0, Serbian=0, Simplified Chinese=1, Slovak=0, Slovenian=0, Spanish=0, Spanish - Latin America=0, Spanish - Spain=1, Swedish=0, Tamil=0, Telugu=0, Thai=0, Traditional Chinese=1, Turkish=0

In [20]:
# Print the first two game rows as a table without shortening long cell values
game.show(n=2, truncate=False)

[Stage 29:>                                                         (0 + 1) / 1]

+-------+--------------+------------+--------------------+--------+--------+------------------------+---------------+--------------+--------------+-------------+-----+------------+--------+-----+---------+------+--------+------+----------+---------+-------+--------+-----+------+-----+-------+--------+--------+-------+------+--------+--------+------+-----+------+-----+---------+---------+----------+-----+-------+--------+-------+------+------+-------+----------+-----+-----+---------+---------+-------------+-------+------+----------+-------------------+---------------------+--------+-------+-------+------------------+------+---------+-------+-----------------------+---------------+-------+-----+------+----+-------------------+-------+---------+----------+------------+------------------+---------------+--------------------------+----------------------+------------+---------------------------+------------------+---------------+--------------------+--------------------+---------------------

                                                                                

In [29]:
from pyspark.sql import functions as F

# Rebuild `game` from the raw CSV so that re-running this cell always starts
# from unlabelled data, then derive the review statistics in one chain.
game = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("gs://msca-bdp-student-gcs/Group5/game_info/part-00000-4c409236-3e37-4b2d-8927-474fa4977f7b-c000.csv")
    # Treat missing 'positive' / 'negative' counts as 0
    .na.fill({"positive": 0, "negative": 0})
    # Total number of reviews = positive + negative
    .withColumn("total", F.col("positive") + F.col("negative"))
    # Share of positive reviews on a 0-100 scale; games with no reviews get 0
    # instead of a division by zero
    .withColumn(
        "positive_percentage",
        F.when(F.col("total") == 0, 0).otherwise(F.col("positive") / F.col("total") * 100),
    )
)


# Define a UDF to assign labels based on positive_percentage and total
def assign_label(positive_percentage, total):
    """Map a game's review statistics to a rating label.

    The label depends on the review volume tier (``total``) and on the share
    of positive reviews. Each band is defined by its lower bound, so fractional
    percentages such as 94.5 or 79.9 fall into the band below the next
    threshold instead of slipping through a gap between integer ranges.

    Args:
        positive_percentage: Share of positive reviews on a 0-100 scale
            (may be fractional), or None.
        total: Total number of reviews (positive + negative), or None.

    Returns:
        The rating label as a string; "No Reviews" when ``total`` is 0; or
        None when either argument is None, when ``total`` is non-zero but
        below 10, or when ``positive_percentage`` is outside [0, 100]
        (including NaN).
    """
    # Missing inputs cannot be rated
    if positive_percentage is None or total is None:
        return None
    if total == 0:
        return "No Reviews"
    # Written as a negated range check so that NaN is rejected as well
    if not (0 <= positive_percentage <= 100):
        return None

    # Each tier lists (lower bound, label) pairs in descending order; the
    # first bound that the percentage reaches determines the label.
    if total >= 500:
        # NOTE(review): unlike the 50-499 tier, this tier has no
        # "Mostly Positive" band (70-79 maps to "Mixed"). Kept as originally
        # written — confirm this is intended.
        bands = (
            (95, "Overwhelmingly Positive"),
            (80, "Very Positive"),
            (40, "Mixed"),
            (20, "Mostly Negative"),
            (0, "Overwhelmingly Negative"),
        )
    elif total >= 50:
        bands = (
            (80, "Very Positive"),
            (70, "Mostly Positive"),
            (40, "Mixed"),
            (20, "Mostly Negative"),
            (0, "Very Negative"),
        )
    elif total >= 10:
        bands = (
            (80, "Positive"),
            (70, "Mostly Positive"),
            (40, "Mixed"),
            (0, "Negative"),
        )
    else:
        # Fewer than 10 reviews (or an invalid count): not enough data to rate
        return None

    for lower_bound, label in bands:
        if positive_percentage >= lower_bound:
            return label
    return None

# Register the UDF. The return type is given as the DDL type string "string":
# StringType lives in pyspark.sql.types, and reaching it through the
# pyspark.sql.functions alias (F.StringType) only works as a side effect of
# that module's own internal imports.
label_udf = F.udf(assign_label, returnType="string")

# Add the label column, computed per row from positive_percentage and total
game = game.withColumn("label", label_udf(F.col("positive_percentage"), F.col("total")))

# Show the review statistics next to the assigned label
game.select("positive", "negative", "total", "positive_percentage", "label").show()

                                                                                

+--------+--------+--------+-------------------+--------------------+
|positive|negative|   total|positive_percentage|               label|
+--------+--------+--------+-------------------+--------------------+
|  235682|    6218|241900.0|   97.4295163290616|Overwhelmingly Po...|
|    4414|     739|  5153.0|   85.6588395109645|       Very Positive|
|    1904|     184|  2088.0|  91.18773946360153|       Very Positive|
|    1015|     498|  1513.0|  67.08526107072042|               Mixed|
|      35|      16|    51.0|  68.62745098039215|               Mixed|
|       4|       5|     9.0|  44.44444444444444|                null|
|    6073|     651|  6724.0|  90.31826293872695|       Very Positive|
|     318|      91|   409.0|  77.75061124694376|     Mostly Positive|
|    1766|     625|  2391.0|  73.86030949393559|               Mixed|
|     373|      32|   405.0|  92.09876543209876|       Very Positive|
|     227|      21|   248.0|  91.53225806451613|       Very Positive|
|    1072|     241| 

In [30]:
# Keep only games that received a real rating label: rows whose label is
# NULL or 'No Reviews' are discarded
has_rating_label = F.col("label").isNotNull() & (F.col("label") != "No Reviews")
game = game.where(has_rating_label)

In [31]:
# Preview the rows that survived the label filter
preview_columns = ["positive", "negative", "total", "positive_percentage", "label"]
game.select(*preview_columns).show()

# Report how many rows are left as a sanity check
remaining_rows = game.count()
print(f"Number of rows after dropping NULL and 'No Reviews': {remaining_rows}")

+--------+--------+--------+-------------------+--------------------+
|positive|negative|   total|positive_percentage|               label|
+--------+--------+--------+-------------------+--------------------+
|  235682|    6218|241900.0|   97.4295163290616|Overwhelmingly Po...|
|    4414|     739|  5153.0|   85.6588395109645|       Very Positive|
|    1904|     184|  2088.0|  91.18773946360153|       Very Positive|
|    1015|     498|  1513.0|  67.08526107072042|               Mixed|
|      35|      16|    51.0|  68.62745098039215|               Mixed|
|    6073|     651|  6724.0|  90.31826293872695|       Very Positive|
|     318|      91|   409.0|  77.75061124694376|     Mostly Positive|
|    1766|     625|  2391.0|  73.86030949393559|               Mixed|
|     373|      32|   405.0|  92.09876543209876|       Very Positive|
|     227|      21|   248.0|  91.53225806451613|       Very Positive|
|    1072|     241|  1313.0|  81.64508758568164|       Very Positive|
|     164|       4| 

In [32]:
# Collapse to one partition and write the labelled games as CSV with a header
# row, replacing any previous export at this location
(
    game.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("gs://msca-bdp-student-gcs/Group5/game_w_label")
)

print("Export successful!")

                                                                                

Export successful!
