In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import from_unixtime
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer, IndexToString
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
from pyspark.sql.functions import col, sum as _sum
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used by every cell below.
# Written as a parenthesized chain so no backslash continuations are needed.
spark = (
    SparkSession.builder
    .appName("SteamReviewsHDFStransalte")
    # Default filesystem for Hadoop paths.
    .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000")
    # Driver and executor memory settings.
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    # Shuffle partition count for Spark SQL operations.
    .config("spark.sql.shuffle.partitions", "100")
    # Arrow setting for PySpark.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/09 18:45:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/09 18:45:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/09 18:45:08 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:

# Read the batched sentiment output and attach two 0/1 indicator columns
# derived from `sentiment_label`, so later aggregations can work on numbers.
# Rows whose label is anything other than the matched value get 0.
df = (
    spark.read.parquet("/user/tejashree/project/outputs/steam_sentiment_final_batched.parquet")
    .withColumn("is_positive", when(col("sentiment_label") == "POSITIVE", 1).otherwise(0))
    .withColumn("is_negative", when(col("sentiment_label") == "NEGATIVE", 1).otherwise(0))
)

In [4]:

# Per-game sentiment summary: one row per (app_id, app_name) pair holding the
# mean sentiment score, the review counts, and the positive/negative shares
# expressed as percentages of all reviews for that game.
summary = (
    df.groupBy("app_id", "app_name")
    .agg(
        avg("sentiment_score").alias("avg_sentiment_score"),
        count("*").alias("total_reviews"),
        # `when` without `otherwise` yields null for non-matching rows, and
        # `count` skips nulls, so each of these counts only the flagged reviews.
        count(when(col("is_positive") == 1, True)).alias("positive_reviews"),
        count(when(col("is_negative") == 1, True)).alias("negative_reviews"),
    )
    .withColumn("percent_positive", col("positive_reviews") / col("total_reviews") * 100)
    .withColumn("percent_negative", col("negative_reviews") / col("total_reviews") * 100)
)


In [5]:
# Preview five rows of the per-game summary; truncate=False prints full cell values.
summary.show(n=5, truncate=False)

                                                                                

+------+--------------------------+-------------------+-------------+----------------+----------------+------------------+------------------+
|app_id|app_name                  |avg_sentiment_score|total_reviews|positive_reviews|negative_reviews|percent_positive  |percent_negative  |
+------+--------------------------+-------------------+-------------+----------------+----------------+------------------+------------------+
|8930  |Sid Meier's Civilization V|0.9680371623198893 |104090       |73808           |30282           |70.90786819098857 |29.092131809011434|
|4000  |Garry's Mod               |0.9697753596000136 |331060       |240874          |90186           |72.75841237237964 |27.241587627620373|
|420   |Half-Life 2: Episode Two  |0.9628003750171743 |11453        |7029            |4424            |61.372566139876014|38.627433860123986|
|381210|Dead by Daylight          |0.9681665309390616 |143772       |89856           |53916           |62.49895668141223 |37.50104331858776 |
|2870 

In [6]:
# Spot-check a single title: print the summary row(s) whose app_name is exactly "Hades".
(
    summary
    .where(col("app_name") == "Hades")
    .show(n=5, truncate=False)
)


+-------+--------+-------------------+-------------+----------------+----------------+-----------------+------------------+
|app_id |app_name|avg_sentiment_score|total_reviews|positive_reviews|negative_reviews|percent_positive |percent_negative  |
+-------+--------+-------------------+-------------+----------------+----------------+-----------------+------------------+
|1145360|Hades   |0.976708369881619  |65629        |54337           |11292           |82.79419159213153|17.205808407868474|
+-------+--------+-------------------+-------------+----------------+----------------+-----------------+------------------+



In [7]:
# Save summary table
# The destination is defined once so the confirmation message below always
# reports the path that was actually written (previously the literal was
# duplicated and could drift).
summary_output_path = "/user/tejashree/project/data/processed/steam_sentiment_summary.parquet"

# mode("overwrite") replaces any existing output at this path, so the cell can be re-run.
summary.write.mode("overwrite").parquet(summary_output_path)

print(f"✅ Sentiment summary saved to {summary_output_path}")

✅ Sentiment summary saved to /user/tejashree/project/data/processed/steam_sentiment_summary.parquet
