In [38]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import from_unixtime
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer, IndexToString
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
from pyspark.sql.functions import col, sum as _sum
from pyspark.ml import Pipeline

from pyspark.ml.feature import Tokenizer, NGram, HashingTF, IDF, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import length, when, col


In [3]:
from pyspark.sql import SparkSession

# Session settings applied to the builder below: the default filesystem URI,
# driver/executor memory, the shuffle partition count and the Arrow flag.
spark_settings = {
    "spark.hadoop.fs.defaultFS": "hdfs://localhost:9000",
    "spark.driver.memory": "4g",
    "spark.executor.memory": "4g",
    "spark.sql.shuffle.partitions": "100",
    "spark.sql.execution.arrow.pyspark.enabled": "true",
}

# Apply the settings one by one (in the order listed), then create the
# session or reuse one that already exists.
session_builder = SparkSession.builder.appName("SteamReviewsHDFS")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/28 13:10:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/28 13:10:35 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/04/28 13:10:35 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/04/28 13:10:35 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [5]:
# Read the steam_review_english Parquet dataset into a Spark DataFrame.
reviews_parquet_path = "/user/tejashree/project/steam_review_english.parquet"
english_df = spark.read.parquet(reviews_parquet_path)


In [11]:
# Print the first five rows with untruncated cell contents, then the schema.
english_df.show(n=5, truncate=False)
english_df.printSchema()


+-------+------+------------+---------+--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+-------------------+-----------+-------------+-----------+-------------------+-------------+--------------+-----------------+---------------------------+-----------------+----------------------+------------------+-----------------------+------------------------------+-------------------------+-------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|_c0    |app_id|app_name    |review_id|language|review                                                                                                                                    

### Remove very short reviews

In [18]:
from pyspark.sql.functions import length

# Keep only rows whose cleaned review text is longer than five characters.
is_long_enough = length("cleaned_review") > 5
english_df = english_df.where(is_long_enough)


In [22]:

# Build the numeric "label" column: 1.0 where `recommended` equals True,
# 0.0 for every other row (anything that does not match the condition falls
# into the otherwise() branch).
is_recommended = col("recommended") == True
label_column = when(is_recommended, 1.0).otherwise(0.0)
english_df = english_df.withColumn("label", label_column)



In [24]:
# Split each cleaned review into word tokens.
tokenizer = Tokenizer(outputCol="words", inputCol="cleaned_review")

# Hash the tokens into a 10,000-slot term-frequency vector, then rescale it
# with IDF to produce the "features" column.
hashingTF = HashingTF(numFeatures=10000, inputCol="words", outputCol="rawFeatures")
idf = IDF(outputCol="features", inputCol="rawFeatures")

# Logistic regression on "features", trained against "label".
lr = LogisticRegression(labelCol="label", featuresCol="features")

# Chain the stages in execution order.
pipeline_stages = [tokenizer, hashingTF, idf, lr]
pipeline = Pipeline(stages=pipeline_stages)


In [26]:

# 80/20 random split of the labelled reviews, using seed 42.
split_weights = [0.8, 0.2]
train_data, test_data = english_df.randomSplit(split_weights, seed=42)

# Fit the pipeline on the training part and score the held-out part.
model = pipeline.fit(train_data)
predictions = model.transform(test_data)

# Area under the ROC curve, computed from the rawPrediction column.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    rawPredictionCol="rawPrediction",
    labelCol="label",
)
roc_auc = evaluator.evaluate(predictions)
print(f"✅ ROC-AUC Score: {roc_auc:.4f}")

# Show ten scored rows with untruncated text.
preview_columns = ["cleaned_review", "label", "prediction", "probability"]
predictions.select(*preview_columns).show(10, truncate=False)


25/04/28 13:30:33 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

✅ ROC-AUC Score: 0.8973


[Stage 224:>                                                        (0 + 1) / 1]

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+----------+------------------------------------------+
|cleaned_review                                                                                                                                                        

                                                                                

In [28]:
# Persist the fitted pipeline. A plain `model.save(path)` raises when the
# target path already exists, so re-running this cell (or the whole notebook)
# would fail. Going through the writer with overwrite() replaces the
# previously saved model instead.
model.write().overwrite().save("/user/tejashree/project/sentiment_model")


In [30]:
from pyspark.ml.pipeline import PipelineModel

# Rebind `model` to the pipeline stored at the sentiment_model path.
sentiment_model_path = "/user/tejashree/project/sentiment_model"
model = PipelineModel.load(sentiment_model_path)


In [32]:
from pyspark.sql import Row

# Sample text to score.
new_review_text = "not so happy with the game I've  played!"

# Wrap it in a one-row DataFrame whose only column is "cleaned_review",
# the input column the pipeline's tokenizer reads.
new_review_df = spark.createDataFrame([(new_review_text,)], ["cleaned_review"])


In [34]:
# Run the one-row DataFrame through the loaded pipeline.
prediction_result = model.transform(new_review_df)

# Display the text alongside the predicted class and class probabilities.
shown_columns = ["cleaned_review", "prediction", "probability"]
prediction_result.select(*shown_columns).show(truncate=False)


+----------------------------------------+----------+----------------------------------------+
|cleaned_review                          |prediction|probability                             |
+----------------------------------------+----------+----------------------------------------+
|not so happy with the game I've  played!|1.0       |[0.08431177295831398,0.9156882270416861]|
+----------------------------------------+----------+----------------------------------------+



In [40]:

# Stage 1: split each cleaned review into word tokens.
tokenizer = Tokenizer(outputCol="words", inputCol="cleaned_review")

# Stage 2: build 2-grams from the token sequence.
bigram = NGram(inputCol="words", outputCol="bigrams", n=2)

# Stages 3-4: hash unigrams and bigrams into separate 10,000-slot
# term-frequency vectors.
hashingTF_unigrams = HashingTF(numFeatures=10000, inputCol="words", outputCol="unigramFeatures")
hashingTF_bigrams = HashingTF(numFeatures=10000, inputCol="bigrams", outputCol="bigramFeatures")

# Stage 5: concatenate both vectors into a single "rawFeatures" column.
assembler = VectorAssembler(
    outputCol="rawFeatures",
    inputCols=["unigramFeatures", "bigramFeatures"],
)

# Stage 6: IDF rescaling of the combined vector.
idf = IDF(outputCol="features", inputCol="rawFeatures")

# Stage 7: logistic regression on "features", trained against "label".
lr = LogisticRegression(labelCol="label", featuresCol="features")

# Chain the stages in execution order.
bigram_stages = [
    tokenizer,
    bigram,
    hashingTF_unigrams,
    hashingTF_bigrams,
    assembler,
    idf,
    lr,
]
pipeline = Pipeline(stages=bigram_stages)

# 80/20 random split of the labelled reviews, using seed 42.
split_weights = [0.8, 0.2]
train_data, test_data = english_df.randomSplit(split_weights, seed=42)

# Fit the pipeline on the training part and score the held-out part.
model = pipeline.fit(train_data)
predictions = model.transform(test_data)

# Area under the ROC curve, computed from the rawPrediction column.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    rawPredictionCol="rawPrediction",
    labelCol="label",
)
roc_auc = evaluator.evaluate(predictions)
print(f"✅ ROC-AUC Score after Bigrams: {roc_auc:.4f}")

# Show ten scored rows with untruncated text.
preview_columns = ["cleaned_review", "label", "prediction", "probability"]
predictions.select(*preview_columns).show(10, truncate=False)


25/04/28 17:18:24 WARN DAGScheduler: Broadcasting large task binary with size 1328.3 KiB
25/04/28 17:19:01 WARN DAGScheduler: Broadcasting large task binary with size 1329.3 KiB
25/04/28 17:19:02 WARN DAGScheduler: Broadcasting large task binary with size 1674.4 KiB
25/04/28 17:19:44 WARN DAGScheduler: Broadcasting large task binary with size 1675.6 KiB
25/04/28 17:19:44 WARN DAGScheduler: Broadcasting large task binary with size 1675.0 KiB
25/04/28 17:20:02 WARN MemoryStore: Not enough space to cache rdd_587_5 in memory! (computed 113.1 MiB so far)
25/04/28 17:20:02 WARN BlockManager: Persisting block rdd_587_5 to disk instead.
25/04/28 17:20:03 WARN MemoryStore: Not enough space to cache rdd_587_1 in memory! (computed 113.1 MiB so far)
25/04/28 17:20:03 WARN BlockManager: Persisting block rdd_587_1 to disk instead.
25/04/28 17:20:03 WARN MemoryStore: Not enough space to cache rdd_587_0 in memory! (computed 113.1 MiB so far)
25/04/28 17:20:03 WARN BlockManager: Persisting block rdd_58

✅ ROC-AUC Score after Bigrams: 0.9131


25/04/28 17:21:30 WARN DAGScheduler: Broadcasting large task binary with size 1822.5 KiB
[Stage 474:>                                                        (0 + 1) / 1]

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+----------+------------------------------------------+
|cleaned_review                                                                                                                                                        

                                                                                

In [42]:
# Persist the fitted bigram pipeline. A plain `model.save(path)` raises when
# the target path already exists, so re-running this cell (or the whole
# notebook) would fail. Going through the writer with overwrite() replaces
# the previously saved model instead.
model.write().overwrite().save("/user/tejashree/project/sentiment_model_with_bigrams")


                                                                                

In [48]:
from pyspark.ml.pipeline import PipelineModel

# Rebind `model` to the pipeline stored at the bigram model path.
bigram_model_path = "/user/tejashree/project/sentiment_model_with_bigrams"
model = PipelineModel.load(bigram_model_path)
from pyspark.sql import Row

# Sample text to score.
new_review_text = "not  happy with the game I've  played!"

# Wrap it in a one-row DataFrame whose only column is "cleaned_review",
# then run it through the loaded pipeline.
new_review_df = spark.createDataFrame([(new_review_text,)], ["cleaned_review"])
prediction_result = model.transform(new_review_df)

# Display the text alongside the predicted class and class probabilities.
shown_columns = ["cleaned_review", "prediction", "probability"]
prediction_result.select(*shown_columns).show(truncate=False)


+--------------------------------------+----------+----------------------------------------+
|cleaned_review                        |prediction|probability                             |
+--------------------------------------+----------+----------------------------------------+
|not  happy with the game I've  played!|1.0       |[0.10846962727216483,0.8915303727278352]|
+--------------------------------------+----------+----------------------------------------+



In [50]:
from pyspark.sql import Row
from pyspark.ml.pipeline import PipelineModel

# Bind the notebook-level `model` to the saved bigram pipeline;
# predict_review below looks this name up when it is called.
bigram_model_path = "/user/tejashree/project/sentiment_model_with_bigrams"
model = PipelineModel.load(bigram_model_path)

# 2. Define the prediction function
def predict_review(text: str, pipeline_model=None) -> tuple:
    """Score one review with a fitted sentiment pipeline and print the result.

    Args:
        text: Review text. It is placed as-is in the ``cleaned_review``
            column that the pipeline reads; no cleaning is applied here.
        pipeline_model: Optional fitted pipeline to score with. When omitted,
            the notebook-level ``model`` is looked up at call time, which is
            the original behavior of this helper.

    Returns:
        ``(sentiment, probability_positive)`` where ``sentiment`` is
        ``"Positive"`` or ``"Negative"`` and ``probability_positive`` is
        element 1 of the pipeline's ``probability`` vector.
    """
    # `model` is rebound by several cells in this notebook, so the result of
    # a call used to depend on which cell ran last. Callers can now pass the
    # pipeline explicitly; the global remains the default.
    scoring_model = model if pipeline_model is None else pipeline_model

    # Create a one-row Spark DataFrame from the input text
    new_review_df = spark.createDataFrame([Row(cleaned_review=text)])

    # Score it and pull the single result row back to the driver
    prediction_result = scoring_model.transform(new_review_df)
    row = prediction_result.select("prediction", "probability").head()

    prediction = int(row.prediction)  # 0 or 1
    probability_negative = row.probability[0]  # probability vector, index 0
    probability_positive = row.probability[1]  # probability vector, index 1

    # Print result
    sentiment = "Positive" if prediction == 1 else "Negative"
    print(f"✅ Review Sentiment: {sentiment}")
    print(f"📈 Confidence Score: {probability_positive*100:.2f}% (Positive)")
    print(f"📉 Confidence Score: {probability_negative*100:.2f}% (Negative)")

    return sentiment, probability_positive



In [66]:
# Try the helper on a short negated review; the returned tuple is displayed
# as the cell's output.
predict_review(text="it wasnt good.")

✅ Review Sentiment: Positive
📈 Confidence Score: 87.31% (Positive)
📉 Confidence Score: 12.69% (Negative)


('Positive', 0.8730798504328429)

In [64]:
!pip install transformers
!pip install torch


Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m67.0 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.4/481.4 kB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl (418 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━