1. Load the Dataset into a PySpark DataFrame

In [1]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Load Dataset") \
    .getOrCreate()

# Load the dataset
df = spark.read.csv("reviews_2.csv", header=True, inferSchema=True)

# Show the first few rows
df.show()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/31 15:05:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


+-------+----------+--------------------+-------------+------------+-----------+----------+-----+
|user_id|product_id|              review|      summary|profile_name|helpfulness|      time|score|
+-------+----------+--------------------+-------------+------------+-----------+----------+-----+
|      1|       101|Great product, hi...|       Great!|       Alice|         10|2024-01-01|    5|
|      1|       102|Not what I expected.|Disappointing|       Alice|          2|2024-01-02|    2|
|      2|       101|Good value for th...|        Value|         Bob|          5|2024-01-03|    4|
|      2|       103|The product broke...|      Unhappy|         Bob|          1|2024-01-04|    1|
|      3|       104|Excellent quality...|      Quality|     Charlie|          8|2024-01-05|    5|
|      3|       105|It does the job b...|         Okay|     Charlie|          4|2024-01-06|    3|
|      4|       101|Loved it, but a b...|    Expensive|       David|          6|2024-01-07|    4|
|      4|       102|

2. Split Data and Train a Recommendation Model

In [2]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Train Recommendation Model") \
    .getOrCreate()

# Load the dataset
df = spark.read.csv("reviews_2.csv", header=True, inferSchema=True)

# Prepare the data for training
df = df.select(col("user_id").alias("user"), col("product_id").alias("item"), col("score").alias("rating"))

# Split the data into training and test sets
(training_data, test_data) = df.randomSplit([0.8, 0.2])

# Initialize the ALS model
als = ALS(userCol="user", itemCol="item", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training_data)

# Save the model (optional)
model.save("als_model")


24/08/31 15:06:19 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
24/08/31 15:06:21 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
24/08/31 15:06:21 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
24/08/31 15:06:21 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 75.08% for 9 writers
24/08/31 15:06:21 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 67.58% for 10 writers
24/08/31 15:06:21 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 75.08% for 9 writers
24/08/31 15:06:21 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling r

3. Implement ALS Algorithm for Collaborative Filtering

In [3]:
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder \
    .appName("ALS Model") \
    .getOrCreate()

# Load the dataset
df = spark.read.csv("reviews_2.csv", header=True, inferSchema=True)

# Prepare the data for training
df = df.select(col("user_id").alias("user"), col("product_id").alias("item"), col("score").alias("rating"))

# Split the data into training and test sets
(training_data, test_data) = df.randomSplit([0.8, 0.2])

# Initialize the ALS model
als = ALS(userCol="user", itemCol="item", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training_data)

# Save the model (optional)
model.save("als_model")


24/08/31 15:07:17 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Py4JJavaError: An error occurred while calling o238.save.
: java.io.IOException: Path als_model already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:683)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)


In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, desc

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Recommendation System") \
    .getOrCreate()

# Load the dataset
df = spark.read.csv("reviews_2.csv", header=True, inferSchema=True)

# Prepare the data for training
df = df.select(col("user_id").alias("user"), col("product_id").alias("item"), col("score").alias("rating"))

# Split the data into training and test sets
(training_data, test_data) = df.randomSplit([0.8, 0.2])

# Initialize and train the ALS model
als = ALS(userCol="user", itemCol="item", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training_data)

# Save the model with overwrite option
model.write().overwrite().save("als_model")

# Load the model (if needed)
loaded_model = ALS.load("als_model")

# Make predictions
predictions = loaded_model.transform(test_data)

# Evaluate the model
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

# Additional analysis: Get top rated and worst rated movies
top_rated_movies = df.filter(col("score") == 5).groupBy("item").count().orderBy(desc("count"))
worst_rated_movies = df.filter(col("score") == 1).groupBy("item").count().orderBy("count")

top_rated_movies.show()
worst_rated_movies.show()


24/08/31 15:10:39 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Py4JJavaError: An error occurred while calling o690.load.
: java.lang.NoSuchMethodException: org.apache.spark.ml.recommendation.ALSModel.<init>(java.lang.String)
	at java.lang.Class.getConstructor0(Class.java:3082)
	at java.lang.Class.getConstructor(Class.java:1825)
	at org.apache.spark.ml.util.DefaultParamsReader.load(ReadWrite.scala:468)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)


4. Evaluate the Recommendation Model

In [4]:
from pyspark.ml.evaluation import RegressionEvaluator

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Evaluate ALS Model") \
    .getOrCreate()

# Load the dataset
df = spark.read.csv("reviews_2.csv", header=True, inferSchema=True)

# Prepare the data for training
df = df.select(col("user_id").alias("user"), col("product_id").alias("item"), col("score").alias("rating"))

# Split the data into training and test sets
(training_data, test_data) = df.randomSplit([0.8, 0.2])

# Load the model (or re-train if not saved)
model = ALS(userCol="user", itemCol="item", ratingCol="rating", coldStartStrategy="drop").fit(training_data)

# Make predictions
predictions = model.transform(test_data)

# Evaluate the model
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")


24/08/31 15:07:56 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Root-mean-square error = 3.5585605401532687


Additional Analysis and Metrics

In [5]:
from pyspark.sql.functions import desc
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Additional Analysis") \
    .getOrCreate()

# Load the dataset
df = spark.read.csv("reviews_2.csv", header=True, inferSchema=True)

# Prepare data
df_reviews = df.select(col("review"), col("score"))

# Tokenize reviews
tokenizer = Tokenizer(inputCol="review", outputCol="words")
words_data = tokenizer.transform(df_reviews)

# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filtered_data = remover.transform(words_data)

# Calculate word frequencies
cv = CountVectorizer(inputCol="filtered", outputCol="features")
cv_model = cv.fit(filtered_data)
vector_df = cv_model.transform(filtered_data)

# Calculate TF-IDF
idf = IDF(inputCol="features", outputCol="tfidf")
idf_model = idf.fit(vector_df)
rescaled_data = idf_model.transform(vector_df)

# Get top rated and lowest rated movies
top_rated_movies = df.filter(col("score") == 5).groupBy("product_id").count().orderBy(desc("count"))
worst_rated_movies = df.filter(col("score") == 1).groupBy("product_id").count().orderBy("count")

top_rated_movies.show()
worst_rated_movies.show()

# Print information
total_reviews = df.count()
total_movies = df.select("product_id").distinct().count()
total_users = df.select("user_id").distinct().count()

print(f"Total reviews: {total_reviews}")
print(f"Total movies: {total_movies}")
print(f"Total users: {total_users}")

# Calculate word frequencies and sort
word_counts = rescaled_data.select(col("filtered").alias("words"), col("tfidf")).rdd.flatMap(lambda x: x[0]).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y).toDF(["word", "count"])

# You may need to adjust based on whether you want to categorize words as 'good' or 'bad'
# For demonstration, sorting all word frequencies
word_counts.orderBy(desc("count")).show()


24/08/31 15:08:31 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

+----------+-----+
|product_id|count|
+----------+-----+
|       101|    2|
|       105|    2|
|       104|    2|
|       103|    1|
|       111|    1|
|       117|    1|
|       113|    1|
|       121|    1|
|       109|    1|
+----------+-----+

+----------+-----+
|product_id|count|
+----------+-----+
|       101|    1|
|       103|    1|
|       105|    1|
|       118|    1|
+----------+-----+

Total reviews: 40
Total movies: 22
Total users: 20


  from pandas.core import (
[Stage 302:>                                                        (0 + 1) / 1]

+-------------+-----+
|         word|count|
+-------------+-----+
|      product|    5|
|        great|    3|
|       highly|    3|
|        worth|    3|
|expectations.|    3|
|    recommend|    3|
|     product,|    2|
|         good|    2|
|        value|    2|
|    excellent|    2|
|     quality,|    2|
|          buy|    2|
|       again.|    2|
|       price.|    2|
|     exceeded|    2|
|        okay,|    2|
|        every|    2|
|        good,|    2|
|       great.|    2|
|        okay.|    2|
+-------------+-----+
only showing top 20 rows



                                                                                