1. Load a Dataset into a PySpark DataFrame

In [2]:
from pyspark.sql import SparkSession

# Location of the ratings CSV (point this at wherever your copy of the dataset lives)
dataset_path = "ratings.csv"

# Start the Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("Recommendation System")
    .getOrCreate()
)

# Read the CSV: the first row supplies the column names and Spark infers the column types
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(dataset_path)
)

# Preview the leading rows, then list the inferred column types
df.show()
df.printSchema()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/31 14:37:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      2|   3.0|964981247|
|     1|      3|   5.0|964982224|
|     2|      1|   5.0|964982821|
|     2|      2|   4.0|964983034|
|     2|      4|   2.0|964982564|
|     3|      2|   4.0|964982274|
|     3|      3|   3.0|964982304|
|     3|      4|   5.0|964982134|
|     4|      1|   2.0|964983234|
|     4|      3|   4.0|964982923|
|     4|      5|   3.0|964982891|
|     5|      2|   3.0|964982634|
|     5|      4|   4.0|964983456|
|     5|      6|   5.0|964983567|
|     6|      1|   4.0|964982765|
|     6|      3|   2.0|964982876|
|     6|      5|   3.0|964982987|
|     7|      2|   5.0|964983098|
|     7|      4|   3.0|964983209|
+------+-------+------+---------+
only showing top 20 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

2. Split the Data and Train a Recommendation Model

In [6]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator

# Reuse the notebook's Spark session; getOrCreate() returns the already-running
# session when one exists, so this does not start a second one.
spark = SparkSession.builder.appName("Recommendation System").getOrCreate()

# 'df' is the ratings DataFrame loaded in section 1.
# The ALS configuration below reads its 'userId', 'movieId' and 'rating' columns.

# Split the data into training (80%) and test (20%) sets.
# A fixed seed keeps the split identical across "Restart & Run All",
# so the evaluation numbers further down are reproducible.
(training_data, test_data) = df.randomSplit([0.8, 0.2], seed=42)

# Initialize the ALS model
als = ALS(
    maxIter=10,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    # Drop rows whose prediction is NaN (users/items absent from the training
    # split) so they do not turn the evaluation metric into NaN.
    coldStartStrategy="drop",
    # Fixed seed: ALS initializes its factor matrices randomly.
    seed=42,
)

# Train the model
model = als.fit(training_data)

# Make predictions on the held-out test set
predictions = model.transform(test_data)


3. Implement Collaborative Filtering with ALS

In [9]:
from pyspark.ml.recommendation import ALS

# Requires 'training_data' produced by the train/test split in section 2.

# Where the fitted model is persisted, and how many recommendations to generate per user/item
MODEL_PATH = "model_pred"
TOP_N = 10

# Initialize the ALS model
als = ALS(
    maxIter=10,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    # Fixed seed: ALS initializes its factor matrices randomly, so without it
    # the recommendations change on every run.
    seed=42,
)

# Train the model
model = als.fit(training_data)

# Persist the model. A plain model.save(path) raises if the path already
# exists, which breaks every re-run of this cell; overwrite() makes it idempotent.
model.write().overwrite().save(MODEL_PATH)

# Generate the top-N recommendations for every user and for every item
user_recommendations = model.recommendForAllUsers(TOP_N)
item_recommendations = model.recommendForAllItems(TOP_N)

# Show the per-user recommendations (show() prints at most 20 rows by default)
user_recommendations.show()


24/08/31 14:43:10 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
24/08/31 14:43:10 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
24/08/31 14:43:10 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
24/08/31 14:43:10 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers


+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    10|[{3, 2.9968333}, ...|
|     1|[{1, 3.997205}, {...|
|     2|[{1, 4.995301}, {...|
|     3|[{4, 4.9967003}, ...|
|     4|[{3, 3.9956493}, ...|
|     5|[{6, 4.995433}, {...|
|     6|[{5, 2.990452}, {...|
|     7|[{2, 4.9963336}, ...|
|     8|[{7, 3.9913669}, ...|
|     9|[{4, 4.995452}, {...|
+------+--------------------+



4. Evaluate the Performance of the Recommendation Model

In [10]:
from pyspark.ml.evaluation import RegressionEvaluator

# Requires the 'predictions' DataFrame built by model.transform(test_data) in section 2.

# Evaluator comparing the actual "rating" column with the model's "prediction" column
evaluator = RegressionEvaluator(labelCol="rating", predictionCol="prediction", metricName="rmse")

# Root mean squared error over the predictions
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) = {rmse}")


Root Mean Squared Error (RMSE) = 2.9817852977302564
