In [10]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col, udf, explode, desc

# Create or get SparkSession
spark = SparkSession.builder.appName("ContentBasedFiltering").getOrCreate()

# Load the dataset
df = spark.read.csv("modified_movies.csv", header=True, inferSchema=True)


24/05/26 21:54:40 WARN Utils: Your hostname, nhatdm2k4 resolves to a loopback address: 127.0.1.1; using 192.168.1.43 instead (on interface wlo1)
24/05/26 21:54:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/26 21:54:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/26 21:54:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

In [12]:
df.show()

+---+----------+-------+--------------------+------+--------------------+----+
|_c0|Unnamed: 0|User_Id|          Movie_Name|Rating|               Genre|Year|
+---+----------+-------+--------------------+------+--------------------+----+
|  0|         0|      1|        Pulp Fiction|   5.0|Comedy|Crime|Dram...|1994|
|  1|         1|      1|Three Colors: Red...|   3.5|               Drama|1994|
|  2|         2|      1|Three Colors: Blu...|   5.0|               Drama|1993|
|  3|         3|      1|         Underground|   5.0|    Comedy|Drama|War|1995|
|  4|         4|      1| Singin' in the Rain|   3.5|Comedy|Musical|Ro...|1952|
|  5|         5|      1|       Dirty Dancing|   4.0|Drama|Musical|Rom...|1987|
|  6|         6|      1|        Delicatessen|   3.5|Comedy|Drama|Romance|1991|
|  7|         7|      1|                 Ran|   3.5|           Drama|War|1985|
|  8|         8|      1|Seventh Seal, The...|   5.0|               Drama|1957|
|  9|         9|      1|Bridge on the Riv...|   4.0|

24/05/26 21:55:09 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Unnamed: 0, User_Id, Movie_Name, Rating, Genre, Year
 Schema: _c0, Unnamed: 0, User_Id, Movie_Name, Rating, Genre, Year
Expected: _c0 but found: 
CSV file: hdfs://localhost:9000/user/nhatdm2k4/modified_movies.csv


In [14]:
from pyspark.sql.functions import split, explode

df_exploded = df.select(
    col("User_Id"), 
    col("Movie_Name"), 
    explode(split(col("Genre"), r"\|")).alias("Genre")
)

In [15]:
#Explode genres into separate rows
df_exploded.show()

+-------+--------------------+--------+
|User_Id|          Movie_Name|   Genre|
+-------+--------------------+--------+
|      1|        Pulp Fiction|  Comedy|
|      1|        Pulp Fiction|   Crime|
|      1|        Pulp Fiction|   Drama|
|      1|        Pulp Fiction|Thriller|
|      1|Three Colors: Red...|   Drama|
|      1|Three Colors: Blu...|   Drama|
|      1|         Underground|  Comedy|
|      1|         Underground|   Drama|
|      1|         Underground|     War|
|      1| Singin' in the Rain|  Comedy|
|      1| Singin' in the Rain| Musical|
|      1| Singin' in the Rain| Romance|
|      1|       Dirty Dancing|   Drama|
|      1|       Dirty Dancing| Musical|
|      1|       Dirty Dancing| Romance|
|      1|        Delicatessen|  Comedy|
|      1|        Delicatessen|   Drama|
|      1|        Delicatessen| Romance|
|      1|                 Ran|   Drama|
|      1|                 Ran|     War|
+-------+--------------------+--------+
only showing top 20 rows



In [19]:
from pyspark.sql.functions import collect_list
# 2. Group by Movie and Collect Genres:
df_grouped = df_exploded.groupBy("Movie_Name").agg(collect_list("Genre").alias("Genres"))


In [20]:
df_grouped.show()

[Stage 6:>                                                          (0 + 1) / 1]

+--------------------+--------------------+
|          Movie_Name|              Genres|
+--------------------+--------------------+
|              #Stuck|[Comedy, Drama, R...|
|        #realityhigh|[Comedy, Comedy, ...|
|             '49-'17|[Comedy, Western,...|
|                 '71|[Action, Action, ...|
|             'R Xmas|[Crime, Crime, Cr...|
|  'Til There Was You|[Drama, Drama, Ro...|
|'Tis the Season f...|[Romance, Romance...|
|     'Twas the Night|[Children, Childr...|
|'Twas the Night B...|[Animation, Anima...|
|...And the Earth ...|[(no genres listed)]|
|...tick... tick.....|[Action, Drama, A...|
|.hack Liminality ...|[(no genres listed)]|
|      009 Re: Cyborg|[Action, Animatio...|
|    1 Chance 2 Dance|[(no genres liste...|
|           1 Journée|             [Drama]|
|    1 Litre of Tears|             [Drama]|
|                  1%|      [Drama, Drama]|
|       10 000 timmar|[Comedy, Comedy, ...|
|      10 Cent Pistol|[Crime, Thriller,...|
|          10 Minutes|[(no genre

                                                                                

In [21]:
# 3. Apply CountVectorizer:
cv = CountVectorizer(inputCol="Genres", outputCol="rawFeatures") 
model = cv.fit(df_grouped)
featurizedData = model.transform(df_grouped)

                                                                                

In [23]:
featurizedData.show()



+--------------------+--------------------+--------------------+
|          Movie_Name|              Genres|         rawFeatures|
+--------------------+--------------------+--------------------+
|              #Stuck|[Comedy, Drama, R...|(20,[0,1,5],[3.0,...|
|        #realityhigh|[Comedy, Comedy, ...|     (20,[1],[18.0])|
|             '49-'17|[Comedy, Western,...|(20,[1,16],[4.0,4...|
|                 '71|[Action, Action, ...|(20,[0,2,3,13],[3...|
|             'R Xmas|[Crime, Crime, Cr...|(20,[0,7],[27.0,2...|
|  'Til There Was You|[Drama, Drama, Ro...|(20,[0,5],[475.0,...|
|'Tis the Season f...|[Romance, Romance...|     (20,[5],[11.0])|
|     'Twas the Night|[Children, Childr...|      (20,[9],[5.0])|
|'Twas the Night B...|[Animation, Anima...|(20,[8,9,12],[24....|
|...And the Earth ...|[(no genres listed)]|     (20,[19],[1.0])|
|...tick... tick.....|[Action, Drama, A...|(20,[0,2],[7.0,7.0])|
|.hack Liminality ...|[(no genres listed)]|     (20,[19],[1.0])|
|      009 Re: Cyborg|[Ac

                                                                                

In [24]:
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)


                                                                                

In [25]:
rescaledData.show()



+--------------------+--------------------+--------------------+--------------------+
|          Movie_Name|              Genres|         rawFeatures|            features|
+--------------------+--------------------+--------------------+--------------------+
|              #Stuck|[Comedy, Drama, R...|(20,[0,1,5],[3.0,...|(20,[0,1,5],[2.59...|
|        #realityhigh|[Comedy, Comedy, ...|     (20,[1],[18.0])|(20,[1],[22.67835...|
|             '49-'17|[Comedy, Western,...|(20,[1,16],[4.0,4...|(20,[1,16],[5.039...|
|                 '71|[Action, Action, ...|(20,[0,2,3,13],[3...|(20,[0,2,3,13],[2...|
|             'R Xmas|[Crime, Crime, Cr...|(20,[0,7],[27.0,2...|(20,[0,7],[23.381...|
|  'Til There Was You|[Drama, Drama, Ro...|(20,[0,5],[475.0,...|(20,[0,5],[411.34...|
|'Tis the Season f...|[Romance, Romance...|     (20,[5],[11.0])|(20,[5],[22.63031...|
|     'Twas the Night|[Children, Childr...|      (20,[9],[5.0])|(20,[9],[14.97307...|
|'Twas the Night B...|[Animation, Anima...|(20,[8,9,12

                                                                                

In [26]:
def cosine_similarity(v1, v2):
    return float(v1.dot(v2) / (v1.norm(2) * v2.norm(2)))

cosine_similarity_udf = udf(cosine_similarity)

# Calculate similarity between movies
movie_pairs = rescaledData.alias("i").join(
    rescaledData.alias("j"), col("i.Movie_Name") < col("j.Movie_Name")
).select(
    col("i.Movie_Name").alias("movie1"),
    col("j.Movie_Name").alias("movie2"),
    cosine_similarity_udf(col("i.features"), col("j.features")).alias("similarity")
)

# Get recommendations based on user's watched movies
user_id = 1  
watched_movies = df.filter(col("User_Id") == user_id).select("Movie_Name").rdd.flatMap(lambda x: x).collect()

recommendations = movie_pairs.filter(
    (col("movie1").isin(watched_movies)) | (col("movie2").isin(watched_movies))
).sort(desc("similarity"))

#Show recommendations:
recommendations.show()




+--------------------+--------------------+--------------------+
|              movie1|              movie2|          similarity|
+--------------------+--------------------+--------------------+
|            Chocolat|               Kenny|9.998180365258422E-4|
|       Dirty Dancing|               House|9.997434257065828E-4|
|          The Guilty|         Underground|9.996743847901846E-4|
|            Chocolat|     Kiss Me Goodbye|9.993493658537972E-4|
|Fanny and Alexand...|                Fear|9.992990437192216E-5|
|Bridge on the Riv...|           Dead Heat| 9.99286559551358E-4|
|            Chocolat|            Splendor|9.990127999092737E-4|
|       Dirty Dancing|                 Pan|9.987154880592904E-5|
|             My Girl|NeverEnding Story...|9.983789130123886E-4|
|            Chocolat|     Little Monsters|9.980520477442072E-4|
|Bad Education (La...|      My Blue Heaven| 9.97938881797442E-4|
|Barbarian Invasio...|            Iron Man|9.977944928804664E-6|
|               Akira|   

                                                                                