In [1]:
import findspark
import sys

In [2]:
# NOTE(review): presumably this has to run before the `pyspark` imports in the
# next cell so that the pyspark package can be found — confirm against the
# findspark documentation.
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType

from pyspark.sql import functions as func

In [4]:
# Create the session for this notebook, or reuse one that already exists
# (getOrCreate).
spark = (
    SparkSession.builder
    .appName("MovieSimilarities")
    .getOrCreate()
)
# Bare last expression so the notebook displays the session.
spark

23/11/10 23:36:40 WARN Utils: Your hostname, bagjunhyeog-ui-noteubug.local resolves to a loopback address: 127.0.0.1; using 172.30.1.11 instead (on interface en0)
23/11/10 23:36:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/10 23:36:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Change the log level from the default "WARN" reported at start-up to "error".
spark.sparkContext.setLogLevel("error")

In [6]:
# Schema applied when reading u.item: an integer movie ID and a string title,
# both nullable.
movie_names_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("movieID", IntegerType()),
        ("movieTitle", StringType()),
    )
])

In [7]:
# Schema applied when reading u.data: one rating per row, every field nullable.
movies_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("userID", IntegerType()),
        ("movieID", IntegerType()),
        ("rating", IntegerType()),
        ("timestamp", LongType()),
    )
])

In [8]:
# Movie ID -> title table: pipe-separated file, decoded as ISO-8859-1 and
# parsed with movie_names_schema.
# NOTE(review): the path is absolute and machine-specific; consider making the
# data directory configurable.
movie_names = (
    spark.read
    .option("charset", "ISO-8859-1")
    .csv(
        "file:///Users/hotamul/SparkProjects/MovieRating/ml-100k/u.item",
        schema=movie_names_schema,
        sep="|",
    )
)

In [9]:
# Ratings table: tab-separated file parsed with movies_schema.
# NOTE(review): the path is absolute and machine-specific; consider making the
# data directory configurable.
movies = spark.read.csv(
    "file:///Users/hotamul/SparkProjects/MovieRating/ml-100k/u.data",
    schema=movies_schema,
    sep="\t",
)


In [10]:
# Keep only the columns the similarity computation needs (timestamp is dropped).
# Names are spelled exactly as declared in movies_schema ("userID", "movieID")
# rather than relying on Spark's default case-insensitive column resolution.
ratings = movies.select("userID", "movieID", "rating")

In [11]:
# Self-join the ratings on user to obtain every pair of movies rated by the
# same user. The strict "<" on the movie IDs keeps each unordered pair once and
# excludes a movie paired with itself.
# Column references use the casing declared in movies_schema (userID / movieID).
movie_pairs = (
    ratings.alias("ratings1")
    .join(
        ratings.alias("ratings2"),
        (func.col("ratings1.userID") == func.col("ratings2.userID"))
        & (func.col("ratings1.movieID") < func.col("ratings2.movieID")),
    )
    .select(
        func.col("ratings1.movieID").alias("movie1"),
        func.col("ratings2.movieID").alias("movie2"),
        func.col("ratings1.rating").alias("rating1"),
        func.col("ratings2.rating").alias("rating2"),
    )
)


In [12]:
def compute_cosine_similarity(spark, data):
    """Score each (movie1, movie2) pair by the cosine similarity of its ratings.

    Args:
        spark: Not used inside this function.
        data: DataFrame with ``movie1``, ``movie2``, ``rating1`` and
            ``rating2`` columns.

    Returns:
        DataFrame with one row per (movie1, movie2) group and the columns
        ``movie1``, ``movie2``, ``score`` and ``numPairs``. ``score`` is
        sum(rating1 * rating2) / (sqrt(sum(rating1^2)) * sqrt(sum(rating2^2))),
        or 0 when that denominator is 0; ``numPairs`` is the count of rows in
        the group with a non-null product.
    """
    rating1 = func.col("rating1")
    rating2 = func.col("rating2")

    # Per-row terms: xy feeds the dot product, xx and yy feed the two norms.
    products = (
        data
        .withColumn("xx", rating1 * rating1)
        .withColumn("yy", rating2 * rating2)
        .withColumn("xy", rating1 * rating2)
    )

    # Aggregates evaluated once per movie pair.
    dot_product = func.sum(func.col("xy")).alias("numerator")
    norm_product = (
        func.sqrt(func.sum(func.col("xx"))) * func.sqrt(func.sum(func.col("yy")))
    ).alias("denominator")
    pair_count = func.count(func.col("xy")).alias("numPairs")

    per_pair = products.groupBy("movie1", "movie2").agg(
        dot_product, norm_product, pair_count
    )

    # Avoid dividing by zero: a zero denominator yields a score of 0.
    score = func.when(
        func.col("denominator") != 0,
        func.col("numerator") / func.col("denominator"),
    ).otherwise(0)

    return per_pair.withColumn("score", score).select(
        "movie1", "movie2", "score", "numPairs"
    )


In [13]:
# Similarity table for all movie pairs, marked for caching so later queries can
# reuse it.
movie_pair_similarities = (
    compute_cosine_similarity(spark, movie_pairs)
    .cache()
)

In [14]:
def get_movie_name(movie_names, movie_id):
    """Return the title stored for ``movie_id``.

    Args:
        movie_names: DataFrame with ``movieID`` and ``movieTitle`` columns.
        movie_id: ID to look up in the ``movieID`` column.

    Returns:
        The ``movieTitle`` value of the first matching row.

    Raises:
        IndexError: If no row has ``movieID == movie_id``. The exception type
            is the one the previous ``collect()[0]`` implementation raised,
            but the message now names the missing ID.
    """
    # first() returns a single row (or None) instead of collecting every match
    # to the driver just to read element 0.
    match = (
        movie_names
        .filter(func.col("movieID") == movie_id)
        .select("movieTitle")
        .first()
    )
    if match is None:
        raise IndexError(f"No movie with movieID={movie_id!r} found in movie_names")

    return match[0]


In [15]:

# Quality thresholds: minimum similarity score and minimum number of co-ratings.
score_threshold = 0.97
co_occurrence_threshold = 50.0

# The movie whose neighbours are reported.
movie_id = 50

# Keep only pairs that involve the target movie and clear both thresholds.
filtered_results = (
    movie_pair_similarities
    .filter((func.col("movie1") == movie_id) | (func.col("movie2") == movie_id))
    .filter(func.col("score") > score_threshold)
    .filter(func.col("numPairs") > co_occurrence_threshold)
)

# Highest score first; at most ten rows are returned to the driver.
results = filtered_results.sort(func.col("score").desc()).take(10)

print("Top 10 similar movies for " + get_movie_name(movie_names, movie_id))

for result in results:
    # The target movie can be on either side of the pair; report the other one.
    similar_movie_id = result.movie2 if result.movie1 == movie_id else result.movie1

    print(f"{get_movie_name(movie_names, similar_movie_id)}:")
    print(f"\tscore: {str(round(result.score, 3))}")
    print(f"\tstrength: {str(result.numPairs)}")


                                                                                

Top 10 similar movies for Star Wars (1977)
Empire Strikes Back, The (1980):
	score: 0.99
	strength: 345
Return of the Jedi (1983):
	score: 0.986
	strength: 480
Raiders of the Lost Ark (1981):
	score: 0.982
	strength: 380
20,000 Leagues Under the Sea (1954):
	score: 0.979
	strength: 68
12 Angry Men (1957):
	score: 0.978
	strength: 109
Close Shave, A (1995):
	score: 0.978
	strength: 92
African Queen, The (1951):
	score: 0.976
	strength: 138
Sting, The (1973):
	score: 0.975
	strength: 204
Wrong Trousers, The (1993):
	score: 0.975
	strength: 103
Wallace & Gromit: The Best of Aardman Animation (1996):
	score: 0.974
	strength: 58


In [21]:
# Stop the Spark session now that the analysis is finished.
spark.stop()