In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
import numpy as np

In [2]:
# Create the SparkSession used by this notebook, or reuse one that already exists.
spark = SparkSession.builder.appName("Movie Analysis").getOrCreate()

In [3]:
# Load the raw ratings file as an RDD of text lines.
data = spark.sparkContext.textFile("../data/ml-100k/u.data")

In [4]:
# Preview the first five rows, split on whitespace. take(5) fetches only the rows
# needed instead of collecting the whole RDD onto the driver.
data.map(lambda l: l.split()).take(5)

[['196', '242', '3', '881250949'],
 ['186', '302', '3', '891717742'],
 ['22', '377', '1', '878887116'],
 ['244', '51', '2', '880606923'],
 ['166', '346', '1', '886397596']]

In [5]:
# Key each rating by its user: user ID => (movie ID, rating)
def _to_user_keyed_rating(line):
    """Parse one whitespace-delimited line into (user_id, (movie_id, rating))."""
    fields = line.split()
    return (int(fields[0]), (int(fields[1]), float(fields[2])))


ratings = data.map(_to_user_keyed_rating)

In [6]:
# Preview five parsed ratings without collecting the whole RDD onto the driver.
ratings.take(5)

[(196, (242, 3.0)),
 (186, (302, 3.0)),
 (22, (377, 1.0)),
 (244, (51, 2.0)),
 (166, (346, 1.0))]

In [7]:
# Self-join on user ID: emits every ordered pair of ratings made by the same user.
# This includes each rating paired with itself and both orderings of every pair.
joinedRatings = ratings.join(ratings)

In [8]:
# At this point the RDD consists of userID => ((movieID, rating), (movieID, rating)).
# The self-join output can be very large, so preview it with take(5) rather than
# collecting every record onto the driver just to show five rows.
joinedRatings.take(5)

[(196, ((242, 3.0), (242, 3.0))),
 (196, ((242, 3.0), (393, 4.0))),
 (196, ((242, 3.0), (381, 4.0))),
 (196, ((242, 3.0), (251, 3.0))),
 (196, ((242, 3.0), (655, 5.0)))]

In [9]:
# Inspect the concrete RDD class produced by the join.
type(joinedRatings)

pyspark.rdd.PipelinedRDD

In [10]:
def filter_duplicates(user_movie_dict):
    """Tell whether a joined record should be kept as a unique movie pair.

    Args:
        user_movie_dict: a (user_id, ((movie_id, rating), (movie_id, rating)))
            tuple as produced by the self-join (a tuple, despite the name).

    Returns:
        True only when the first movie ID is strictly lower than the second,
        which rejects self-pairs and the mirrored ordering of each pair.
    """
    _user_id, rated = user_movie_dict
    first_movie, _first_rating = rated[0]
    second_movie, _second_rating = rated[1]
    is_canonical_order = first_movie < second_movie
    return is_canonical_order

In [11]:
# Keep only records whose first movie ID is lower than the second, which drops
# self-pairs and mirrored duplicates.
unique_ratings = joinedRatings.filter(filter_duplicates)

In [12]:
# Preview five de-duplicated records without collecting the whole RDD onto the driver.
unique_ratings.take(5)

[(196, ((242, 3.0), (393, 4.0))),
 (196, ((242, 3.0), (381, 4.0))),
 (196, ((242, 3.0), (251, 3.0))),
 (196, ((242, 3.0), (655, 5.0))),
 (196, ((242, 3.0), (306, 4.0)))]

In [13]:
def make_pairs(user_movie_dict):
    """Re-key a joined record by its movie pair, discarding the user ID.

    Args:
        user_movie_dict: a (user_id, ((movie_id, rating), (movie_id, rating)))
            tuple (a tuple, despite the name).

    Returns:
        ((movie1_id, movie2_id), (rating1, rating2))
    """
    _user_id, rated = user_movie_dict
    (first_movie, first_rating), (second_movie, second_rating) = rated[0], rated[1]
    movie_key = (first_movie, second_movie)
    rating_value = (first_rating, second_rating)
    return (movie_key, rating_value)

In [14]:
# Convert each record to (movie1, movie2) => (rating1, rating2).
movie_pairs = unique_ratings.map(make_pairs)

In [15]:
# Preview five movie-pair records without collecting the whole RDD onto the driver.
movie_pairs.take(5)

[((242, 393), (3.0, 4.0)),
 ((242, 381), (3.0, 4.0)),
 ((242, 251), (3.0, 3.0)),
 ((242, 655), (3.0, 5.0)),
 ((242, 306), (3.0, 4.0))]

In [16]:
# Gather every (rating1, rating2) tuple that shares the same movie pair under one key.
movie_pair_ratings = movie_pairs.groupByKey()

In [17]:
# Preview five grouped entries without collecting every group onto the driver.
movie_pair_ratings.take(5)

[((242, 580), <pyspark.resultiterable.ResultIterable at 0x1c6c2bb2d10>),
 ((242, 692), <pyspark.resultiterable.ResultIterable at 0x1c6c2bb3bd0>),
 ((242, 428), <pyspark.resultiterable.ResultIterable at 0x1c6c2bb3ed0>),
 ((242, 340), <pyspark.resultiterable.ResultIterable at 0x1c6c2bb3e90>),
 ((393, 1241), <pyspark.resultiterable.ResultIterable at 0x1c6c2bb3fd0>)]

## Extract similarities for the movie

In [18]:
# Quality thresholds and target movie for the similarity filter further down.
scoreThreshold = 0.97  # a pair must score strictly above this similarity
coOccurenceThreshold = 50  # a pair must be co-rated strictly more than this many times
movieID = 50  # ID of the movie whose nearest neighbours are reported

In [19]:
from math import sqrt

In [20]:
def compute_cosine_similarity(ratingPairs):
    """Compute the cosine similarity between two movies' rating vectors.

    Args:
        ratingPairs: iterable of (ratingX, ratingY) tuples, one per co-rating
            of the two movies.

    Returns:
        (score, numPairs): the cosine similarity and the number of rating
        pairs consumed. The score is 0 when either vector has zero magnitude
        (which includes an empty iterable).
    """
    pair_count = 0
    dot_product = norm_sq_x = norm_sq_y = 0
    # Single pass with explicit accumulation so results match term-by-term addition.
    for pair_count, (x, y) in enumerate(ratingPairs, start=1):
        norm_sq_x += x * x
        norm_sq_y += y * y
        dot_product += x * y

    magnitude = sqrt(norm_sq_x) * sqrt(norm_sq_y)
    if not magnitude:
        # Avoid dividing by zero; an undefined similarity is reported as 0.
        return (0, pair_count)
    return (dot_product / magnitude, pair_count)

In [21]:
# Score every movie pair as (similarity, number of co-ratings). cache() marks the
# result for reuse by the actions that follow.
movie_pair_similarities = movie_pair_ratings.mapValues(
    compute_cosine_similarity).cache()

In [22]:
# Preview five scored pairs without collecting every pair onto the driver.
movie_pair_similarities.take(5)

[((242, 580), (0.9443699330874624, 6)),
 ((242, 692), (0.9203762039948743, 18)),
 ((242, 428), (0.9419097988977888, 15)),
 ((242, 340), (0.9455404837184603, 32)),
 ((393, 1241), (1.0, 1))]

In [23]:
# Keep only pairs that involve the target movie and clear both quality thresholds
def _is_good_match(entry):
    """Return True when a ((movie1, movie2), (score, numPairs)) entry passes the filter."""
    (first_movie, second_movie), (score, num_pairs) = entry
    if movieID not in (first_movie, second_movie):
        return False
    return score > scoreThreshold and num_pairs > coOccurenceThreshold


filtered_results = movie_pair_similarities.filter(_is_good_match)

In [24]:
# Rank by (score, strength), best first, and keep the ten strongest matches
def _similarity_first(entry):
    """Swap (pair, similarity) into (similarity, pair) so the similarity is the sort key."""
    pair, similarity = entry
    return (similarity, pair)


results = (
    filtered_results
    .map(_similarity_first)
    .sortByKey(ascending=False)
    .take(10)
)

In [27]:
# Build a movie ID -> title lookup from the pipe-delimited item file.
movie_names = {}
# The MovieLens 100K item file is named "u.item" (lower case), so the upper-case
# spelling fails on case-sensitive filesystems. It is encoded as ISO-8859-1, so the
# encoding is given explicitly to avoid a UnicodeDecodeError where the default is UTF-8.
with open("../data/ml-100k/u.item", encoding="ISO-8859-1") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at end of file)
        fields = line.split('|')
        movie_names[int(fields[0])] = fields[1]

In [28]:
# Report the best matches for the target movie, one block per match
print(f"Top 10 similar movies for {movie_names[movieID]}\n")
for (score, strength), (first_movie, second_movie) in results:
    # Each pair contains the target movie; show whichever member is the other one
    other_movie_id = second_movie if first_movie == movieID else first_movie
    print(f"Movie:    {movie_names[other_movie_id]}")
    print(f"Score:    {score}")
    print(f"Strength: {strength}\n")

Top 10 similar movies for Star Wars (1977)

Movie:    Empire Strikes Back, The (1980)
Score:    0.9895522078385338
Strength: 345

Movie:    Return of the Jedi (1983)
Score:    0.9857230861253026
Strength: 480

Movie:    Raiders of the Lost Ark (1981)
Score:    0.981760098872619
Strength: 380

Movie:    20,000 Leagues Under the Sea (1954)
Score:    0.9789385605497993
Strength: 68

Movie:    12 Angry Men (1957)
Score:    0.9776576120448436
Strength: 109

Movie:    Close Shave, A (1995)
Score:    0.9775948291054827
Strength: 92

Movie:    African Queen, The (1951)
Score:    0.9764692222674887
Strength: 138

Movie:    Sting, The (1973)
Score:    0.9751512937740359
Strength: 204

Movie:    Wrong Trousers, The (1993)
Score:    0.9748681355460885
Strength: 103

Movie:    Wallace & Gromit: The Best of Aardman Animation (1996)
Score:    0.9741816128302572
Strength: 58



In [29]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()

In [30]:
# NOTE(review): spark.stop() in the previous cell should already stop the underlying
# SparkContext, so this call is presumably redundant. Confirm, then consider removing.
spark.sparkContext.stop()