In [1]:
import sys
import os

In [2]:
# Show the JAVA_HOME value visible to this kernel (displays None when unset).
# NOTE(review): presumably a sanity check that a JDK is configured before Spark starts — confirm.
os.environ.get('JAVA_HOME')

'C:\\Program Files\\Java\\jdk1.8.0_311'

In [3]:
import findspark

# Must run before the pyspark imports in the next cell.
# NOTE(review): presumably locates the local Spark install and makes pyspark
# importable — confirm against the findspark documentation.
findspark.init()

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
import numpy as np

In [5]:
# Create (or reuse) the SparkSession used by every cell below.
spark = (
    SparkSession.builder
    .config("spark.sql.warehouse.dir", "temp")
    .appName("SparkSQL")
    .getOrCreate()
)

In [6]:
# Build a movie ID -> title lookup from the pipe-delimited item file:
# field 0 is the numeric movie ID, field 1 is the title.
#
# The encoding is passed explicitly so the result does not depend on the
# platform default. The MovieLens 100K item file contains accented title
# characters stored as single bytes, which fail to decode where the default
# encoding is UTF-8.
# NOTE(review): the dataset ships this file as "u.item"; the upper-case name
# below only resolves on case-insensitive file systems — confirm the local name.
movie_names = {}
with open("data/ml-100k/u.ITEM", encoding="ISO-8859-1") as f:
    for line in f:
        fields = line.split('|')
        movie_names[int(fields[0])] = fields[1]

In [7]:
# Load the raw ratings file as an RDD of text lines (one rating per line).
data = spark.sparkContext.textFile("data/ml-100k/u.data")

In [8]:
# Peek at the first rows after whitespace-splitting; each row has four string fields.
# The cells below use the first three as user ID, movie ID and rating.
data.map(lambda l: l.split()).take(5)

[['196', '242', '3', '881250949'],
 ['186', '302', '3', '891717742'],
 ['22', '377', '1', '878887116'],
 ['244', '51', '2', '880606923'],
 ['166', '346', '1', '886397596']]

In [9]:
# Turn each raw line into a key-value pair: user ID => (movie ID, rating).
# IDs are parsed as ints and the rating as a float.
ratings = (
    data.map(lambda line: line.split())
        .map(lambda cols: (int(cols[0]), (int(cols[1]), float(cols[2]))))
)

In [10]:
# Emit every pair of movies rated by the same user by self-joining on user ID.
# The join yields every combination, including a movie paired with itself and
# both orderings of each pair; those are removed by filter_duplicates later.
joinedRatings = ratings.join(ratings)

In [11]:
# At this point the RDD consists of userID => ((movieID, rating), (movieID, rating)).
# The sample still contains self-pairs such as (242, 242).
joinedRatings.take(5)

[(196, ((242, 3.0), (242, 3.0))),
 (196, ((242, 3.0), (393, 4.0))),
 (196, ((242, 3.0), (381, 4.0))),
 (196, ((242, 3.0), (251, 3.0))),
 (196, ((242, 3.0), (655, 5.0)))]

In [12]:
def make_pairs(user_movie_dict):
    """Re-key one joined record by its movie pair.

    Args:
        user_movie_dict: A ``(user_id, ((movie1, rating1), (movie2, rating2)))``
            tuple (despite the parameter name, it is a tuple, not a dict).

    Returns:
        ``((movie1, movie2), (rating1, rating2))``; the user ID is dropped.
    """
    _user_id, rated_pair = user_movie_dict
    first_movie, first_rating = rated_pair[0]
    second_movie, second_rating = rated_pair[1]
    movie_key = (first_movie, second_movie)
    rating_value = (first_rating, second_rating)
    return (movie_key, rating_value)

In [13]:
def filter_duplicates(user_movie_dict):
    """Decide whether a joined record should be kept.

    Args:
        user_movie_dict: A ``(user_id, ((movie1, rating1), (movie2, rating2)))``
            tuple (despite the parameter name, it is a tuple, not a dict).

    Returns:
        True only when ``movie1 < movie2``. This rejects a movie paired with
        itself and keeps exactly one ordering of every distinct pair.
    """
    _user_id, rated_pair = user_movie_dict
    left_movie, _left_rating = rated_pair[0]
    right_movie, _right_rating = rated_pair[1]
    return left_movie < right_movie

In [14]:
# Keep only records where movie1 < movie2 (drops self-pairs and mirrored duplicates).
unique_ratings = joinedRatings.filter(filter_duplicates)

In [15]:
# Sanity check: the (242, 242) self-pair seen earlier should no longer appear.
unique_ratings.take(5)

[(196, ((242, 3.0), (393, 4.0))),
 (196, ((242, 3.0), (381, 4.0))),
 (196, ((242, 3.0), (251, 3.0))),
 (196, ((242, 3.0), (655, 5.0))),
 (196, ((242, 3.0), (306, 4.0)))]

In [16]:
# Re-key by movie pair: (movie1, movie2) => (rating1, rating2); the user ID is dropped.
movie_pairs = unique_ratings.map(make_pairs)

In [17]:
# Preview the re-keyed records.
movie_pairs.take(5)

[((242, 393), (3.0, 4.0)),
 ((242, 381), (3.0, 4.0)),
 ((242, 251), (3.0, 3.0)),
 ((242, 655), (3.0, 5.0)),
 ((242, 306), (3.0, 4.0))]

In [18]:
# Collect every (rating1, rating2) tuple observed for each movie pair.
movie_pair_ratings = movie_pairs.groupByKey()

In [19]:
# Preview the grouped records; values display as ResultIterable objects rather than lists.
movie_pair_ratings.take(5)

[((242, 580), <pyspark.resultiterable.ResultIterable at 0x23ae684def0>),
 ((242, 692), <pyspark.resultiterable.ResultIterable at 0x23ae684d208>),
 ((242, 428), <pyspark.resultiterable.ResultIterable at 0x23ae684d780>),
 ((242, 340), <pyspark.resultiterable.ResultIterable at 0x23ae684dcf8>),
 ((393, 1241), <pyspark.resultiterable.ResultIterable at 0x23ae684d5c0>)]

## Extract similarities for the target movie

In [20]:
# Similarity score a pair must strictly exceed to be reported.
scoreThreshold = 0.97
# Number of co-rating pairs a movie pair must strictly exceed to be reported.
coOccurenceThreshold = 50
# ID of the movie whose neighbours we want; it is looked up in movie_names when printing.
movieID = 50

In [21]:
from math import sqrt

In [22]:
def compute_cosine_similarity(ratingPairs):
    """Cosine similarity between the two rating vectors of a movie pair.

    Args:
        ratingPairs: Iterable of ``(ratingX, ratingY)`` tuples. It is consumed
            in a single pass, so one-shot iterators are acceptable.

    Returns:
        ``(score, numPairs)`` where ``numPairs`` is the number of tuples seen
        and ``score`` is ``sum(x*y) / (sqrt(sum(x*x)) * sqrt(sum(y*y)))``.
        When the denominator is zero (empty input or an all-zero vector)
        the score is the integer ``0``.
    """
    pair_count = 0
    dot_xy = 0
    norm_sq_x = 0
    norm_sq_y = 0
    for x, y in ratingPairs:
        norm_sq_x += x * x
        norm_sq_y += y * y
        dot_xy += x * y
        pair_count += 1

    magnitude = sqrt(norm_sq_x) * sqrt(norm_sq_y)
    if not magnitude:
        # Undefined similarity: report 0 rather than dividing by zero.
        return (0, pair_count)
    return (dot_xy / float(magnitude), pair_count)

In [23]:
# Score every movie pair: (movie1, movie2) => (cosine score, number of rating pairs).
# The result is cached because it is used by both the preview and the filter below.
movie_pair_similarities = movie_pair_ratings.mapValues(
    compute_cosine_similarity).cache()

In [24]:
# Preview the scores; note a pair with a single co-rating scores a perfect 1.0,
# which is why a co-occurrence threshold is applied later.
movie_pair_similarities.take(5)

[((242, 580), (0.9443699330874624, 6)),
 ((242, 692), (0.9203762039948743, 18)),
 ((242, 428), (0.9419097988977888, 15)),
 ((242, 340), (0.9455404837184603, 32)),
 ((393, 1241), (1.0, 1))]

In [25]:
# Keep only pairs that involve the target movie and pass both quality
# thresholds (score and co-occurrence count, each compared strictly).
filtered_results = movie_pair_similarities.filter(
    lambda entry: (
        (entry[0][0] == movieID or entry[0][1] == movieID)
        and entry[1][0] > scoreThreshold
        and entry[1][1] > coOccurenceThreshold
    )
)

In [26]:
# Swap to ((score, count), (movie1, movie2)) so the similarity tuple is the
# sort key, order from best to worst, and pull the ten best to the driver.
results = (
    filtered_results
    .map(lambda entry: (entry[1], entry[0]))
    .sortByKey(ascending=False)
    .take(10)
)

In [27]:
# Report the matches in the order held by `results`.
print(f"Top 10 similar movies for {movie_names[movieID]}\n")
for (score, strength), (first_id, second_id) in results:
    # Every pair contains the target movie; show whichever member is the other one.
    similar_movie_id = second_id if first_id == movieID else first_id
    print(f"Movie:    {movie_names[similar_movie_id]}")
    print(f"Score:    {score}")
    print(f"Strength: {strength}\n")

Top 10 similar movies for Star Wars (1977)

Movie:    Empire Strikes Back, The (1980)
Score:    0.9895522078385338
Strength: 345

Movie:    Return of the Jedi (1983)
Score:    0.9857230861253026
Strength: 480

Movie:    Raiders of the Lost Ark (1981)
Score:    0.981760098872619
Strength: 380

Movie:    20,000 Leagues Under the Sea (1954)
Score:    0.9789385605497993
Strength: 68

Movie:    12 Angry Men (1957)
Score:    0.9776576120448436
Strength: 109

Movie:    Close Shave, A (1995)
Score:    0.9775948291054827
Strength: 92

Movie:    African Queen, The (1951)
Score:    0.9764692222674887
Strength: 138

Movie:    Sting, The (1973)
Score:    0.9751512937740359
Strength: 204

Movie:    Wrong Trousers, The (1993)
Score:    0.9748681355460885
Strength: 103

Movie:    Wallace & Gromit: The Best of Aardman Animation (1996)
Score:    0.9741816128302572
Strength: 58

