In [1]:
# (Re)load IPython's autoreload extension and use mode 2, which re-imports
# all modules before each cell runs so edits to imported .py files are
# picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import sys
from pyspark import SparkConf, SparkContext
from math import sqrt
from pathlib import Path
from toolz import take
import shutil

In [3]:
import psutil
# Show how many CPUs psutil reports for this machine.
# NOTE(review): presumably checked because the Spark master used in this
# notebook is "local[*]" — confirm.
psutil.cpu_count()

4

In [4]:
# Root directory for the notebook's datasets and outputs: <home>/data.
DATA_TOP = Path.home() / 'data'

In [5]:
# Run Spark in-process ("local[*]" master) under the app name "MovieSimilarities".
conf = SparkConf().setMaster("local[*]").setAppName("MovieSimilarities")
# getOrCreate returns the already-running context when this cell is re-executed;
# calling the SparkContext constructor a second time in the same kernel raises
# because only one context may be active at a time.
sc = SparkContext.getOrCreate(conf=conf)

In [14]:
def loadMovieNames():
    """Build a movie-ID -> title lookup from the ml-100k item file.

    Reads ``DATA_TOP / 'ml-100k/u.ITEM'``, a pipe-delimited file whose first
    field is the integer movie ID and whose second field is the title.

    Returns:
        dict mapping ``int`` movie ID to ``str`` title.

    Raises:
        OSError: if the item file cannot be opened.
        ValueError: if a non-blank line does not start with an integer ID.
        IndexError: if a non-blank line has no ``|``-separated title field.
    """
    infile = DATA_TOP / 'ml-100k/u.ITEM'
    # The MovieLens 100K item file is ISO-8859-1 encoded. Decoding it as ASCII
    # with errors='ignore' silently dropped accented characters from titles;
    # ISO-8859-1 maps every byte to a character, so nothing is lost and
    # decoding can never fail.
    with infile.open(encoding='ISO-8859-1') as f:
        # Skip blank lines (e.g. a trailing newline) instead of crashing on int('').
        rows = (line.split('|') for line in f if line.strip())
        return {int(fields[0]): fields[1] for fields in rows}

In [7]:
def filterDuplicates(userRatings):
    """Keep only one ordering of each movie pair produced by the self-join.

    ``userRatings`` is ``(key, ((movie1, rating1), (movie2, rating2)))``.
    Returns True only when ``movie1 < movie2``, which discards both the
    mirrored pair (B, A) and the self-pair (A, A).
    """
    pair = userRatings[1]
    left_id, _left_rating = pair[0]
    right_id, _right_rating = pair[1]
    return left_id < right_id

In [8]:
def makePairs(userRatings):
    """Re-key a joined record by its movie pair.

    ``userRatings`` is ``(key, ((movie1, rating1), (movie2, rating2)))``; the
    leading key is dropped and the result is
    ``((movie1, movie2), (rating1, rating2))``.
    """
    pair = userRatings[1]
    (id_a, score_a), (id_b, score_b) = pair[0], pair[1]
    return ((id_a, id_b), (score_a, score_b))

In [9]:
def computeCosineSimilarity(ratingPairs):
    """Cosine similarity between two rating vectors given as (x, y) pairs.

    Args:
        ratingPairs: iterable of ``(ratingX, ratingY)`` tuples; it is consumed
            in a single pass.

    Returns:
        ``(score, numPairs)`` where ``score`` is
        ``sum(x*y) / (sqrt(sum(x*x)) * sqrt(sum(y*y)))`` and ``numPairs`` is
        the number of pairs seen. ``score`` is ``0`` when the denominator is
        zero (including the empty-input case).
    """
    count = 0
    dot = norm_x_sq = norm_y_sq = 0
    for x, y in ratingPairs:
        count += 1
        dot += x * y
        norm_x_sq += x * x
        norm_y_sq += y * y

    magnitude = sqrt(norm_x_sq) * sqrt(norm_y_sq)
    # Guard against division by zero when either vector is all zeros or empty.
    if not magnitude:
        return (0, count)
    return (dot / magnitude, count)

In [15]:
print("\nLoading movie names...")
# Lookup table (movie ID -> title) used later when printing the similar movies.
nameDict = loadMovieNames()


Loading movie names...


In [11]:
%%time 
# Build the movie-pair similarity RDD from the ml-100k ratings file.
infile = f"file:///{DATA_TOP}/ml-100k/u.data"
outfile = DATA_TOP/ 'movie-sims'
# Remove any previous output directory so a later save starts from a clean target.
if outfile.is_dir():
    shutil.rmtree(outfile)
    
data = sc.textFile(infile)
# output (one string per line): ['196\t242\t3\t881250949', '186\t302\t3\t891717742', '22\t377\t1\t878887116']
    
# Load and parse the data: split each line on whitespace and keep the first
# three fields as (int, (int, float)); the fourth field is dropped.
ratings = (data
           .map(lambda l: l.split())
           .map(lambda l: (int(l[0]), (int(l[1]), float(l[2]))))
          )
# output: [(196, (242, 3.0)), (186, (302, 3.0)), (22, (377, 1.0))]

# Self-join on the key to pair up every two ratings made by the same user.
joinedRatings = ratings.join(ratings)
# output:  userID => ((movieID, rating), (movieID, rating))

# Filter out duplicates per user: keep only pairs with movie1 < movie2.
uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)
# output: same shape as above

# Now key by (movie1, movie2) pairs.
moviePairs = uniqueJoinedRatings.map(makePairs)
# output: (movie_id1, movie_id2) => (rating1, rating2)
# output: [((242, 393), (3.0, 4.0)), ((242, 381), (3.0, 4.0)), ((242, 251), (3.0, 3.0))]

# Now collect all ratings for each movie pair.
moviePairRatings = moviePairs.groupByKey()
# output: (movie_id1, movie_id2) => ( (rating1, rating2), ...)

# Can now compute similarities: (movie_id1, movie_id2) => (score, numPairs).
# cache() marks the RDD for reuse by later cells that query it.
moviePairSimilarities = moviePairRatings.mapValues(computeCosineSimilarity).cache()
# Save the results if desired
#moviePairSimilarities.sortByKey()
#moviePairSimilarities.saveAsTextFile(str(outfile))
# take(3) is the first Spark action in this cell, so the transformations above
# are only executed here.
ret = moviePairSimilarities.take(3)

CPU times: user 30 ms, sys: 30 ms, total: 60 ms
Wall time: 34.8 s


In [12]:
# Display the three sample (movie pair, (score, numPairs)) records fetched by take(3).
ret

[((242, 580), (0.9443699330874624, 6)),
 ((242, 692), (0.9203762039948743, 18)),
 ((242, 428), (0.9419097988977888, 15))]

In [13]:
# Rough items-per-second figure: 34.8 matches the wall time (seconds) reported
# for the similarity pipeline cell.
# NOTE(review): 983206 is presumably a record/pair count obtained outside this
# notebook — confirm where it comes from.
print(983206/34.8)

28253.045977011498


In [20]:
%%time

scoreThreshold = 0.97
coOccurenceThreshold = 50
movieID = 8

# Filter for movies with this sim that are "good" as defined by
# our quality thresholds above
filteredResults = moviePairSimilarities.filter(lambda pairSim: \
    (pairSim[0][0] == movieID or pairSim[0][1] == movieID) \
    and pairSim[1][0] > scoreThreshold and pairSim[1][1] > coOccurenceThreshold)

# Sort by quality score.
results = filteredResults.map(lambda pairSim: (pairSim[1], pairSim[0])).sortByKey(ascending = False).take(10)

print("Top 10 similar movies for " + nameDict[movieID])
for result in results:
    (sim, pair) = result
    # Display the similarity result that isn't the movie we're looking at
    similarMovieID = pair[0]
    if (similarMovieID == movieID):
        similarMovieID = pair[1]
    print(nameDict[similarMovieID] + "\tscore: " + str(sim[0]) + "\tstrength: " + str(sim[1]))

Top 10 similar movies for Babe (1995)
Vertigo (1958)	score: 0.9760932292330491	strength: 92
North by Northwest (1959)	score: 0.9727185836891374	strength: 82
To Kill a Mockingbird (1962)	score: 0.9724024830984216	strength: 101
Maltese Falcon, The (1941)	score: 0.9717924592249771	strength: 62
In the Line of Fire (1993)	score: 0.9716013212709038	strength: 84
Manchurian Candidate, The (1962)	score: 0.9707481140710963	strength: 66
Glory (1989)	score: 0.9702703226524435	strength: 86
CPU times: user 30 ms, sys: 10 ms, total: 40 ms
Wall time: 1.25 s
