In [45]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as sf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType
import argparse
import os

In [46]:
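# NOTE: command-line parsing is disabled in this notebook; movieID is assigned
# directly in a later cell instead of being read from the command line.
# def parse_args() -> argparse.Namespace:
#     parser = argparse.ArgumentParser(
#         prog='movie-recommendations',
#         description='Suggest 10 movie recommendations based on similarity to movieID argument')
#     parser.add_argument(
#         'movieID', type=int,
#         help='movieID integer to base recommendations on')
#     return parser.parse_args()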

def getMovieNames(spark: SparkSession, resource_path: str) -> DataFrame:
    """Read the movie-title lookup table from a pipe-delimited text file.

    Args:
        spark: Session whose reader is used to load the file.
        resource_path: Location of the file to read.

    Returns:
        A DataFrame with the columns ``movieID`` (integer) and
        ``movieTitle`` (string), both nullable.
    """
    titleSchema = (
        StructType()
        .add("movieID", IntegerType(), True)
        .add("movieTitle", StringType(), True)
    )
    # Fields are '|'-separated and the text is decoded as ISO-8859-1.
    reader = (
        spark.read
        .schema(titleSchema)
        .option('sep', '|')
        .option('charset', 'ISO-8859-1')
    )
    return reader.csv(resource_path)

def getMovies(spark: SparkSession, resource_path: str) -> DataFrame:
    """Read the per-user rating records from a tab-delimited text file.

    Args:
        spark: Session whose reader is used to load the file.
        resource_path: Location of the file to read.

    Returns:
        A DataFrame with the nullable columns ``userID``, ``movieID`` and
        ``rating`` (integers) plus ``timestamp`` (long).
    """
    ratingSchema = (
        StructType()
        .add("userID", IntegerType(), True)
        .add("movieID", IntegerType(), True)
        .add("rating", IntegerType(), True)
        .add("timestamp", LongType(), True)
    )
    reader = spark.read.schema(ratingSchema).option('sep', '\t')
    return reader.csv(resource_path)


In [47]:
# Movie the recommendations are based on.
movieID = 50

# Local Spark session using every available core.
spark = (
    SparkSession.builder
    .appName("MovieRecommendations")
    .master("local[*]")
    .getOrCreate()
)

# Absolute path of the current working directory; the data files are
# addressed relative to it.
pwd = os.path.abspath('')
movieNames = getMovieNames(spark, f"file:///{pwd}/../data/ml-100k/u.item")
movies = getMovies(spark, f"file:///{pwd}/../data/ml-100k/u.data")

# The timestamp column is not needed downstream, so keep only these three.
ratings = movies.select('userID', 'movieID', 'rating')

In [48]:
def getMovieName(movieNames: DataFrame, movieID: int) -> str:
    """Return the title stored for ``movieID`` in the lookup table.

    Args:
        movieNames: DataFrame with ``movieID`` and ``movieTitle`` columns.
        movieID: Identifier of the movie to look up.

    Returns:
        The ``movieTitle`` value of the first row whose ``movieID`` matches.

    Raises:
        ValueError: If no row in ``movieNames`` has the requested ``movieID``.
    """
    match = movieNames.where(sf.col("movieID") == movieID).first()
    # first() yields None on an empty result; fail with a clear message
    # instead of an opaque "'NoneType' object is not subscriptable".
    if match is None:
        raise ValueError(f"No movie found with movieID={movieID}")
    return match['movieTitle']

# Resolve the target movie's title so the progress message is human-readable.
movieName = getMovieName(movieNames, movieID)
print(f"Calculating recommendations for movie: '{movieName}'")

Calculating recommendations for movie: 'Star Wars (1977)'


In [6]:
# Sanity check: print the schema of the projected ratings and preview 5 rows.
ratings.printSchema()
ratings.show(5)

root
 |-- userID: integer (nullable = true)
 |-- movieID: integer (nullable = true)
 |-- rating: integer (nullable = true)

+------+-------+------+
|userID|movieID|rating|
+------+-------+------+
|   196|    242|     3|
|   186|    302|     3|
|    22|    377|     1|
|   244|     51|     2|
|   166|    346|     1|
+------+-------+------+
only showing top 5 rows



In [15]:
def getMoviePairs(ratings: DataFrame) -> DataFrame:
    """Pair up the ratings that one user gave to two different movies.

    The ratings table is joined with itself on ``userID``. Requiring
    ``x.movieID < y.movieID`` keeps each unordered movie pair once per user
    and drops a movie paired with itself.

    Args:
        ratings: DataFrame with ``userID``, ``movieID`` and ``rating`` columns.

    Returns:
        A DataFrame with the columns ``movie_x``, ``rating_x``, ``movie_y``
        and ``rating_y``, one row per (user, movie pair).
    """
    left = ratings.alias('x')
    right = ratings.alias('y')

    sameUser = sf.col('x.userID') == sf.col('y.userID')
    orderedPair = sf.col('x.movieID') < sf.col('y.movieID')

    pairColumns = [
        # x
        sf.col('x.movieID').alias('movie_x'),
        sf.col('x.rating').alias('rating_x'),
        # y
        sf.col('y.movieID').alias('movie_y'),
        sf.col('y.rating').alias('rating_y'),
    ]
    return left.join(right, on=sameUser & orderedPair).select(*pairColumns)

# Build the per-user movie pairs, then print the schema and preview 10 rows.
moviePairs = getMoviePairs(ratings)
moviePairs.printSchema()
moviePairs.show(10)

root
 |-- movie_x: integer (nullable = true)
 |-- rating_x: integer (nullable = true)
 |-- movie_y: integer (nullable = true)
 |-- rating_y: integer (nullable = true)

+-------+--------+-------+--------+
|movie_x|rating_x|movie_y|rating_y|
+-------+--------+-------+--------+
|    242|       3|    269|       3|
|    242|       3|    845|       4|
|    242|       3|   1022|       4|
|    242|       3|    762|       3|
|    242|       3|    411|       4|
|    242|       3|   1007|       4|
|    242|       3|   1241|       3|
|    242|       3|    285|       5|
|    242|       3|    382|       4|
|    242|       3|    287|       3|
+-------+--------+-------+--------+
only showing top 10 rows



In [40]:
from itertools import combinations_with_replacement
def cosineSimilarity(pairs: DataFrame) -> DataFrame:
    """Score every movie pair by the cosine similarity of its paired ratings.

    For each ``(movie_x, movie_y)`` group the score is
    ``sum(x*y) / (sqrt(sum(x*x)) * sqrt(sum(y*y)))`` over the group's rows.

    Args:
        pairs: DataFrame with ``movie_x``, ``rating_x``, ``movie_y`` and
            ``rating_y`` columns.

    Returns:
        A DataFrame with the columns ``movie_x``, ``movie_y``, ``score`` and
        ``coOccurrence`` (number of rows in the group with a non-null ``xy``
        product). ``score`` is null when the denominator is 0.
    """
    ratingX = sf.col('rating_x')
    ratingY = sf.col('rating_y')

    # Row-level products that feed the aggregated sums below.
    products = (
        pairs
        .withColumn('xx', ratingX * ratingX)
        .withColumn('xy', ratingX * ratingY)
        .withColumn('yy', ratingY * ratingY)
    )

    dotProduct = sf.sum('xy').alias('numerator')
    normProduct = (sf.sqrt(sf.sum('xx')) * sf.sqrt(sf.sum('yy'))).alias('denominator')
    pairCount = sf.count('xy').alias('coOccurrence')
    aggregated = products.groupBy('movie_x', 'movie_y').agg(dotProduct, normProduct, pairCount)

    # No otherwise() branch: a zero denominator produces a null score.
    score = sf.when(
        sf.col('denominator') != 0,
        sf.col('numerator') / sf.col('denominator')
    ).alias('score')
    return aggregated.select('movie_x', 'movie_y', score, 'coOccurrence')

# Score every movie pair, then print the schema and preview 5 rows.
similarity = cosineSimilarity(moviePairs)
similarity.printSchema()
similarity.show(5)

root
 |-- movie_x: integer (nullable = true)
 |-- movie_y: integer (nullable = true)
 |-- score: double (nullable = true)
 |-- coOccurrence: long (nullable = false)

+-------+-------+------------------+------------+
|movie_x|movie_y|             score|coOccurrence|
+-------+-------+------------------+------------+
|     51|    924|0.9465030160396292|          15|
|    451|    529|0.8700048504395461|          30|
|     86|    318|0.9562989269248869|          95|
|     40|    167|0.9488483124502475|          23|
|    274|   1211|0.9799118698777318|           7|
+-------+-------+------------------+------------+
only showing top 5 rows



In [44]:
def getTopSimilarTo(similarity: DataFrame,
                movieID: int, limit: int = 10,
                scoreThreshold: float = .97,
                coOccurrenceThreshold: float = 50.0
                ) -> DataFrame:
    """Pick the highest-scoring movies paired with ``movieID``.

    Args:
        similarity: DataFrame with ``movie_x``, ``movie_y``, ``score`` and
            ``coOccurrence`` columns.
        movieID: Movie to find neighbours for.
        limit: Maximum number of rows to return.
        scoreThreshold: Only pairs with ``score`` strictly above this value
            are kept.
        coOccurrenceThreshold: Only pairs with ``coOccurrence`` strictly above
            this value are kept.

    Returns:
        A DataFrame with the columns ``movieID`` (the other movie of the
        pair), ``score`` (rounded to 3 decimals) and ``strength`` (the pair's
        ``coOccurrence``), holding at most ``limit`` rows.
    """
    # The target may sit in either column of a pair, so check both sides.
    involvesTarget = (
        (sf.col('movie_x') == movieID)
        | (sf.col('movie_y') == movieID)
    )
    passesThresholds = (
        (sf.col('score') > scoreThreshold)
        & (sf.col('coOccurrence') > coOccurrenceThreshold)
    )
    # Report whichever side of the pair is NOT the target movie.
    otherMovie = sf.when(
        sf.col('movie_x') != movieID,
        sf.col('movie_x')
    ).otherwise(sf.col('movie_y'))

    topSimilar = (
        similarity
        .where(involvesTarget & passesThresholds)
        # Rank on the unrounded score; rounding only happens in the projection.
        .sort('score', ascending=False)
        .limit(limit)
        .select(
            otherMovie.alias('movieID'),
            sf.round('score', 3).alias('score'),
            sf.col('coOccurrence').alias('strength')
        )
    )
    return topSimilar

# Find the closest matches for the target movie, attach their titles, and
# order the result from best to worst score.
starWarsSimilar = (
    getTopSimilarTo(similarity, movieID)
    .join(movieNames, on='movieID')
    .sort('score', ascending=False)
)

starWarsSimilar.show(truncate=False)

+-------+-----+--------+-------------------------+
|movieID|score|strength|movieTitle               |
+-------+-----+--------+-------------------------+
|480    |0.972|101     |North by Northwest (1959)|
+-------+-----+--------+-------------------------+

