In [1]:
// Workaround for a notebook namespace conflict: bind the session to a local val
// and import its implicits through that alias. The implicits provide the
// $"column" syntax used by the cells below.
val spark2 = spark
import spark2.implicits._

In [2]:
// Movie catalogue (movieId, title, genres). The first CSV line is the header
// and the column types are inferred from the data.
val movieDf = {
  val csvOptions = Map("header" -> "true", "inferSchema" -> "true")
  spark.read.format("CSV").options(csvOptions).load("ml-latest-small/movies.csv")
}

In [3]:
// User ratings, reduced to the three columns the similarity computation uses.
val ratingDf = {
  val csvOptions = Map("header" -> "true", "inferSchema" -> "true")
  val allRatings = spark.read.format("CSV").options(csvOptions).load("ml-latest-small/ratings.csv")
  allRatings.select("userId", "movieId", "rating")
}

In [4]:
// Preview the first three movies without truncating long cell values.
movieDf.show(numRows = 3, truncate = false)

+-------+-----------------------+-------------------------------------------+
|movieId|title                  |genres                                     |
+-------+-----------------------+-------------------------------------------+
|1      |Toy Story (1995)       |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)         |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)|Comedy|Romance                             |
+-------+-----------------------+-------------------------------------------+
only showing top 3 rows



In [5]:
// Preview the first three ratings.
ratingDf.show(numRows = 3)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|     31|   2.5|
|     1|   1029|   3.0|
|     1|   1061|   3.0|
+------+-------+------+
only showing top 3 rows



In [6]:
// Pair up every two movies rated by the same user. Keeping only pairs with
// r1.movieId < r2.movieId drops self-pairs and mirrored duplicates.
val moviePairs = {
  val left  = ratingDf.as("r1")
  val right = ratingDf.as("r2")
  left.join(right, Seq("userId"))
      .where($"r1.movieId" < $"r2.movieId")
      .select(
        $"r1.movieId".as("m1"),
        $"r2.movieId".as("m2"),
        $"r1.rating".as("r1"),
        $"r2.rating".as("r2"))
}

In [7]:
// Preview the first three co-rated movie pairs.
moviePairs.show(numRows = 3)

+---+----+---+---+
| m1|  m2| r1| r2|
+---+----+---+---+
| 31|3671|2.5|3.0|
| 31|2968|2.5|1.0|
| 31|2455|2.5|2.5|
+---+----+---+---+
only showing top 3 rows



In [8]:
import org.apache.spark.sql.functions._

// Per movie pair: how many users rated both movies, plus the three sums
// (r1*r1, r1*r2, r2*r2) from which the cosine similarity is derived.
val temp = {
  val pairCount = count(lit(1)).as("count")
  val sumR1Sq   = sum($"r1" * $"r1").as("r1*r1")
  val dotR1R2   = sum($"r1" * $"r2").as("r1*r2")
  val sumR2Sq   = sum($"r2" * $"r2").as("r2*r2")
  moviePairs.groupBy("m1", "m2").agg(pairCount, sumR1Sq, dotR1R2, sumR2Sq)
}

In [9]:
// Preview the first three aggregated pairs.
temp.show(numRows = 3)

+---+---+-----+------+------+------+
| m1| m2|count| r1*r1| r1*r2| r2*r2|
+---+---+-----+------+------+------+
| 17|357|   45|759.25| 688.5| 677.5|
|110|585|   25| 373.5|289.75|278.75|
|273|454|   12| 131.0| 129.0| 135.0|
+---+---+-----+------+------+------+
only showing top 3 rows



In [10]:
import scala.math.sqrt

// Cosine similarity of the two rating vectors of a movie pair:
//   similarity = sum(r1*r2) / (sqrt(sum(r1*r1)) * sqrt(sum(r2*r2)))
//
// Built from Spark's native column functions instead of a Scala UDF, so the
// expression stays visible to the query optimizer and rows are not passed
// through a JVM closure. Call sites keep the same shape:
// getSimilarity(colA, colB, colC) still returns a Column.
// Spark's sqrt is imported under a local alias because this cell's
// `import scala.math.sqrt` shadows the one from sql.functions.
// NOTE: with non-ANSI SQL arithmetic a zero denominator yields null, where the
// former UDF produced NaN/Infinity.
val getSimilarity = {
  import org.apache.spark.sql.Column
  import org.apache.spark.sql.functions.{sqrt => colSqrt}
  (sumSq1: Column, dot: Column, sumSq2: Column) => dot / (colSqrt(sumSq1) * colSqrt(sumSq2))
}

// Cached because every later lookup filters this same pair table.
val movieSimilarities = (temp.withColumn("similarity", getSimilarity($"r1*r1", $"r1*r2", $"r2*r2"))
                             .select("m1", "m2", "count", "similarity")).cache()

In [11]:
// Preview the first three similarity rows.
movieSimilarities.show(numRows = 3)

+---+---+-----+------------------+
| m1| m2|count|        similarity|
+---+---+-----+------------------+
| 17|357|   45|0.9599683054983534|
|110|585|   25| 0.897988352320666|
|273|454|   12|0.9700344948091243|
+---+---+-----+------------------+
only showing top 3 rows



In [12]:
// Number of movie pairs; as the first action on the cached table this also materialises it.
movieSimilarities.count()

10987079

In [13]:
// Print the column names and types of the similarity table.
movieSimilarities.printSchema()

root
 |-- m1: integer (nullable = true)
 |-- m2: integer (nullable = true)
 |-- count: long (nullable = false)
 |-- similarity: double (nullable = true)



In [14]:
// Movies most similar to Star Wars (1977, movieId 260), restricted to pairs
// that more than 70 users rated together.
val result = {
  val involvesStarWars = $"m1" === 260 || $"m2" === 260
  val enoughRaters     = $"count" > 70
  movieSimilarities.where(enoughRaters && involvesStarWars)
                   .sort($"similarity".desc)
}
result.show(numRows = 10, truncate = false)

                                                                                +---+-----+-----+------------------+
|m1 |m2   |count|similarity        |
+---+-----+-----+------------------+
|260|1196 |203  |0.9896402714789055|
|260|1210 |187  |0.9891531995341264|
|260|1198 |177  |0.9828753537963838|
|260|58559|74   |0.9816392821664428|
|260|4993 |147  |0.9802403864565682|
|260|1610 |73   |0.9771664798038402|
|260|858  |134  |0.9769180232956557|
|260|1221 |91   |0.9768131139614421|
|50 |260  |123  |0.9762391561714363|
|260|8961 |90   |0.9762020399001995|
+---+-----+-----+------------------+
only showing top 10 rows



In [15]:
// Look up and display the titles of the movies most similar to Star Wars (movieId 260).

// Picks the "other" movie of a pair: whichever of the two ids is not 260.
// Written with built-in column functions rather than a UDF so the optimizer
// can see through it; selectId(colA, colB) still returns a Column.
val selectId = {
  import org.apache.spark.sql.Column
  (id1: Column, id2: Column) => when(id1 === 260, id2).otherwise(id1)
}

// A join does not guarantee that the ordering of `result` is preserved, so the
// rows are sorted again before taking the top 10.
(result.withColumn("movieId", selectId(col("m1"), col("m2")))
       .join(movieDf, "movieId")
       .select("title", "similarity", "count")
       .orderBy($"similarity".desc)
       .show(10, false))

+------------------------------------------------------------------------------+------------------+-----+
|title                                                                         |similarity        |count|
+------------------------------------------------------------------------------+------------------+-----+
|Star Wars: Episode V - The Empire Strikes Back (1980)                         |0.9896402714789055|203  |
|Star Wars: Episode VI - Return of the Jedi (1983)                             |0.9891531995341264|187  |
|Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)|0.9828753537963838|177  |
|Dark Knight, The (2008)                                                       |0.9816392821664428|74   |
|Lord of the Rings: The Fellowship of the Ring, The (2001)                     |0.9802403864565682|147  |
|Hunt for Red October, The (1990)                                              |0.9771664798038402|73   |
|Godfather, The (1972)                        

In [16]:
/**
 * Finds movies similar to `mid`, using the genres column as an additional signal.
 *
 * Prints the catalogue entry of `mid` as a side effect. It then returns every movie
 * that was rated together with `mid` by more than `count` users and shares more than
 * `gensim` genres with it. Rows are ordered by the number of shared genres and then
 * by cosine similarity, both descending.
 *
 * @param mid    movieId of the reference movie
 * @param count  exclusive lower bound on the number of users who rated both movies
 * @param gensim exclusive lower bound on the number of genres shared with `mid`
 * @return a DataFrame with the columns title, gensim, similarity and count
 */
def findSimilarMovies(mid: Int, count: Long, gensim: Int): org.apache.spark.sql.DataFrame = {

    println("Find the similar movies of:\n")
    val reference = movieDf.filter($"movieId" === mid)
    reference.show(false)

    // Genres of the reference movie, read as a value rather than by stripping the
    // brackets off Row.toString. An unknown id or a null genres cell gives Nil,
    // so no candidate can share a genre with it.
    val mgen: List[String] = reference.select($"genres").head(1).headOption
        .flatMap(row => Option(row.get(0)))
        .map(_.toString.split('|').toList)
        .getOrElse(Nil)

    // Id of the pair member that is not `mid` (built-in expression instead of a UDF).
    val otherId = when($"m1" === mid, $"m2").otherwise($"m1")

    // Number of genres a candidate shares with the reference movie; a null
    // genres cell counts as zero shared genres.
    val getGenSim = udf((genres: String) =>
        Option(genres).map(_.split('|').toList.intersect(mgen).length).getOrElse(0))

    // Narrow the pair table to the pairs containing `mid` before joining with
    // the catalogue, so the join and the genre UDF only see relevant rows.
    val result = (movieSimilarities
        .filter(($"m1" === mid || $"m2" === mid) && $"count" > count)
        .withColumn("movieId", otherId)
        .join(movieDf, "movieId")
        .withColumn("gensim", getGenSim($"genres"))
        .filter($"gensim" > gensim))

    result.orderBy($"gensim".desc, $"similarity".desc).select("title", "gensim", "similarity", "count")
}

In [17]:
// Star Wars (260): more than 70 common raters and more than 2 shared genres.
findSimilarMovies(mid = 260, count = 70, gensim = 2).show(truncate = false)

Find the similar movies of:

+-------+-----------------------------------------+-----------------------+
|movieId|title                                    |genres                 |
+-------+-----------------------------------------+-----------------------+
|260    |Star Wars: Episode IV - A New Hope (1977)|Action|Adventure|Sci-Fi|
+-------+-----------------------------------------+-----------------------+

+-----------------------------------------------------+------+------------------+-----+
|title                                                |gensim|similarity        |count|
+-----------------------------------------------------+------+------------------+-----+
|Star Wars: Episode V - The Empire Strikes Back (1980)|3     |0.9896402714789055|203  |
|Star Wars: Episode VI - Return of the Jedi (1983)    |3     |0.9891531995341264|187  |
|Aliens (1986)                                        |3     |0.9693337395902628|100  |
|Total Recall (1990)                                  |3     |

In [18]:
// American Pie (2706): more than 30 common raters and more than 1 shared genre.
findSimilarMovies(mid = 2706, count = 30, gensim = 1).show(truncate = false)

Find the similar movies of:

+-------+-------------------+--------------+
|movieId|title              |genres        |
+-------+-------------------+--------------+
|2706   |American Pie (1999)|Comedy|Romance|
+-------+-------------------+--------------+

+----------------------------------------------------+------+------------------+-----+
|title                                               |gensim|similarity        |count|
+----------------------------------------------------+------+------------------+-----+
|True Lies (1994)                                    |2     |0.9677178837684531|50   |
|Clueless (1995)                                     |2     |0.9574978247356769|40   |
|Princess Bride, The (1987)                          |2     |0.9534643777538385|48   |
|Forrest Gump (1994)                                 |2     |0.9531921179264798|85   |
|Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)|2     |0.9523943996049813|42   |
|Shrek 2 (2004)                                  

In [19]:
// Mission: Impossible (648): more than 50 common raters and more than 2 shared genres.
findSimilarMovies(mid = 648, count = 50, gensim = 2).show(truncate = false)

Find the similar movies of:

+-------+--------------------------+---------------------------------+
|movieId|title                     |genres                           |
+-------+--------------------------+---------------------------------+
|648    |Mission: Impossible (1996)|Action|Adventure|Mystery|Thriller|
+-------+--------------------------+---------------------------------+

+------------------------------------+------+------------------+-----+          
|title                               |gensim|similarity        |count|
+------------------------------------+------+------------------+-----+
|Minority Report (2002)              |3     |0.9763691247739893|54   |
|Rock, The (1996)                    |3     |0.975463863491967 |96   |
|Bourne Identity, The (2002)         |3     |0.9754433707092234|53   |
|Total Recall (1990)                 |3     |0.9678625938467216|53   |
|Spider-Man (2002)                   |3     |0.9670889687924121|62   |
|True Lies (1994)                    