In [1]:
import findspark

In [2]:
findspark.init()

In [3]:
findspark.find()

'C:\\apps\\spark-3.0.1-bin-hadoop2.7'

In [4]:
import pyspark
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession

In [5]:
conf = pyspark.SparkConf().setAppName("Movie recommender").setMaster("local[*]")
conf.set("spark.driver.memory","2g")
conf.set("spark.executor.memory","2g")
sc = pyspark.SparkContext(conf = conf)
spark = SparkSession(sc)

### lets load movie data

In [6]:
movies_raw_data = spark.read.option("header","true").csv("file:///C:/mystuff/courses/recommendation_engine/data/ml-latest-small/movies.csv")

In [7]:
movies_raw_data.show(5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



### Convert string genres to mathematical format or numbers

In [8]:
sample = [0] * 19
genres_dict = {'unknown': 0, 'Action': 1, 'Adventure': 2, 'Animation': 3, 'Children': 4, 'Comedy': 5, 'Crime': 6, 'Documentary': 7, 'Drama': 8, 'Fantasy': 9, 'FilmNoir': 10, 'Horror': 11, 'Musical': 12, 'Mystery': 13, 'Romance': 14, 'Sci-Fi': 15, 'Thriller': 16, 'War': 17, 'Western': 18}
print(genres_dict)
print(sample)

{'unknown': 0, 'Action': 1, 'Adventure': 2, 'Animation': 3, 'Children': 4, 'Comedy': 5, 'Crime': 6, 'Documentary': 7, 'Drama': 8, 'Fantasy': 9, 'FilmNoir': 10, 'Horror': 11, 'Musical': 12, 'Mystery': 13, 'Romance': 14, 'Sci-Fi': 15, 'Thriller': 16, 'War': 17, 'Western': 18}
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### function will convert string genres to numbers format

In [9]:
def fetchGenVals(m_genres):
    genres_vals = [0]*19 
    genres_lst = m_genres.split('|')
    for v in genres_lst:
        if v in genres_dict.keys():
            genres_vals[int(genres_dict[v])] = 1
        else:
            genres_vals[0] = 1        
    return genres_vals

In [10]:
print(fetchGenVals('Horror|Musical'))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]


In [11]:
movies_data = movies_raw_data.rdd.map(lambda x : (x[0],x[1],x[2],fetchGenVals(x[2]))).toDF(['movie_id','title','genres','genre_vals'])

In [12]:
movies_data.show(5)

+--------+--------------------+--------------------+--------------------+
|movie_id|               title|              genres|          genre_vals|
+--------+--------------------+--------------------+--------------------+
|       1|    Toy Story (1995)|Adventure|Animati...|[0, 0, 1, 1, 1, 1...|
|       2|      Jumanji (1995)|Adventure|Childre...|[0, 0, 1, 0, 1, 0...|
|       3|Grumpier Old Men ...|      Comedy|Romance|[0, 0, 0, 0, 0, 1...|
|       4|Waiting to Exhale...|Comedy|Drama|Romance|[0, 0, 0, 0, 0, 1...|
|       5|Father of the Bri...|              Comedy|[0, 0, 0, 0, 0, 1...|
+--------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [13]:
import math
from scipy.spatial import distance

def cosine_genre(genre1, genre2):
    return str(distance.cosine(genre1, genre2))

## we will do a join of toy story with other movies in this dataset
## lets use sql for simplicity

In [14]:
movies_data.createOrReplaceTempView("movies")
toy_story_df = spark.sql("select * from movies where movie_id = 1")
other_movies_df = spark.sql("select * from movies where movie_id != 1")

In [15]:
toy_story_df.createOrReplaceTempView("toy_story")
other_movies_df.createOrReplaceTempView("all_other_movies")

In [16]:
combined_data_ds = spark.sql("select m1.movie_id,m1.title,m1.genre_vals,m2.movie_id,m2.title,m2.genre_vals from toy_story m1, all_other_movies m2 where m1.movie_id != m2.movie_id")

In [17]:
combined_data_ds.show(5)

+--------+----------------+--------------------+--------+--------------------+--------------------+
|movie_id|           title|          genre_vals|movie_id|               title|          genre_vals|
+--------+----------------+--------------------+--------+--------------------+--------------------+
|       1|Toy Story (1995)|[0, 0, 1, 1, 1, 1...|       2|      Jumanji (1995)|[0, 0, 1, 0, 1, 0...|
|       1|Toy Story (1995)|[0, 0, 1, 1, 1, 1...|       3|Grumpier Old Men ...|[0, 0, 0, 0, 0, 1...|
|       1|Toy Story (1995)|[0, 0, 1, 1, 1, 1...|       4|Waiting to Exhale...|[0, 0, 0, 0, 0, 1...|
|       1|Toy Story (1995)|[0, 0, 1, 1, 1, 1...|       5|Father of the Bri...|[0, 0, 0, 0, 0, 1...|
|       1|Toy Story (1995)|[0, 0, 1, 1, 1, 1...|       6|         Heat (1995)|[0, 1, 0, 0, 0, 0...|
+--------+----------------+--------------------+--------+--------------------+--------------------+
only showing top 5 rows



### lets do the similarity comparison

In [18]:
result = combined_data_ds.rdd.map(lambda x : (x[0],x[1],x[3],x[4],cosine_genre(x[2],x[5]))).toDF(['id1','movie1','id2','movie2','similarity'])

In [19]:
result.show(5)

+---+----------------+---+--------------------+------------------+
|id1|          movie1|id2|              movie2|        similarity|
+---+----------------+---+--------------------+------------------+
|  1|Toy Story (1995)|  2|      Jumanji (1995)|0.2254033307585167|
|  1|Toy Story (1995)|  3|Grumpier Old Men ...| 0.683772233983162|
|  1|Toy Story (1995)|  4|Waiting to Exhale...|0.7418011102528389|
|  1|Toy Story (1995)|  5|Father of the Bri...|0.5527864045000421|
|  1|Toy Story (1995)|  6|         Heat (1995)|               1.0|
+---+----------------+---+--------------------+------------------+
only showing top 5 rows



In [20]:
result.filter(result['similarity'] < 0.01).orderBy('similarity', ascending = True).show(10)

+---+----------------+------+--------------------+----------+
|id1|          movie1|   id2|              movie2|similarity|
+---+----------------+------+--------------------+----------+
|  1|Toy Story (1995)|  2294|         Antz (1998)|       0.0|
|  1|Toy Story (1995)|  3114|  Toy Story 2 (1999)|       0.0|
|  1|Toy Story (1995)|  3754|Adventures of Roc...|       0.0|
|  1|Toy Story (1995)|  4016|Emperor's New Gro...|       0.0|
|  1|Toy Story (1995)|  4886|Monsters, Inc. (2...|       0.0|
|  1|Toy Story (1995)| 45074|    Wild, The (2006)|       0.0|
|  1|Toy Story (1995)| 53121|Shrek the Third (...|       0.0|
|  1|Toy Story (1995)| 65577|Tale of Despereau...|       0.0|
|  1|Toy Story (1995)| 91355|Asterix and the V...|       0.0|
|  1|Toy Story (1995)|103755|        Turbo (2013)|       0.0|
+---+----------------+------+--------------------+----------+
only showing top 10 rows

