In [1]:
from pyspark.ml.linalg import Vectors
import pyspark.sql.functions as f
from math import sqrt
from numpy.linalg import norm
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number, rank
from operator import add
from pyspark.sql import Row

In [2]:
def cosine(a,b):
    s = 0
    for i,j in zip(a,b):
        s = s + i*j
    return s/(norm(a)*norm(b))

In [3]:
def cosine_map(cross_join_df):
    new_rdd = cross_join_df.rdd.map(lambda x: (cosine(x.feature,x.candidate_feature), x.rating, x.candidate_id))
    return new_rdd 

In [5]:
movies = spark.read.options(delimiter=',', header=True) \
               .csv("data/movies.csv")
movies.show(5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [6]:
movies = movies.withColumn('genres_array', f.split(f.col('genres'), '\|'))\
    .withColumn('genres', f.explode('genres_array')).select('movieId', 'genres', 'title')
movies.show(5)

+-------+---------+----------------+
|movieId|   genres|           title|
+-------+---------+----------------+
|      1|Adventure|Toy Story (1995)|
|      1|Animation|Toy Story (1995)|
|      1| Children|Toy Story (1995)|
|      1|   Comedy|Toy Story (1995)|
|      1|  Fantasy|Toy Story (1995)|
+-------+---------+----------------+
only showing top 5 rows



In [7]:
movies_pivot = movies.groupBy('movieId').pivot('genres').agg(f.count('genres')).drop('(no genres listed)')
movies_pivot.show(5)

+-------+------+---------+---------+--------+------+-----+-----------+-----+-------+---------+------+----+-------+-------+-------+------+--------+----+-------+
|movieId|Action|Adventure|Animation|Children|Comedy|Crime|Documentary|Drama|Fantasy|Film-Noir|Horror|IMAX|Musical|Mystery|Romance|Sci-Fi|Thriller| War|Western|
+-------+------+---------+---------+--------+------+-----+-----------+-----+-------+---------+------+----+-------+-------+-------+------+--------+----+-------+
|   2294|  null|        1|        1|       1|     1| null|       null| null|      1|     null|  null|null|   null|   null|   null|  null|    null|null|   null|
|   2162|  null|        1|     null|       1|  null| null|       null| null|      1|     null|  null|null|   null|   null|   null|  null|    null|null|   null|
|   3210|  null|     null|     null|    null|     1| null|       null|    1|   null|     null|  null|null|   null|   null|      1|  null|    null|null|   null|
|   3959|     1|        1|     null|    

In [8]:
movies_pivot = movies_pivot.na.fill(0)
movies_pivot.show(5)

+-------+------+---------+---------+--------+------+-----+-----------+-----+-------+---------+------+----+-------+-------+-------+------+--------+---+-------+
|movieId|Action|Adventure|Animation|Children|Comedy|Crime|Documentary|Drama|Fantasy|Film-Noir|Horror|IMAX|Musical|Mystery|Romance|Sci-Fi|Thriller|War|Western|
+-------+------+---------+---------+--------+------+-----+-----------+-----+-------+---------+------+----+-------+-------+-------+------+--------+---+-------+
|   2294|     0|        1|        1|       1|     1|    0|          0|    0|      1|        0|     0|   0|      0|      0|      0|     0|       0|  0|      0|
|   2162|     0|        1|        0|       1|     0|    0|          0|    0|      1|        0|     0|   0|      0|      0|      0|     0|       0|  0|      0|
|   3210|     0|        0|        0|       0|     1|    0|          0|    1|      0|        0|     0|   0|      0|      0|      1|     0|       0|  0|      0|
|   3959|     1|        1|        0|       0| 

In [9]:
movies_feature = movies_pivot.select("movieId", f.array(movies_pivot.columns[1:]).alias("feature"))
movies_feature.show(5)

+-------+--------------------+
|movieId|             feature|
+-------+--------------------+
|   2294|[0, 1, 1, 1, 1, 0...|
|   2162|[0, 1, 0, 1, 0, 0...|
|   3210|[0, 0, 0, 0, 1, 0...|
|   3959|[1, 1, 0, 0, 0, 0...|
|    467|[0, 0, 0, 0, 1, 0...|
+-------+--------------------+
only showing top 5 rows



In [10]:
movies_feature = movies_feature.rdd.map(lambda row: Row(
        movieId=row["movieId"], 
        feature=Vectors.dense(row["feature"]),
    )).toDF()
movies_feature.printSchema()


[Stage 34:>                                                         (0 + 1) / 1]

root
 |-- movieId: string (nullable = true)
 |-- feature: vector (nullable = true)




                                                                                

In [11]:
movies_feature.show(5)

+-------+--------------------+
|movieId|             feature|
+-------+--------------------+
|   2294|[0.0,1.0,1.0,1.0,...|
|   2162|[0.0,1.0,0.0,1.0,...|
|   3210|[0.0,0.0,0.0,0.0,...|
|   3959|[1.0,1.0,0.0,0.0,...|
|    467|[0.0,0.0,0.0,0.0,...|
+-------+--------------------+
only showing top 5 rows



In [13]:
ratings = spark.read.options(delimiter=',', header=True) \
               .csv("data/ratings.csv").select('userId', 'movieId', 'rating')
ratings.show(5)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
+------+-------+------+
only showing top 5 rows



In [14]:
df = ratings.join(movies_feature, on='movieId').select('userId', 'movieId', 'feature', 'rating')
df.show(5)

+------+-------+--------------------+------+
|userId|movieId|             feature|rating|
+------+-------+--------------------+------+
|   608|   2294|[0.0,1.0,1.0,1.0,...|   4.0|
|   600|   2294|[0.0,1.0,1.0,1.0,...|   2.5|
|   596|   2294|[0.0,1.0,1.0,1.0,...|   3.0|
|   580|   2294|[0.0,1.0,1.0,1.0,...|   4.0|
|   561|   2294|[0.0,1.0,1.0,1.0,...|   2.0|
+------+-------+--------------------+------+
only showing top 5 rows



In [15]:
def get_user_vectors(df, userId):
    return df.filter(df.userId == userId).select(col("feature"), col("rating"))

In [16]:
def get_not_rating_vectors(df, userId):
    return df.filter(df.userId != userId).select(col("feature").alias("candidate_feature"), col("movieId").alias("candidate_id")).dropDuplicates()

In [17]:
vec = get_user_vectors(df, 1)
vec = vec.withColumn("rating", vec["rating"].cast("float"))
vec.show(5)

+--------------------+------+
|             feature|rating|
+--------------------+------+
|[0.0,0.0,0.0,0.0,...|   3.0|
|[0.0,0.0,0.0,0.0,...|   4.0|
|[1.0,0.0,0.0,0.0,...|   5.0|
|[0.0,0.0,0.0,0.0,...|   4.0|
|[0.0,0.0,1.0,0.0,...|   5.0|
+--------------------+------+
only showing top 5 rows



In [18]:
not_vec = get_not_rating_vectors(df, 1).head(20)


[Stage 52:>                                                         (0 + 1) / 1]

                                                                                

In [19]:
crossjoin_df = vec.crossJoin(spark.createDataFrame(not_vec))

In [20]:
crossjoin_df.show()

+--------------------+------+--------------------+------------+
|             feature|rating|   candidate_feature|candidate_id|
+--------------------+------+--------------------+------------+
|[0.0,0.0,0.0,0.0,...|   3.0|[1.0,0.0,0.0,0.0,...|        4580|
|[0.0,0.0,0.0,0.0,...|   3.0|[1.0,1.0,0.0,0.0,...|        5872|
|[0.0,0.0,0.0,0.0,...|   4.0|[1.0,0.0,0.0,0.0,...|        4580|
|[0.0,0.0,0.0,0.0,...|   4.0|[1.0,1.0,0.0,0.0,...|        5872|
|[1.0,0.0,0.0,0.0,...|   5.0|[1.0,0.0,0.0,0.0,...|        4580|
|[1.0,0.0,0.0,0.0,...|   5.0|[1.0,1.0,0.0,0.0,...|        5872|
|[0.0,0.0,0.0,0.0,...|   4.0|[1.0,0.0,0.0,0.0,...|        4580|
|[0.0,0.0,0.0,0.0,...|   4.0|[1.0,1.0,0.0,0.0,...|        5872|
|[0.0,0.0,1.0,0.0,...|   5.0|[1.0,0.0,0.0,0.0,...|        4580|
|[0.0,0.0,1.0,0.0,...|   5.0|[1.0,1.0,0.0,0.0,...|        5872|
|[0.0,1.0,0.0,1.0,...|   5.0|[1.0,0.0,0.0,0.0,...|        4580|
|[0.0,1.0,0.0,1.0,...|   5.0|[1.0,1.0,0.0,0.0,...|        5872|
|[0.0,1.0,0.0,1.0,...|   5.0|[1.0,0.0,0.

In [21]:
def KNN_ranked_list(cross_join_df, k):
    rdd = cosine_map(cross_join_df)
    
    columns = ["cosine_score", "rating", "candidate_id"]
    
    df1 = rdd.map(lambda x: [float(x[0]),x[1], x[2]]).toDF(columns)
    
    windowDept = Window.partitionBy("candidate_id").orderBy(col("cosine_score").desc(), col("rating").desc())
    
    df2 = df1.withColumn("row",row_number().over(windowDept)).filter(col("row") <= k)
    
    df3 = df2.rdd.map(lambda x: (x.candidate_id, x.cosine_score*x.rating)).reduceByKey(add)
    
    return df3.map(lambda x: (x[0], x[1]/k))

In [22]:
result = KNN_ranked_list(crossjoin_df, 5)



In [23]:
result.foreach(print)

('103228', 4.330127018922194)
('128', 0.0)
('1307', 3.8531972647421804)
('161032', 4.0824829046386295)
('180497', 3.1565965239697253)
('1965', 3.7760883751542678)
('2596', 4.3999999999999995)
('2759', 5.0)
('2835', 4.330127018922194)
('3577', 4.176800626856454)
('4580', 4.0494897427831775)
('4833', 3.3491886513788316)
('5502', 3.053197264742181)
('5872', 5.000000000000001)
('6616', 3.646494472526361)
('72694', 4.121320343559643)
('7320', 3.1112698372208087)
('78160', 5.0)
('79536', 3.535533905932737)
('81537', 5.0)

                                                                                