In [1]:
from math import sqrt
from pyspark.ml.linalg import Vectors
from numpy.linalg import norm
import numpy as np
from pyspark.sql import Row
import pyspark.sql.functions as f
from operator import add

In [2]:
def cosine(a,b):
    """
       return cosine similirity of 2 two vector a and b
    """
    s = 0
    for i,j in zip(a,b):
        s = s + i*j
    return s/(norm(a)*norm(b))

In [3]:
def cosine_weight(a,b):
    c = [ i*b for i in a]
    return c

In [4]:
def sum_weight(a,b):
    c = [i+j for i,j in zip(a,b)]
    return np.array(c)

In [42]:
ratings = spark.read.options(delimiter=',', header=True).csv("/home/thanhhung/Downloads/ratings.csv").select('userId', 'movieId', 'rating')
ratings = ratings.limit(500)
ratings = ratings.withColumn("rating",ratings.rating.cast('float'))
ratings.show(5)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
+------+-------+------+
only showing top 5 rows



In [6]:
# rating_feature = rating_feature.rdd.map(lambda row: Row(userId=row["userId"], feature=Vectors.dense(row["feature"]))).toDF()

In [30]:
class Collaborative():
    
    def __init__(self, userId, rating_df, k):
        self.userId = userId
        self.k = k
        self.rating_df = rating_df
        self.RMSE = 0
        self.ranked_list = None
        
        
    def train(self):
        
        k = self.k
        rating_df = self.rating_df
        userId = self.userId
        
        ratings_pivot = rating_df.groupBy('userId').pivot('movieId').sum('rating')
        ratings_pivot = ratings_pivot.na.fill(0)
        
        rating_feature = ratings_pivot.select("userId", f.array(ratings_pivot.columns[1:]).alias("feature"))
        
        target_user = rating_feature.filter(rating_feature.userId == self.userId)\
                                    .select(f.col("feature").alias("target_feature"))\
                                    .rdd.collect()[0].target_feature
        
        target_vector = sc.broadcast(target_user)
        
        
        cosine_mapped_df = rating_feature\
                        .rdd.map(lambda x: (x.userId, x.feature, float(cosine(x.feature, target_vector.value))))\
                        .toDF(["userId","feature", "cosine_score"])\
                        .orderBy(f.col("cosine_score").desc()).limit(k)
        
        cosine_weight_df = cosine_mapped_df.rdd.map(lambda x: (1, cosine_weight(x.feature, x.cosine_score)))\
                                                .toDF(["key", "cosine_weight"])
        
        
        filled_rating_rdd = cosine_weight_df.rdd.reduceByKey(sum_weight)
        
        count = cosine_weight_df.count()
        
        print(count)
        
        predicted_rating_list = filled_rating_rdd.map(lambda x: x[1]/count).collect()[0]
        
        
        item_list = ratings_pivot.columns[1:]
        
        predicted_rating_df = spark.createDataFrame([(i,float(j)) for i,j in \
                                    zip(item_list, predicted_rating_list)],["movieId", "predicted_rating"])\
                                    .orderBy(f.col("predicted_rating").desc())
        
        
        compare_df = predicted_rating_df.join(rating_df,["movieId"]).filter(f.col("userId") == userId)
#         test_size = int(compare_df.count()/5)
#         compare_df = compare_df.orderBy(f.rand()).limit(test_size)
        
        self.ranked_list = predicted_rating_df.subtract(compare_df.select("movieId", "predicted_rating"))\
                                                .orderBy(f.col("predicted_rating").desc())
        
        squared_error = compare_df.select("predicted_rating", "rating").rdd.map(lambda x: (x[0] - x[1])**2)
        
        self.RMSE =  squared_error.reduce(add)/squared_error.count()
        
        
        return
    
        

In [46]:
a = Collaborative(1,ratings, 3)
a.train()

22/12/12 18:38:47 ERROR CodeGenerator: failed to compile: org.codehaus.janino.InternalCompilerException: Compiling "GeneratedClass" in "generated.java": Code of method "project_doConsume_0$(Lorg/apache/spark/sql/catalyst/expressions/GeneratedClass$GeneratedIteratorForCodegenStage3;Lorg/apache/spark/sql/catalyst/InternalRow;Lorg/apache/spark/sql/catalyst/util/ArrayData;)V" of class "org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3" grows beyond 64 KB
org.codehaus.janino.InternalCompilerException: Compiling "GeneratedClass" in "generated.java": Code of method "project_doConsume_0$(Lorg/apache/spark/sql/catalyst/expressions/GeneratedClass$GeneratedIteratorForCodegenStage3;Lorg/apache/spark/sql/catalyst/InternalRow;Lorg/apache/spark/sql/catalyst/util/ArrayData;)V" of class "org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3" grows beyond 64 KB
	at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:36

22/12/12 18:38:48 ERROR CodeGenerator: failed to compile: org.codehaus.janino.InternalCompilerException: Compiling "GeneratedClass" in "generated.java": Code of method "processNext()V" of class "org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3" grows beyond 64 KB
org.codehaus.janino.InternalCompilerException: Compiling "GeneratedClass" in "generated.java": Code of method "processNext()V" of class "org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3" grows beyond 64 KB
	at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:366)
	at org.codehaus.janino.UnitCompiler.access$000(UnitCompiler.java:226)
	at org.codehaus.janino.UnitCompiler$1.visitCompilationUnit(UnitCompiler.java:336)
	at org.codehaus.janino.UnitCompiler$1.visitCompilationUnit(UnitCompiler.java:333)
	at org.codehaus.janino.Java$CompilationUnit.accept(Java.java:363)
	at org.codehaus.janino.UnitCompiler.compileUnit(UnitCompiler.java:333)
	

3


In [47]:
a.RMSE

8.478740536550642

In [48]:
a.ranked_list.show()

+-------+-------------------+
|movieId|   predicted_rating|
+-------+-------------------+
|    914|0.34498079907169066|
|   1188|0.33502742209764674|
|    538|0.33502742209764674|
|    920|0.33502742209764674|
|    215|0.33502742209764674|
|   1046|0.33502742209764674|
|    492|0.33502742209764674|
|   1203|0.33502742209764674|
|   2186|0.33502742209764674|
|   3851|0.33502742209764674|
|    176|0.33502742209764674|
|    319|0.33502742209764674|
|   3508|0.33502742209764674|
|   1103|0.33502742209764674|
|   2390|0.33502742209764674|
|   2843|0.33502742209764674|
|   1077|0.33502742209764674|
|   1733|0.33502742209764674|
|    930|0.33502742209764674|
|   1086|0.33502742209764674|
+-------+-------------------+
only showing top 20 rows

