In [52]:
import pyspark
from pyspark.sql import SparkSession
import pandas as pd
import matplotlib.pyplot as plt
import random

#### I take first 1000 rows to test my code -- 'limit (1000)'

In [53]:
spark = SparkSession.builder.appName('proj_1').getOrCreate()
ratings = spark.read.csv('ml-20m/ratings.csv', header = True, inferSchema=True).limit(1000)
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [54]:
pd.DataFrame(ratings.take(5), columns=ratings.columns)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [55]:
random.seed(100)

In [60]:
def subsample(movies, n, p):
    """
    The function subsample the list of movies that each user rates, based on following rule:
    (i) If user has rated fewer than n movies, we keep all ratings
    (ii) If user has rated a lot of movies, we keep only p percent of them via random selection
    (iii) If number of movies * p percent is less than n, we randomly sample n movies and keep these ratings
    """
    if len(movies) <= n:
        return movies
    elif int(p * len(movies)) <= n:
        return random.sample(movies, n)
    else:
        return random.sample(movies, int(p * len(movies)))

In [61]:
# collect all movies each user has rated
ratings_rdd = ratings.select(['userId', 'movieId']).rdd.map(list)
users_rated_movies = ratings_rdd.groupByKey().mapValues(list)

In [62]:
N = 5
P = 0.2
# call subsample on rated movies
subsampled_users_rated_movies= users_rated_movies.mapValues(lambda m: subsample(m, N, P))
subsampled_users_rated_movies

PythonRDD[176] at RDD at PythonRDD.scala:53

#### Now restore these ratings back to dataframe

In [63]:
subsampled_ratings = subsampled_users_rated_movies.flatMapValues(lambda x: x)
subsampled_ratings = spark.createDataFrame(subsampled_ratings, ['userId', 'movieId'])

In [11]:
pd.DataFrame(subsampled_ratings.take(5), columns=subsampled_ratings.columns)

Unnamed: 0,userId,movieId
0,1,4911
1,1,2628
2,1,4105
3,1,1240
4,1,1358


#### Check if our subsample methods have removed all ratings for a certain movie

In [13]:
def iszero(c):
    return 1 if c==0 else 0

movie_ratings_count = subsampled_ratings.rdd.map(list).map(lambda (x, y): (y, x)).groupByKey().mapValues(len)
# mark nonzero count as 1, zero counts as 0
movie_ratings_binary = movie_ratings_count.map(lambda t: (iszero(t[1]), 1))
zero_rated_movies = movie_ratings_binary.reduce(lambda t1, t2: (t1[0] + t2[0], t1[1] + t2[1]))
zero_rated_movies_percentage = zero_rated_movies[0] / zero_rated_movies[1]
print('{}% of movies have zero ratings'.format(zero_rated_movies_percentage * 100))

SyntaxError: invalid syntax (<ipython-input-13-545b738bc29c>, line 4)

In this case, we did not remove all ratings for amy movie due to downsampling, at least in first 1000 rows. This indicates that we perhaps can use a stricter downsampling proportion.

#### Join them back to original dataset to get ratings and time stamps, using userId and movieId as key

Persist a sparkRDD so it doesn't keep changing

In [64]:
subsampled_ratings = subsampled_ratings.persist()

In [65]:
train_set = subsampled_ratings.join(ratings, ['userId','movieid'], 'inner')

In [81]:
test_set = ratings.join(subsampled_ratings, ['userId','movieid']
                                   , how = 'leftanti')

In [73]:
subsampled_ratings.take(10)

[Row(userId=1, movieId=253),
 Row(userId=1, movieId=5898),
 Row(userId=1, movieId=2648),
 Row(userId=1, movieId=260),
 Row(userId=1, movieId=4128),
 Row(userId=1, movieId=1200),
 Row(userId=1, movieId=2944),
 Row(userId=1, movieId=1198),
 Row(userId=1, movieId=3889),
 Row(userId=1, movieId=1036)]

Confrim they are working as intended

In [69]:
ratings.where('userId =1').where('movieId =253').collect()

[Row(userId=1, movieId=253, rating=4.0, timestamp=1112484940)]

In [70]:
subsampled_ratings.where('userId =1').where('movieId =253').collect()

[Row(userId=1, movieId=253)]

In [76]:
train_set.where('userId =1').where('movieId =253').collect()

[Row(userId=1, movieId=253, rating=4.0, timestamp=1112484940)]

In [84]:
test_set.where('userId =1').where('movieId =253').collect()

[]

Fit Matrix Factorization Model

In [90]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [86]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(train_set)

In [91]:
predictions = model.transform(test_set)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

In [92]:
rmse

4.399184892382423