In [1]:
import pyspark
from pyspark.sql import SparkSession
import pandas as pd
import matplotlib.pyplot as plt
import random

#### I take first 1000 rows to test my code -- 'limit (1000)'

In [2]:
spark = SparkSession.builder.appName('proj_1').getOrCreate()
ratings = spark.read.csv('ml-20m/ratings.csv', header = True, inferSchema=True).limit(1000)
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [3]:
pd.DataFrame(ratings.take(5), columns=ratings.columns)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [4]:
random.seed(100)

In [5]:
def subsample(movies, n, p):
    """
    The function subsample the list of movies that each user rates, based on following rule:
    (i) If user has rated fewer than n movies, we keep all ratings
    (ii) If user has rated a lot of movies, we keep only p percent of them via random selection
    (iii) If number of movies * p percent is less than n, we randomly sample n movies and keep these ratings
    """
    if len(movies) <= n:
        return movies
    elif int(p * len(movies)) <= n:
        return random.sample(movies, n)
    else:
        return random.sample(movies, int(p * len(movies)))

In [6]:
# collect all movies each user has rated
ratings_rdd = ratings.select(['userId', 'movieId']).rdd.map(list)
users_rated_movies = ratings_rdd.groupByKey().mapValues(list)

In [7]:
N = 5
P = 0.2
# call subsample on rated movies
subsampled_users_rated_movies= users_rated_movies.mapValues(lambda m: subsample(m, N, P))
subsampled_users_rated_movies

PythonRDD[25] at RDD at PythonRDD.scala:53

#### Now restore these ratings back to dataframe

In [8]:
subsampled_ratings = subsampled_users_rated_movies.flatMapValues(lambda x: x)
subsampled_ratings = spark.createDataFrame(subsampled_ratings, ['userId', 'movieId'])

In [9]:
pd.DataFrame(subsampled_ratings.take(5), columns=subsampled_ratings.columns)

Unnamed: 0,userId,movieId
0,1,2174
1,1,3932
2,1,4896
3,1,112
4,1,919


#### Check if our subsample methods have removed all ratings for a certain movie

In [13]:
def iszero(c):
    return 1 if c==0 else 0

movie_ratings_count = subsampled_ratings.rdd.map(list).map(lambda (x, y): (y, x)).groupByKey().mapValues(len)
# mark nonzero count as 1, zero counts as 0
movie_ratings_binary = movie_ratings_count.map(lambda t: (iszero(t[1]), 1))
zero_rated_movies = movie_ratings_binary.reduce(lambda t1, t2: (t1[0] + t2[0], t1[1] + t2[1]))
zero_rated_movies_percentage = zero_rated_movies[0] / zero_rated_movies[1]
print('{}% of movies have zero ratings'.format(zero_rated_movies_percentage * 100))

SyntaxError: invalid syntax (<ipython-input-13-545b738bc29c>, line 4)

In this case, we did not remove all ratings for amy movie due to downsampling, at least in first 1000 rows. This indicates that we perhaps can use a stricter downsampling proportion.

### Answer to 
As data scientists of a digital media company, state your objectives in building a
recommendation system. For example, what metrics do you care about, who is this system
built to serve (users or your boss?), and what business rules may you care to introduce?

As a data scientist, you want to create a solution that serves the users but as well as can satisify your coworkers such as your boss. Having key stakeholders bought into your idea is integral in adding value to the company. So here we list out a few metrics and ideas that we can focus on and also point out who they serve.

Users: <br/>
We want our users to feel engaged with our content. That means we want to push out recommendations that they can relate to and enjoy. Within the context of our project, we can measure accuracy of our model such as RMSE. However, there can be some other things we can measure as well that's a bit out of scope of this project such as serendipity of our recommendations and implicit feedbacks such as how long they listen or watch our recommendations even if they don't explicitly rate them.
    
Stakeholders: <br/>
Stakeholders want to make sure that the solutions that we recommend are indeed better than what they can do or have done in the past. They also want to understand how this system is affecting the product that they own. This means that measurements such as accuracy is also useful for stakeholders, but we can additionally focus on model methods that help them interpret the model's output. This means using simpler methods such as KNN or exploring Matrix Factorization methods to see if there can be any patterns in a reduced dimension space that can make sense to human rationality. 

#### Join them back to original dataset to get ratings and time stamps, using userId and movieId as key

Persist a sparkRDD so it doesn't keep changing

In [10]:
subsampled_ratings = subsampled_ratings.persist()

In [11]:
train_set = subsampled_ratings.join(ratings, ['userId','movieid'], 'inner')

In [12]:
test_set = ratings.join(subsampled_ratings, ['userId','movieid']
                                   , how = 'leftanti')

In [13]:
subsampled_ratings.take(10)

[Row(userId=1, movieId=2291),
 Row(userId=1, movieId=2253),
 Row(userId=1, movieId=5146),
 Row(userId=1, movieId=1304),
 Row(userId=1, movieId=6754),
 Row(userId=1, movieId=4226),
 Row(userId=1, movieId=1258),
 Row(userId=1, movieId=1387),
 Row(userId=1, movieId=112),
 Row(userId=1, movieId=4467)]

Confrim they are working as intended

In [14]:
ratings.where('userId =1').where('movieId =253').collect()

[Row(userId=1, movieId=253, rating=4.0, timestamp=1112484940)]

In [15]:
subsampled_ratings.where('userId =1').where('movieId =253').collect()

[]

In [16]:
train_set.where('userId =1').where('movieId =253').collect()

[]

In [17]:
test_set.where('userId =1').where('movieId =253').collect()

[Row(userId=1, movieId=253, rating=4.0, timestamp=1112484940)]

Fit Matrix Factorization Model

In [27]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics

In [19]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(train_set)

In [20]:
predictions = model.transform(test_set)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

In [54]:
rmse

4.664841914751033

### Evaluation Set-up

In [49]:
predictionsData = predictions.rdd.map(lambda r: ((r.userId, r.movieId), r.prediction))

In [50]:
ratingsTuple = test_set.rdd.map(lambda r: ((r.userId, r.movieId), r.rating))
scoreAndLabels = predictionsData.join(ratingsTuple).map(lambda tup: tup[1])

In [51]:
metrics = RegressionMetrics(scoreAndLabels)

In [55]:
ranking_metrics =  RankingMetrics(scoreAndLabels)

In [52]:
metrics.r2

-32.46977413804647

In [53]:
metrics.rootMeanSquaredError

4.664841914751033

Cross Validation Set Up

In [None]:
evaluation = [] 
for train_set, test_set in data_generator: 
    als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
    model = als.fit(train_set)
    predictions = model.transform(test_set)
    rmse = evaluator.evaluate(predictions)
    predictionsData = predictions.rdd.map(lambda r: ((r.userId, r.movieId), r.prediction))
    ratingsTuple = test_set.rdd.map(lambda r: ((r.userId, r.movieId), r.rating))
    scoreAndLabels = predictionsData.join(ratingsTuple).map(lambda tup: tup[1])
    metrics = RegressionMetrics(scoreAndLabels)
    evaluation.append({'rmse': metrics.rootMeanSquaredError, 'r2': metrics.r2})

HyperParameter Set up

In [139]:
grid = {'maxIter':[5,10], 'regParam': [1,2]}

In [143]:
param_vals = []
for key,val in grid.items():
    param_vals.append(val)

In [147]:
for i in itertools.product(*param_vals):
    print(i)

(5, 1)
(5, 2)
(10, 1)
(10, 2)


In [149]:
grid

{'maxIter': [5, 10], 'regParam': [1, 2]}

In [154]:
inputs

{'maxIter': 10, 'regParam': 2}

In [None]:
final_results = dict()
for i in itertools.product(*param_vals):
    evaluation = [] 
    for train_set, test_set in data_generator: 
        inputs = dict()
        for j,(key,val) in enumerate(grid.items()):
            inputs[key] = i[j]
        als = ALS(**inputs)
        model = als.fit(train_set)
        predictions = model.transform(test_set)
        rmse = evaluator.evaluate(predictions)
        predictionsData = predictions.rdd.map(lambda r: ((r.userId, r.movieId), r.prediction))
        ratingsTuple = test_set.rdd.map(lambda r: ((r.userId, r.movieId), r.rating))
        scoreAndLabels = predictionsData.join(ratingsTuple).map(lambda tup: tup[1])
        metrics = RegressionMetrics(scoreAndLabels)
        evaluation.append({'rmse': metrics.rootMeanSquaredError, 'r2': metrics.r2})
    final_results[i] = evaluation    