In [43]:
import pyspark
from pyspark.sql import SparkSession
import pandas as pd
import matplotlib.pyplot as plt
import random

#### I take first 1000 rows to test my code -- 'limit (1000)'

In [44]:
spark = SparkSession.builder.appName('proj_1').getOrCreate()
ratings = spark.read.csv('gs://moviercommendation/ml-latest/ratings.csv', header = True, inferSchema=True).limit(1000)
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [45]:
pd.DataFrame(ratings.take(5), columns=ratings.columns)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [72]:
random.seed(100)

In [56]:
def subsample(movies, n, p):
    """
    The function subsample the list of movies that each user rates, based on following rule:
    (i) If user has rated fewer than n movies, we keep all ratings
    (ii) If user has rated a lot of movies, we keep only p percent of them via random selection
    (iii) If number of movies * p percent is less than n, we randomly sample n movies and keep these ratings
    """
    if len(movies) <= n:
        return movies
    elif int(p * len(movies)) <= n:
        return random.sample(movies, n)
    else:
        return random.sample(movies, int(p * len(movies)))

In [None]:
# collect all movies each user has rated
ratings_rdd = ratings.select(['userId', 'movieId']).rdd.map(list)
users_rated_movies = ratings_rdd.groupByKey().mapValues(list)

In [57]:
N = 5
P = 0.2
# call subsample on rated movies
subsampled_users_rated_movies= users_rated_movies.mapValues(lambda m: subsample(m, N, P))
subsampled_users_rated_movies

PythonRDD[129] at RDD at PythonRDD.scala:52

In [59]:
subsampled_users_rated_movies.collect()

[(1, [1591, 2840, 1590, 2986, 1091]),
 (2, [2746, 170, 1296, 3363, 1962]),
 (3, [2024, 2028, 3171, 1985, 828]),
 (4,
  [7153,
   2054,
   33437,
   2542,
   2949,
   3868,
   1445,
   474,
   4776,
   33004,
   2683,
   1376,
   43558,
   3889,
   1015,
   1,
   3793,
   361,
   3298,
   3052,
   31878,
   45,
   1396,
   4719,
   7143,
   3578,
   110,
   4844,
   514,
   818,
   5464,
   1760,
   2311,
   3740,
   1968,
   4104,
   5438,
   7569,
   292,
   53000,
   3825,
   1207,
   1126,
   2762,
   1769,
   2004,
   1616,
   5954,
   5378,
   3316,
   2541,
   2478,
   2058,
   2889,
   2770,
   1500,
   5418,
   4226,
   382,
   253,
   204,
   4034,
   2600,
   2947,
   3247,
   1882,
   39231,
   1805,
   8464,
   5563,
   34319,
   4621,
   2000,
   405,
   1653,
   5903,
   344,
   2006,
   6377,
   1265,
   5459,
   4011,
   198,
   5945,
   1393,
   8861,
   1615,
   1375,
   5049,
   6550,
   1370,
   480,
   44759,
   42721,
   2193,
   454,
   7348,
   6016,
   1606,
  

#### Now restore these ratings back to dataframe

In [63]:
subsampled_ratings = subsampled_users_rated_movies.flatMapValues(lambda x: x)
subsampled_ratings = spark.createDataFrame(subsampled_ratings, ['userId', 'movieId'])

In [64]:
pd.DataFrame(subsampled_ratings.take(5), columns=subsampled_ratings.columns)

Unnamed: 0,userId,movieId
0,1,3020
1,1,3826
2,1,1449
3,1,1257
4,1,2840


#### Check if our subsample methods have removed all ratings for a certain movie

In [71]:
def iszero(c):
    return 1 if c==0 else 0

movie_ratings_count = subsampled_ratings.rdd.map(list).map(lambda (x, y): (y, x)).groupByKey().mapValues(len)
# mark nonzero count as 1, zero counts as 0
movie_ratings_binary = movie_ratings_count.map(lambda t: (iszero(t[1]), 1))
zero_rated_movies = movie_ratings_binary.reduce(lambda t1, t2: (t1[0] + t2[0], t1[1] + t2[1]))
zero_rated_movies_percentage = zero_rated_movies[0] / zero_rated_movies[1]
print('{}% of movies have zero ratings'.format(zero_rated_movies_percentage * 100))

0% of movies have zero ratings


In this case, we did not remove all ratings for amy movie due to downsampling, at least in first 1000 rows. This indicates that we perhaps can use a stricter downsampling proportion.

#### Join them back to original dataset to get ratings and time stamps, using userId and movieId as key