In [1]:
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("Movies")
# getOrCreate returns the SparkContext that is already running in this kernel,
# or creates one from `conf` if none exists. Constructing SparkContext(conf=conf)
# directly fails with "Cannot run multiple SparkContexts at once" when this
# cell is executed a second time without restarting the kernel.
sc = SparkContext.getOrCreate(conf)

In [2]:
# Load the three raw CSV files. Despite the df_ prefix these are RDDs of
# text lines (one string per line), not DataFrames.
df_users, df_movies, df_rating = map(
    sc.textFile,
    [
        './moviedata/users.csv',
        './moviedata/movies.csv',
        './moviedata/rating.csv',
    ],
)

In [3]:
#Select the users whose age is in the inclusive range 10..18
#Each CSV line becomes a list of string fields
df_users_columns = df_users.map(lambda line: line.split(","))
#Field 1 is parsed as the age; keep rows with 10 <= age <= 18
possible_userid_columns = df_users_columns.filter(lambda fields: 10 <= int(fields[1]) <= 18)
#Keep only the userid (field 0), paired with the ["epsilon"] marker list
possible_userid = possible_userid_columns.map(lambda fields: (fields[0], ["epsilon"]))

In [4]:
#Number of users whose age is between 10 and 18 (inclusive)
possible_userid.count()

53

In [5]:
#p[7] - children genre
def select_genre(p):
    """Return True when field 7 of the movie row `p` parses to the integer 1.

    `p` is an indexable row (the split fields of one movies.csv line); field 7
    is the children-genre flag. Raises IndexError for rows with fewer than 8
    fields and ValueError when the field is not an integer literal.
    """
    children_flag = int(p[7])
    return children_flag == 1

In [6]:
#Select the movieids whose genre flags include "children"
#Each CSV line becomes a list of string fields
movieid_columns = df_movies.map(lambda line: line.split(","))
#Keep only the rows accepted by select_genre (children flag set)
possible_movieid_columns = movieid_columns.filter(select_genre)
#Keep only the movieid (field 0), paired with the ["epsilon"] marker list
possible_movieid = possible_movieid_columns.map(lambda fields: (fields[0], ["epsilon"]))

In [7]:
#Sample - first 5 (movieid, ["epsilon"]) pairs of the children movies
possible_movieid.take(5)

[(u'304', ['epsilon']),
 (u'308', ['epsilon']),
 (u'314', ['epsilon']),
 (u'374', ['epsilon']),
 (u'377', ['epsilon'])]

In [8]:
#Divide string in columns
df_rating_columns = df_rating.map(lambda x: x.split(","))
#RDD with userid as key, and a one-element list [movieid] as value
#(list values let reduceByKey concatenate them per user later on)
userid_movieid = df_rating_columns.map(lambda x: (x[0], [x[1]]))
#RDD with movieid as key, and userid as value.
#x[1] is the one-element list [movieid]; unwrap it with x[1][0] so the key is
#the plain movieid string. Swapping the pair as-is would produce a list key,
#which is unhashable and cannot be used by key-based operations such as
#reduceByKey or join.
movieid_userid = userid_movieid.map(lambda x: (x[1][0], x[0]))

In [9]:
#Sample - first 5 (userid, [movieid]) pairs built from the ratings
userid_movieid.take(5)

[(u'253', [u'97']),
 (u'284', [u'269']),
 (u'106', [u'526']),
 (u'121', [u'180']),
 (u'62', [u'86'])]

In [10]:
#Put the (userid, [movieid]) rating records and the (userid, ["epsilon"])
#marker records of the 10-18 year old users into one RDD, so that grouping by
#userid yields a list containing "epsilon" exactly for those users
combined_rdd = userid_movieid.union(possible_userid)

In [11]:
#Sample - first 5 records of the union of rating records and user markers
combined_rdd.take(5)

[(u'253', [u'97']),
 (u'284', [u'269']),
 (u'106', [u'526']),
 (u'121', [u'180']),
 (u'62', [u'86'])]

In [12]:
def find_epsilon(p):
    """Return True when the values of the (key, values) pair `p` contain "epsilon".

    `p[1]` is iterated element by element and each element is compared with
    the marker string "epsilon"; an empty `p[1]` yields False.
    """
    return any(item == "epsilon" for item in p[1])

In [13]:
#g - Concatenate all value lists per userID, then keep only the users whose
#    list contains "epsilon", i.e. the users of age between 10 and 18
filtered_combined_rdd = (
    combined_rdd
    .reduceByKey(lambda left, right: left + right)
    .filter(find_epsilon)
)

In [14]:
#h - Ratings made by the users between 10 and 18, inverted to
#h - (MovieID, [UserID]) pairs; the "epsilon" marker itself is dropped
filtered_movieid_userid = filtered_combined_rdd.flatMap(
    lambda user_entry: [
        (movie_id, [user_entry[0]])
        for movie_id in user_entry[1]
        if movie_id != "epsilon"
    ]
)

In [15]:
#Sample - first 5 (movieid, [userid]) pairs rated by users aged 10-18
filtered_movieid_userid.take(5)

[(u'252', [u'674']),
 (u'763', [u'674']),
 (u'300', [u'674']),
 (u'118', [u'674']),
 (u'50', [u'674'])]

In [16]:
#Put the (movieid, ["epsilon"]) markers of the children movies and the
#(movieid, [userid]) rating records of the 10-18 year old users into one RDD,
#so that grouping by movieid yields a list containing "epsilon" exactly for
#the children movies
combined_rdd2 = possible_movieid.union(filtered_movieid_userid)

In [17]:
#Sample - first 5 records of the union of movie markers and rating records
combined_rdd2.take(5)

[(u'304', ['epsilon']),
 (u'308', ['epsilon']),
 (u'314', ['epsilon']),
 (u'374', ['epsilon']),
 (u'377', ['epsilon'])]

In [18]:
#Concatenate all value lists per movieid, then keep only the movies whose
#list contains "epsilon", i.e. the children movies
filtered_combined_rdd2 = (
    combined_rdd2
    .reduceByKey(lambda left, right: left + right)
    .filter(find_epsilon)
)

In [19]:
#Sample - first 2 children movies with "epsilon" plus the userids that rated them
filtered_combined_rdd2.take(2)

[(u'812', ['epsilon', u'642']),
 (u'261', ['epsilon', u'397', u'451', u'851', u'592', u'761'])]

In [20]:
#Invert `filtered_combined_rdd2` back to (UserID, [MovieID, ...]): the
#"epsilon" marker is dropped and the movie lists are concatenated per user
filtered_userid_movieid = (
    filtered_combined_rdd2
    .flatMap(
        lambda movie_entry: [
            (user_id, [movie_entry[0]])
            for user_id in movie_entry[1]
            if user_id != "epsilon"
        ]
    )
    .reduceByKey(lambda left, right: left + right)
)
#Getting only users (the keys of the pairs)
filtered_userid = filtered_userid_movieid.keys()


In [21]:
#Final result: userids of the users aged 10-18 who rated at least one
#children movie (one entry per user, since the pairs were reduced by key)
filtered_userid.collect()

[u'632',
 u'142',
 u'849',
 u'289',
 u'588',
 u'179',
 u'434',
 u'101',
 u'851',
 u'642',
 u'451',
 u'761',
 u'52',
 u'347',
 u'206',
 u'67',
 u'507',
 u'813',
 u'880',
 u'482',
 u'471',
 u'621',
 u'887',
 u'461',
 u'57',
 u'609',
 u'618',
 u'270',
 u'281',
 u'582',
 u'528',
 u'620',
 u'787',
 u'453',
 u'859',
 u'592',
 u'341',
 u'674',
 u'700',
 u'863',
 u'397',
 u'580',
 u'628',
 u'257',
 u'550',
 u'646']