### Analysis of Movie Ratings across user Gender
#### using RDD Pertisting

In [1]:
#import movie ratings into RDD
ratingLines = sc.textFile("C:/SparkCourse/ml-100k/u.data")
#import user details into RDD
userLines = sc.textFile("///SparkCourse/ml-100k/u.user")
#import movie data into RDD
movieLines = sc.textFile("C:/SparkCourse/ml-100k/u.item")
#import genre data into RDD
genreLines = sc.textFile("C:/SparkCourse/ml-100k/u.genre")

In [25]:
#split on delimiter functions
def splitRatingTab(line):
    line = line.split('\t')
    return (int(line[0]), int(line[1]), int(line[2])) #(movieid, user, rating)
def splitUserPipe(line):
    line = line.split('|')
    return (int(line[0]), line[2], line[3]) #(user, gender, occupation)
def splitMoviePipe(line):
    line = line.split('|')
    return (int(line[0]), line[1]) #(movieid, moviename)
def splitGenrePipe(line):
    line = line.split('|')    
    return (int(line[1]), line[0]) #(genreId, genre)


# Transform to RDD as [(movieid, user, rating)] for movies with rating >=3, which are liked by viewers
ratingRDD = ratingLines.map(lambda line: splitRatingTab(line))
print ('ratingRDD:\n',ratingRDD.top(5))

# Transform to RDD as [(user, gender, occupation)]
userRDD = userLines.map(splitUserPipe)
print ('userRDD:\n',userRDD.top(3))

# Transform to RDD as [(movieid, moviename)]
movieRDD = movieLines.map(splitMoviePipe)
print ('movieRDD:\n',movieRDD.top(3))

# Transform to RDD as [(genreId, genre)]
genreRDD = genreLines.map(lambda line: splitGenrePipe(line))
print ('genreRDD:\n',genreRDD.take(3))

ratingRDD:
 [(943, 1330, 3), (943, 1228, 3), (943, 1188, 3), (943, 1074, 4), (943, 1067, 2)]
userRDD:
 [(943, 'M', 'student'), (942, 'F', 'librarian'), (941, 'M', 'student')]
movieRDD:
 [(1682, 'Scream of Stone (Schrei aus Stein) (1991)'), (1681, 'You So Crazy (1994)'), (1680, 'Sliding Doors (1998)')]
genreRDD:
 [(0, 'unknown'), (1, 'Action'), (2, 'Adventure')]


### Percentage of different RatingTypes given for Movies, across Genders

In [8]:
%%time
#Join the RDDs [(user, rating)] and [(user, gender)] to get RDD [(user, (rating, gender)]  and PERSIST it
joinRatingUser = ratingRDD.map(lambda line : (line[1], (line[2]))).join(userRDD.map(lambda user: (user[0], user[1]))).persist()
print (joinRatingUser.top(3))

[(943, (5, 'M')), (943, (5, 'M')), (943, (5, 'M'))]
Wall time: 14.3 s


In [9]:
%%time
# Transform to RDD [((Gender, 1))] 
genderRatingsRDD = joinRatingUser.map(lambda line: (line[1][1], 1))
print (genderRatingsRDD.top(3))

# Transform to RDD [((Gender, Rating), 1)]
genderRatingTypesRDD = joinRatingUser.map(lambda line: ((line[1][1], line[1][0]), 1))
print (genderRatingTypesRDD.top(3))

[('M', 1), ('M', 1), ('M', 1)]
[(('M', 5), 1), (('M', 5), 1), (('M', 5), 1)]
Wall time: 8.13 s


In [10]:
%%time

# Transform to RDD [(Gender, totalRatings)]
genderRatingsTotal = genderRatingsRDD.reduceByKey(lambda x, y: x + y)
#print (genderRatingsTotal.top(3))

# Transform to RDD [((Gender, Rating), totalGenderRatings)]
calcRatingsTypesTotal = genderRatingTypesRDD.reduceByKey(lambda x, y: x + y).sortBy(lambda x: x[0])

# Transform to RDD [(Gender, (Rating, totalGenderRatings))]
genderRatingsTypesTotal = calcRatingsTypesTotal.map(lambda x: (x[0][0], (x[0][1], x[1])))
#print (genderRatingsTypesTotal.top(3))

#lkup = genderRatingsTotal.collect()

#Transform RDD to [(gender, ratingType, totalGenderRatings, totalRatings)]
lookupGenderRatingsTotal = genderRatingsTypesTotal.join(genderRatingsTotal).map(lambda x: (x[0], x[1][0][0], x[1][0][1], x[1][1]))

#Transform RDD to [(gender, ratingType, %Ratings, totalGenderRatings)]
percentGenderRatings = lookupGenderRatingsTotal.map(lambda x: (x[0], x[1], round(float((x[2]*100)/x[3]),2), x[2]))
#print (percentGenderRatings.top(3))

print ('gender \t rating \t percent \t countRatings')
for gender, rating, percent, totalGenderRatings in percentGenderRatings.collect():
    print (gender,'\t', rating,'\t\t', percent,'% \t', totalGenderRatings)

gender 	 rating 	 percent 	 countRatings
F 	 1 		 5.42 % 	 1292
F 	 2 		 10.65 % 	 2539
F 	 3 		 27.35 % 	 6523
F 	 4 		 35.15 % 	 8383
F 	 5 		 21.44 % 	 5113
M 	 1 		 5.34 % 	 3565
M 	 2 		 10.75 % 	 7182
M 	 3 		 26.53 % 	 17721
M 	 4 		 34.92 % 	 23325
M 	 5 		 22.46 % 	 15000
Wall time: 42.1 s


### Testing the reduceByKey 

In [11]:
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
cnt = rdd.reduceByKey(lambda x, y: x + y)
print (cnt.top(5))

[('b', 1), ('a', 2)]


In [12]:
# RDD do not have anything similar to SQL window functions
# to compute aggregations, need to group by Key and have to perform some kind of agregation funciton on all the values
rdd1 = sc.parallelize([("a", (1, 5)), ("b", (1, 2)), ("a", (1, 3)), ("b", (1, 4))])
cnt1 = rdd1.reduceByKey(lambda x, y: (x[0] + y[0], max(x[1], y[1])))
print (cnt1.top(5))

[('b', (2, 4)), ('a', (2, 5))]
