In [2]:
#import movie ratings into RDD
ratingLines = sc.textFile("C:/SparkCourse/ml-100k/u.data")
#import user details into RDD
userLines = sc.textFile("///SparkCourse/ml-100k/u.user")
#import movie data into RDD
movieLines = sc.textFile("C:/SparkCourse/ml-100k/u.item")
#import genre data into RDD
genreLines = sc.textFile("C:/SparkCourse/ml-100k/u.genre")

In [3]:
#split on delimiter functions
def splitRatingTab(line):
    line = line.split('\t')
    return (int(line[0]), int(line[1]), int(line[2])) #(movieid, user, rating)
def splitUserPipe(line):
    line = line.split('|')
    return (int(line[0]), line[2], line[3]) #(user, gender, occupation)
def splitMoviePipe(line):
    line = line.split('|')
    return (int(line[0]), line[1]) #(movieid, moviename)
def splitGenrePipe(line):
    line = line.split('|')    
    return (int(line[1]), line[0]) #(genreId, genre)


# Transform to RDD as [(movieid, user, rating)] for movies with rating >=3, which are liked by viewers
ratingRDD = ratingLines.map(lambda line: splitRatingTab(line))
print ('ratingRDD:\n',ratingRDD.top(5))

# Transform to RDD as [(user, gender, occupation)]
userRDD = userLines.map(splitUserPipe)
print ('userRDD:\n',userRDD.top(3))

# Transform to RDD as [(movieid, moviename)]
movieRDD = movieLines.map(splitMoviePipe)
print ('movieRDD:\n',movieRDD.top(3))

# Transform to RDD as [(genreId, genre)]
genreRDD = genreLines.map(lambda line: splitGenrePipe(line))
print ('genreRDD:\n',genreRDD.take(3))

ratingRDD:
 [(943, 1330, 3), (943, 1228, 3), (943, 1188, 3), (943, 1074, 4), (943, 1067, 2)]
userRDD:
 [(943, 'M', 'student'), (942, 'F', 'librarian'), (941, 'M', 'student')]
movieRDD:
 [(1682, 'Scream of Stone (Schrei aus Stein) (1991)'), (1681, 'You So Crazy (1994)'), (1680, 'Sliding Doors (1998)')]
genreRDD:
 [(0, 'unknown'), (1, 'Action'), (2, 'Adventure')]


In [4]:
%%time
#Join the RDDs [(user, rating)] and [(user, gender)] to get RDD [(user, (rating, occupation)]  and PERSIST it
joinRatingUser = ratingRDD.map(lambda line : (line[1], (line[2]))).join(userRDD.map(lambda user: (user[0], user[2]))).persist()
print (joinRatingUser.top(3))

[(943, (5, 'student')), (943, (5, 'student')), (943, (5, 'student'))]
Wall time: 15.9 s


In [5]:
%%time
# Transform to RDD [((Occupation, 1))] 
occupationRatingsRDD = joinRatingUser.map(lambda line: (line[1][1], 1))
print (occupationRatingsRDD.top(3))

# Transform to RDD [((Occupation, Rating), 1)]
occupationRatingTypesRDD = joinRatingUser.map(lambda line: ((line[1][1], line[1][0]), 1))
print (occupationRatingTypesRDD.top(3))

[('writer', 1), ('writer', 1), ('writer', 1)]
[(('writer', 5), 1), (('writer', 5), 1), (('writer', 5), 1)]
Wall time: 8.53 s


In [6]:
%%time

# Transform to RDD [(Gender, totalRatings)]
occupationRatingsTotal = occupationRatingsRDD.reduceByKey(lambda x, y: x + y)
#print (genderRatingsTotal.top(3))

# Transform to RDD [((Gender, Rating), totalGenderRatings)]
calcRatingsTypesTotal = occupationRatingTypesRDD.reduceByKey(lambda x, y: x + y).sortBy(lambda x: x[0])

# Transform to RDD [(Gender, (Rating, totalGenderRatings))]
occupationRatingsTypesTotal = calcRatingsTypesTotal.map(lambda x: (x[0][0], (x[0][1], x[1])))


#Transform RDD to [(gender, ratingType, totalGenderRatings, totalRatings)]
lookupOccupRatingsTotal = occupationRatingsTypesTotal.join(occupationRatingsTotal).map(lambda x: (x[0], x[1][0][0], x[1][0][1], x[1][1]))

#Transform RDD to [(gender, ratingType, %Ratings, totalGenderRatings)]
percentOccupRatings = lookupOccupRatingsTotal.map(lambda x: (x[0], x[1], round(float((x[2]*100)/x[3]),2), x[2]))

print ('occupation \t rating \t percent \t countRatings')
for occupation, rating, percent, totalOccupRatings in percentOccupRatings.collect():
    print (occupation.rjust(15),'\t', rating,'\t\t', percent,'% \t', totalOccupRatings)

occupation 	 rating 	 percent 	 countRatings
       engineer 	 1 		 4.99 % 	 327
       engineer 	 2 		 10.78 % 	 706
       engineer 	 3 		 28.1 % 	 1840
       engineer 	 4 		 36.59 % 	 2396
       engineer 	 5 		 19.53 % 	 1279
         artist 	 1 		 5.4 % 	 151
         artist 	 2 		 11.49 % 	 321
         artist 	 3 		 27.52 % 	 769
         artist 	 4 		 35.08 % 	 980
         artist 	 5 		 20.51 % 	 573
  administrator 	 1 		 4.81 % 	 436
  administrator 	 2 		 10.8 % 	 979
  administrator 	 3 		 25.49 % 	 2310
  administrator 	 4 		 35.79 % 	 3243
  administrator 	 5 		 23.1 % 	 2093
          other 	 1 		 5.41 % 	 453
          other 	 2 		 11.47 % 	 960
          other 	 3 		 27.5 % 	 2301
          other 	 4 		 33.14 % 	 2773
          other 	 5 		 22.47 % 	 1880
      marketing 	 1 		 4.86 % 	 145
      marketing 	 2 		 9.39 % 	 280
      marketing 	 3 		 24.35 % 	 726
      marketing 	 4 		 34.42 % 	 1026
      marketing 	 5 		 26.97 % 	 804
         writer 	 1 		 4.66 % 	