# Movie Recommender System
Use the users' existing movie-rating history to predict which movies to recommend to each user. This solution uses the item-based collaborative filtering algorithm. A more accurate model would also incorporate other information about each movie, such as movie type, release year, and actors, and would be combined with the user-based collaborative filtering method.

In [1]:
from pyspark import SparkContext, SparkConf
# Reuse the already-running context when this cell is re-executed; a second
# bare SparkContext() in the same process raises a ValueError.
sc = SparkContext.getOrCreate()

In [29]:
# Read the raw ratings file; every RDD element is one line of text.
input_text_RDD = sc.textFile('RecommenderSystem/userRating.txt')
input_text_RDD.take(5)
# format (one rating per line, comma-separated):
# uid,movie_id,rating

[u'1,10001,5.0',
 u'1,10002,3.0',
 u'1,10003,2.5',
 u'2,10001,2.0',
 u'2,10002,2.5']

In [30]:
# key each rating by user (records are not grouped yet)
def split_input_line(line):
    """Parse one ``uid,movie_id,rating`` text line into a user-keyed record.

    Returns ``(uid, (movie_id, rating))`` with integer ids and a float rating.
    Raises ``ValueError`` when the line does not hold exactly three
    comma-separated fields or a field cannot be converted.
    """
    fields = line.strip().split(',')
    raw_uid, raw_movie, raw_rating = fields
    user = int(raw_uid)
    movie = int(raw_movie)
    score = float(raw_rating)
    return (user, (movie, score))

# Parse every text line into a record keyed by user id.
user_ratings_RDD = input_text_RDD.map(split_input_line)
user_ratings_RDD.take(5)
# format: <uid, (movie, rate)>  -- one record per rating, not yet grouped per user

[(1, (10001, 5.0)),
 (1, (10002, 3.0)),
 (1, (10003, 2.5)),
 (2, (10001, 2.0)),
 (2, (10002, 2.5))]

In [31]:
# build co-occurrence counts
def create_co_occur_pairs(entry):
    """Emit a count of 1 for every ordered pair of movies one user rated.

    Args:
        entry: ``(uid, movie_pairs)`` where ``movie_pairs`` is an iterable of
            tuples whose first field is the movie id.

    Returns:
        A list of ``((movie_a, movie_b), 1)`` covering the full cross product
        of the user's movies, in row-major order.
    """
    _, movie_pairs = entry
    movie_list = [pair[0] for pair in movie_pairs]
    # Note: the pair AND the reversed pair are counted intentionally, and so
    # is each movie paired with itself.
    # A nested comprehension replaces the ``xrange`` index loops, which only
    # exist on Python 2; the emitted order is unchanged.
    return [((first, second), 1) for first in movie_list for second in movie_list]

# Collect each user's ratings, expand them into movie pairs, then add up the
# per-pair 1s across all users.
co_occur_RDD = (
    user_ratings_RDD
    .groupByKey()
    .flatMap(create_co_occur_pairs)
    .reduceByKey(lambda left, right: left + right)
)
co_occur_RDD.take(5)
# format: <(movieA, movieB), count>

[((10002, 10006), 1),
 ((10005, 10005), 2),
 ((10004, 10004), 4),
 ((10003, 10003), 4),
 ((10006, 10004), 2)]

In [32]:
# normalize by row
def pair_to_transition(pair_count):
    """Re-key a ``((movie_a, movie_b), count)`` record by its first movie.

    Returns ``(movie_a, (movie_b, count))``.
    """
    movies, occurrences = pair_count
    source = movies[0]
    target = movies[1]
    return (source, (target, occurrences))

def normalize_transitions(from_tos):
    """Convert one movie's co-occurrence counts into ratios of the row total.

    Args:
        from_tos: ``(from_movie, to_movies)`` where ``to_movies`` is an
            iterable of ``(to_movie, count)`` tuples.

    Returns:
        A list of ``(to_movie, (from_movie, ratio))`` where ``ratio`` is
        ``count`` divided by the sum of all counts in ``to_movies``. An empty
        ``to_movies`` gives an empty list.
    """
    from_movie, to_movies = from_tos
    # Materialize once: the row is traversed twice (total, then ratios), and a
    # one-shot iterator would otherwise be exhausted before the second pass.
    row = list(to_movies)
    # ``total`` rather than ``sum`` so the builtin is not shadowed.
    total = sum(item[1] for item in row)
    return [(item[0], (from_movie, float(item[1]) / total)) for item in row]

# Re-key the pair counts by their first movie, gather each movie's row, and
# scale the row's counts into ratios.
norm_co_occur_RDD = (
    co_occur_RDD
    .map(pair_to_transition)
    .groupByKey()
    .flatMap(normalize_transitions)
)
norm_co_occur_RDD.take(5)
# format: <to_movie, (from_movie, ratio)>

[(10006, (10002, 0.07692307692307693)),
 (10004, (10002, 0.15384615384615385)),
 (10002, (10002, 0.23076923076923078)),
 (10003, (10002, 0.23076923076923078)),
 (10001, (10002, 0.23076923076923078))]

In [33]:
# multiplication: predicted = co_occur * existing  (for each user)
def split_line_to_movie_entry(line):
    """Parse one ``uid,movie_id,rating`` text line into a movie-keyed record.

    Returns ``(movie_id, (uid, rating, 'r'))``; the trailing ``'r'`` tags the
    value as a given rating.
    """
    parts = line.strip().split(',')
    raw_uid, raw_movie, raw_rating = parts
    movie = int(raw_movie)
    tagged_rating = (int(raw_uid), float(raw_rating), 'r')
    return (movie, tagged_rating)

# Key the given ratings by movie. The 'r' tag makes each value a 3-tuple, so
# unit_mult can tell it apart from the 2-tuple co-occurrence entries by length.
movie_rating_RDD = input_text_RDD.map(split_line_to_movie_entry)
#movie_rating_RDD.take(5)
# format: <movie_id : (uid, rating, 'r')>

def unit_mult(movie_info):
    """Multiply one movie's user ratings by that movie's co-occurrence ratios.

    Args:
        movie_info: ``(movie_id, info_list)`` where ``info_list`` mixes
            2-tuples ``(from_movie, ratio)`` with longer tuples whose first
            two fields are ``(uid, rating)``.

    Returns:
        A list of ``((uid, from_movie), rating * ratio)`` with one entry for
        every combination of a user rating and a co-occurrence entry. The list
        is empty if either kind of record is missing.
    """
    _, info_list = movie_info
    co_occur = {}
    user_rating = {}
    for info in info_list:
        # The two record kinds are told apart purely by tuple length.
        if len(info) == 2:     # co-occur
            co_occur[info[0]] = info[1]
        else:      # user rating
            user_rating[info[0]] = info[1]

    # ``items()`` works on both Python 2 and 3; ``iteritems()`` is Python 2 only.
    return [
        ((user, from_movie), rating * ratio)
        for user, rating in user_rating.items()
        for from_movie, ratio in co_occur.items()
    ]

def multiply(existing_rating, co_occur):
    """Combine movie-keyed ratings with movie-keyed co-occurrence ratios.

    Both inputs are unioned, grouped by key, expanded with ``unit_mult`` and
    the resulting values are summed per key.
    """
    combined = existing_rating.union(co_occur)
    grouped_by_movie = combined.groupByKey()
    partial_products = grouped_by_movie.flatMap(unit_mult)
    return partial_products.reduceByKey(lambda left, right: left + right)

# Predicted score per (user, movie): the sum of rating * ratio contributions
# produced from the given ratings and the normalized co-occurrence entries.
predicted_rating_RDD = multiply(movie_rating_RDD, norm_co_occur_RDD)
predicted_rating_RDD.take(5)
# format: <(uid, movie_id) : prediction>

[((1, 10005), 1.55),
 ((4, 10002), 2.8461538461538467),
 ((2, 10004), 1.9999999999999998),
 ((3, 10007), 3.875),
 ((1, 10001), 2.0952380952380953)]

In [35]:
# Group by user and only take top k for each user
# Maximum number of recommendations kept per user (read by find_top_predicts).
top_k = 5

# Re-key predictions by user. The 'p' tag makes each value a 3-tuple, which is
# how predictions are told apart from the 2-tuple given ratings after the union.
predicted_user_rating_RDD = predicted_rating_RDD.map(lambda x: (x[0][0], (x[0][1], x[1], 'p')))
#predicted_user_rating_RDD.take(5)
# format: <uid : (movie, predict_rating, 'p')>

def find_top_predicts(user_ratings):
    """Pick the best-scoring unseen movies for one user.

    ``user_ratings`` is ``(user, ratings)`` where ``ratings`` mixes 2-tuples
    ``(movie, rating)`` (given data) and 3-tuples
    ``(movie, predicted_rating, tag)`` (predictions). Movies present in the
    given data are excluded. The rest are ordered by descending
    ``(predicted_rating, movie)`` and cut to the module-level ``top_k``.

    Returns ``(user, [movie, ...])``.
    """
    user, ratings = user_ratings

    # First pass: movies the user has already rated.
    watched = set(entry[0] for entry in ratings if len(entry) == 2)

    # Second pass: predictions for movies outside that set, stored score-first
    # so that tuple ordering ranks by score and breaks ties on movie id.
    candidates = [
        (entry[1], entry[0])
        for entry in ratings
        if len(entry) == 3 and entry[0] not in watched
    ]

    ranked = sorted(candidates, reverse=True)
    best_movies = [scored[1] for scored in ranked[:top_k]]
    return (user, best_movies)

# Put each user's predictions (3-tuples) and given ratings (2-tuples) under the
# same key so find_top_predicts can drop movies the user has already rated.
ratings_RDD = predicted_user_rating_RDD.union(user_ratings_RDD).groupByKey()
predicted_top_RDD = ratings_RDD.map(find_top_predicts)
# collect() returns the per-user recommendations as a list of (uid, [movie, ...]).
predicted_top_RDD.collect()

[(1, [10004, 10006, 10005, 10007]),
 (2, [10006, 10005, 10007]),
 (3, [10006, 10003, 10002]),
 (4, [10002, 10005, 10007]),
 (5, [10007])]