# Data Reading

In [1]:
import pandas as pd
import numpy as np
import csv
import operator
from math import sqrt, pow
from operator import itemgetter
from itertools import combinations
from collections import namedtuple, defaultdict

In [2]:
# Ratings table: only the first three tab-separated columns of u.data are loaded.
userratings = pd.read_csv(
    '../data/ml-100k/u.data',
    sep='\t',
    names=['user_id', 'movie_id', 'rating'],
    usecols=range(3),
)
# Title lookup table: only the first two pipe-separated columns of u.item are loaded.
moviestitles = pd.read_csv(
    '../data/ml-100k/u.item',
    sep='|',
    names=['movie_id', 'movie_title'],
    usecols=range(2),
    encoding='latin-1',
)

In [3]:
# movies : {'movie_id' : 'title', ...}
# Ids are kept as strings (as read from the file), matching the string ids in `ratings` below.
# A plain dict replaces the factory-less defaultdict(), which behaved like a dict anyway.
movies = {}
with open('../data/ml-100k/u.item', encoding='latin-1') as item_file:
    for line in item_file:
        record = line.strip().split('|')
        movie_id, movie_name = record[0], record[1]
        movies[movie_id] = movie_name

# ratings : [MovieRating('user_id', 'movie_id', 'rating', 'timestamp'), ...]
# Every field is a string, exactly as produced by csv.reader.
# The records are materialized into a list (instead of a one-shot generator) so that
# cells iterating over `ratings` can be re-run, and so the file is closed right away.
MovieRating = namedtuple('MovieRating', 'user_id, movie_id, rating, timestamp')
with open('../data/ml-100k/u.data', newline='') as data_file:
    ratings = [MovieRating._make(row) for row in csv.reader(data_file, delimiter='\t')]

# Movie Recommendation Analysis

## I. What are the recommended movies given a certain movie?

In [4]:
# usersRatings : {'user_id' : [('movie_id', rating), ...]}, each list sorted by its tuples
ratings_by_user = defaultdict(dict)
for record in ratings:
    # Keyed by movie_id, so a repeated (user, movie) record overwrites the earlier one.
    ratings_by_user[record.user_id][record.movie_id] = int(record.rating)

usersRatings = defaultdict(dict)
usersRatings.update(
    (user, sorted(movie_ratings.items()))
    for user, movie_ratings in ratings_by_user.items()
)

In [5]:
def coratings(user_ratings=None):
    '''
    Accumulate, for every pair of movies rated by the same user, the sums
    needed to compute a correlation coefficient for that pair.

    Parameters:
        user_ratings: mapping {user_id: [(movie_id, rating), ...]}. When
            omitted, the notebook-level `usersRatings` is used. Each user's
            list must be ordered consistently (e.g. sorted by movie_id) so
            that the same two movies always produce the same (movie1, movie2)
            key rather than its reverse.

    Returns:
        dict {(movie1, movie2): stats} where stats maps
            'N'             -> number of users who rated both movies
            'ratingSum'     -> sum of movie1 ratings
            'rating2Sum'    -> sum of movie2 ratings
            'ratingSqSum'   -> sum of squared movie1 ratings
            'rating2SqSum'  -> sum of squared movie2 ratings
            'dotProductSum' -> sum of rating1 * rating2
    '''
    if user_ratings is None:
        user_ratings = usersRatings

    pair_stats = defaultdict(dict)
    for rated_movies in user_ratings.values():
        # combinations() is consumed lazily: no per-user list of all pairs is built.
        for (movie_1, rating_1), (movie_2, rating_2) in combinations(rated_movies, 2):
            movie_pair = (movie_1, movie_2)
            stats = pair_stats.get(movie_pair)
            if stats is None:
                # Float accumulators that start at 0.0 on first use.
                stats = pair_stats[movie_pair] = defaultdict(float)
            stats['N'] += 1.0
            stats['ratingSum'] += rating_1
            stats['rating2Sum'] += rating_2
            stats['ratingSqSum'] += rating_1 * rating_1
            stats['rating2SqSum'] += rating_2 * rating_2
            stats['dotProductSum'] += rating_1 * rating_2
    return pair_stats

In [6]:
def recommendations(minimum_coratings=50, corating_data=None):
    '''
    Build the correlation coefficient for each pair of movies that has a
    sufficient number of coratings.

    Parameters:
        minimum_coratings: pairs with fewer co-rating users ('N') are ignored.
        corating_data: optional mapping {(movie1, movie2): stats} in the
            format returned by coratings(). When omitted, coratings() is
            called to compute it.

    Returns:
        dict {movie1: {movie2: coeff}, ...}, filled symmetrically. Pairs in
        which either movie has no rating variance among the co-raters are
        skipped: their coefficient is undefined and would otherwise raise
        ZeroDivisionError.
    '''
    if corating_data is None:
        corating_data = coratings()

    rec = defaultdict(dict)
    for (movie_1, movie_2), stats in corating_data.items():
        n = stats['N']
        if n < minimum_coratings:
            continue
        # N * sum(x^2) - sum(x)^2 for each movie; zero means all ratings were equal.
        spread_1 = n * stats['ratingSqSum'] - stats['ratingSum'] * stats['ratingSum']
        spread_2 = n * stats['rating2SqSum'] - stats['rating2Sum'] * stats['rating2Sum']
        if spread_1 <= 0 or spread_2 <= 0:
            continue
        num = n * stats['dotProductSum'] - stats['ratingSum'] * stats['rating2Sum']
        coeff = num / (sqrt(spread_1) * sqrt(spread_2))
        rec[movie_1][movie_2] = coeff
        rec[movie_2][movie_1] = coeff
    return rec  # rec : {movie1:{movie2:coeff}, ...}

In [7]:
def show_recommendations(top_n=5):
    '''
    Build a table with the top n recommendations for each movie.

    Parameters:
        top_n: number of recommended movies listed per title.

    Returns:
        DataFrame with a 'Title' column followed by top_n columns named
        'Recommended Movie|Coeff.'. Each of those cells holds
        '<title> | <coefficient to 3 decimals>'; a movie with fewer than
        top_n correlated movies is padded with empty strings.
    '''
    # The header width follows top_n instead of being fixed at five columns.
    columns = ['Title'] + ['Recommended Movie|Coeff.'] * top_n
    rows = []
    for movie, correlations in recommendations().items():
        # Highest correlation first, truncated to the requested count.
        best = sorted(correlations.items(), key=itemgetter(1), reverse=True)[:top_n]
        row = [movies[movie]]
        row.extend(movies[other] + ' | ' + format(coeff, '.3f') for other, coeff in best)
        row += [''] * (len(columns) - len(row))
        rows.append(row)
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # appending row by row copies the whole frame on every iteration.
    return pd.DataFrame(rows, columns=columns)

In [8]:
# Display the first 10 rows of the recommendation table built with the default top_n=5.
show_recommendations().head(10)

Unnamed: 0,Title,Recommended Movie|Coeff.,Recommended Movie|Coeff..1,Recommended Movie|Coeff..2,Recommended Movie|Coeff..3,Recommended Movie|Coeff..4
0,Miller's Crossing (1990),"Graduate, The (1967) | 0.530",Seven (Se7en) (1995) | 0.468,GoodFellas (1990) | 0.429,Quiz Show (1994) | 0.426,Monty Python and the Holy Grail (1974) | 0.423
1,Striptease (1996),Twister (1996) | 0.422,Independence Day (ID4) (1996) | 0.394,Mission: Impossible (1996) | 0.272,Toy Story (1995) | 0.227,Return of the Jedi (1983) | 0.141
2,Bed of Roses (1996),"Truth About Cats & Dogs, The (1996) | 0.242",Jerry Maguire (1996) | 0.121,Toy Story (1995) | 0.071,,
3,George of the Jungle (1997),Jungle2Jungle (1997) | 0.400,Liar Liar (1997) | 0.346,Volcano (1997) | 0.322,"Rock, The (1996) | 0.303",Terminator 2: Judgment Day (1991) | 0.285
4,Maverick (1994),Home Alone (1990) | 0.482,Groundhog Day (1993) | 0.404,Indiana Jones and the Last Crusade (1989) | 0.395,"Net, The (1995) | 0.381",Speed (1994) | 0.367
5,Bonnie and Clyde (1967),GoodFellas (1990) | 0.617,Chinatown (1974) | 0.563,"Shining, The (1980) | 0.516",Butch Cassidy and the Sundance Kid (1969) | 0.453,"Grifters, The (1990) | 0.449"
6,Outbreak (1995),Crimson Tide (1995) | 0.570,"Time to Kill, A (1996) | 0.555","Rock, The (1996) | 0.504",In the Line of Fire (1993) | 0.485,GoldenEye (1995) | 0.480
7,Good Will Hunting (1997),Speed (1994) | 0.593,Seven (Se7en) (1995) | 0.538,Kiss the Girls (1997) | 0.533,"Rock, The (1996) | 0.480",Mr. Holland's Opus (1995) | 0.465
8,Hoodlum (1997),Conspiracy Theory (1997) | 0.247,Air Force One (1997) | 0.052,Liar Liar (1997) | 0.036,"Game, The (1997) | -0.003",Scream (1996) | -0.050
9,"Third Man, The (1949)",Casablanca (1942) | 0.604,Rear Window (1954) | 0.350,Fargo (1996) | 0.307,"Sting, The (1973) | 0.306",Citizen Kane (1941) | 0.210


## II. How to recommend movies to a given user?

In [9]:
# Attach the movie title to every rating by joining on movie_id, then fix the column order.
ratings_df = (
    userratings
    .merge(moviestitles, on='movie_id')
    [['user_id', 'movie_title', 'movie_id', 'rating']]
)
ratings_df.head()

Unnamed: 0,user_id,movie_title,movie_id,rating
0,196,Kolya (1996),242,3
1,63,Kolya (1996),242,3
2,226,Kolya (1996),242,5
3,154,Kolya (1996),242,3
4,306,Kolya (1996),242,5


In [10]:
# Reshape ratings_df into a matrix with one row per user_id and one column per movie_title.
# Cells with no rating are filled with 0.
ratings_mtx_df = (
    ratings_df
    .pivot_table(values='rating', index='user_id', columns='movie_title')
    .fillna(0)
)
# Column labels (movie titles), in the same order as the matrix columns.
movie_index = ratings_mtx_df.columns
ratings_mtx_df.head()

movie_title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0


In [11]:
# Pearson product-moment correlation coefficient (PMCC) between every pair of movies.
# The user x movie matrix is transposed so that each movie is a row, i.e. one variable
# for np.corrcoef. The PMCC lies between -1 and 1 and measures the (positive or negative)
# correlation between two variables.
corr_matrix = np.corrcoef(ratings_mtx_df.T)

In [12]:
# Helpers: look up one movie's correlation vector, then sum those vectors over a user's movies and return the remaining movies sorted by their total correlation with the user
def get_movie_similarity(movie_title):
    '''Return the row of corr_matrix for movie_title.

    The row is located by the position of movie_title within movie_index;
    a title that is not in movie_index raises ValueError.
    '''
    titles = list(movie_index)
    return corr_matrix[titles.index(movie_title)]

def get_movie_recommendations(user_movies):
    '''Rank the movies not in user_movies by their summed correlation with user_movies.

    user_movies is a collection of movie titles. The correlation vectors of
    those titles are added together, the titles themselves are removed from
    the result, and the rest is returned as a DataFrame with columns
    'movie_title' and 'sum_similarity', highest sum first.
    '''
    summed = np.zeros(corr_matrix.shape[0])
    for title in user_movies:
        summed += get_movie_similarity(title)

    candidates = pd.DataFrame({'movie_title': movie_index, 'sum_similarity': summed})
    already_listed = candidates['movie_title'].isin(user_movies)
    unseen = candidates[~already_listed]
    return unseen.sort_values(by=['sum_similarity'], ascending=False)

In [13]:
# Provide movie recommendations to a given user by using their list of rated movies as input.
def recommendationsForUser(user_id, top_n=10):
    '''Return the titles of the top_n movies recommended for user_id.

    Parameters:
        user_id: value matched against ratings_df.user_id.
        top_n: number of titles to return (10 by default).

    Returns:
        The first top_n entries of the 'movie_title' column of the ranking
        produced by get_movie_recommendations for the titles this user rated.
    '''
    rated_titles = ratings_df[ratings_df.user_id == user_id].movie_title.tolist()
    # Named `ranked` so the notebook-level recommendations() function is not shadowed.
    ranked = get_movie_recommendations(rated_titles)
    return ranked.movie_title.head(top_n)

In [14]:
# Example: the 10 recommended titles for user 1
recommendationsForUser(1)

446     E.T. the Extra-Terrestrial (1982)
132                         Batman (1989)
1533                     True Lies (1994)
1373                         Speed (1994)
319                       Clueless (1995)
1013                Mrs. Doubtfire (1993)
1386                   Stand by Me (1986)
312       Clear and Present Danger (1994)
667                       Heathers (1989)
380                           Dave (1993)
Name: movie_title, dtype: object

In [15]:
# Example: the 10 recommended titles for user 2
recommendationsForUser(2)

642          Grosse Pointe Blank (1997)
883                    Lone Star (1996)
1176                 Primal Fear (1996)
345           Courage Under Fire (1996)
1059    Night Falls on Manhattan (1997)
171                    Big Night (1996)
1338                    Sleepers (1996)
481             Extreme Measures (1996)
1146                  Phenomenon (1996)
76            Angels and Insects (1995)
Name: movie_title, dtype: object