In [7]:
# Importing the required libraries.
import pandas as pd
from math import pow, sqrt

# Reading ratings dataset into a pandas dataframe object.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('data/movie-recommendation/ratings.dat', sep='::', names=r_cols, encoding='latin-1')

# Getting number of users and movies from the dataset.
user_ids = ratings.user_id.unique().tolist()
movie_ids = ratings.movie_id.unique().tolist()
print('Number of Users: {}'.format(len(user_ids)))
print('Number of Movies: {}'.format(len(movie_ids)))
ratings

  return func(*args, **kwargs)


Number of Users: 6040
Number of Movies: 3706


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [8]:
# Reading movies dataset into a pandas dataframe object.
m_cols = ['movie_id', 'movie_title', 'genre']
movies = pd.read_csv('data/movie-recommendation/movies.dat', sep='::', names=m_cols, encoding='latin-1')
movies

  return func(*args, **kwargs)


Unnamed: 0,movie_id,movie_title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [9]:
# Getting series of lists by applying split operation.
movies.genre = movies.genre.str.split('|')
# Getting distinct genre types for generating columns of genre type.
genre_columns = list(set([j for i in movies['genre'].tolist() for j in i]))
# Iterating over every list to create and fill values into columns.
for j in genre_columns:
    movies[j] = 0
for i in range(movies.shape[0]):
    for j in genre_columns:
        if(j in movies['genre'].iloc[i]):
            movies.loc[i,j] = 1
# Separting movie title and year part using split function.
split_values = movies['movie_title'].str.split("(", n = 1, expand = True)
# setting 'movie_title' values to title part.
movies.movie_title = split_values[0]
# creating 'release_year' column.
movies['release_year'] = split_values[1]
# Cleaning the release_year series.
movies['release_year'] = movies.release_year.str.replace(')','')
# dropping 'genre' columns as it has already been one hot encoded.
movies.drop('genre',axis=1,inplace=True)
movies

  movies['release_year'] = movies.release_year.str.replace(')','')


Unnamed: 0,movie_id,movie_title,Western,Mystery,War,Musical,Children's,Horror,Comedy,Documentary,...,Action,Adventure,Film-Noir,Sci-Fi,Thriller,Fantasy,Animation,Crime,Drama,release_year
0,1,Toy Story,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,1995
1,2,Jumanji,0,0,0,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1995
2,3,Grumpier Old Men,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1995
3,4,Waiting to Exhale,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,1995
4,5,Father of the Bride Part II,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,2000
3879,3949,Requiem for a Dream,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2000
3880,3950,Tigerland,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2000
3881,3951,Two Family House,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2000


In [10]:
# Getting the rating given by a user to a movie.
def get_rating_(userid, movieid):
    return (ratings.loc[(ratings.user_id==userid) & (ratings.movie_id == movieid),'rating'].iloc[0])
# Getting the list of all movie ids the specified user has rated.
def get_movieids_(userid):
    return (ratings.loc[(ratings.user_id==userid), 'movie_id'].tolist())
# Getting the movie titles against the movie id.
def get_movie_title_(movieid):
    return (movies.loc[(movies.movie_id == movieid), 'movie_title'].iloc[0])

In [43]:
def distance_similarity_score(user1, user2):
    rating1 = ratings.loc[ratings['user_id']==user1, ['user_id', 'movie_id', 'rating']]
    rating2 = ratings.loc[ratings['user_id']==user2, ['user_id', 'movie_id', 'rating']]
    inner_rating = pd.merge(rating1, rating2, how='inner', on='movie_id')
    inner_rating['distance'] = inner_rating.apply(lambda x: (x['rating_x']-x['rating_y'])**2, axis=1)
    
    # Adding one to the denominator to avoid divide by zero error.
    return sqrt(sum(inner_rating['distance']))/(inner_rating.shape[0]+1)

print('Distance based similarity between user ids 1 & 310: {}'.format(distance_similarity_score(1, 310)))

Distance based similarity between user ids 1 & 310: 0.22754153011921602


In [54]:
def pearson_correlation_score(user1,user2):
    rating1 = ratings.loc[ratings['user_id']==user1, ['user_id', 'movie_id', 'rating']]
    rating2 = ratings.loc[ratings['user_id']==user2, ['user_id', 'movie_id', 'rating']]
    inner_rating = pd.merge(rating1, rating2, how='inner', on='movie_id')
    return pd.Series.corr(rating['rating_x'], rating['rating_y'])

print('Pearson Corelation between user ids 11 & 30: {}'.format(pearson_correlation_score(11,30)))

Pearson Corelation between user ids 11 & 30: 0.20425716847526612


In [55]:
def most_similar_users_(user1, number_of_users, metric='pearson'):
    '''
    user1 : Targeted User
    number_of_users : number of most similar users you want to user1.
    metric : metric to be used to calculate inter-user similarity score. ('pearson' or else)
    '''
    # Getting distinct user ids.
    user_ids = ratings.user_id.unique().tolist()
    
    # Getting similarity score between targeted and every other suer in the list(or subset of the list).
    if(metric == 'pearson'):
        similarity_score = [(pearson_correlation_score(user1,nth_user),nth_user) for nth_user in user_ids if nth_user != user1]
    else:
        similarity_score = [(distance_similarity_score(user1,nth_user),nth_user) for nth_user in user_ids if nth_user != user1]
    
    # Sorting in descending order.
    similarity_score.sort()
    similarity_score.reverse()
    
    # Returning the top most 'number_of_users' similar users. 
    return similarity_score[:number_of_users]
print(most_similar_users_(23,5))

[(0.20425716847526612, 6040), (0.20425716847526612, 6039), (0.20425716847526612, 6038), (0.20425716847526612, 6037), (0.20425716847526612, 6036)]


In [14]:
def get_recommendation_(userid):
    user_ids = ratings.user_id.unique().tolist()
    total = {}
    similariy_sum = {}
    
    # Iterating over subset of user ids.
    for user in user_ids[:100]:
        
        # not comparing the user to itself (obviously!)
        if user == userid:
            continue
        
        # Getting similarity score between the users.
        score = pearson_correlation_score(userid,user)
        
        # not considering users having zero or less similarity score.
        if score <= 0:
            continue
        
        # Getting weighted similarity score and sum of similarities between both the users.
        for movieid in get_movieids_(user):
            # Only considering not watched/rated movies
            if movieid not in get_movieids_(userid) or get_rating_(userid,movieid) == 0:
                total[movieid] = 0
                total[movieid] += get_rating_(user,movieid) * score
                similariy_sum[movieid] = 0
                similariy_sum[movieid] += score
    
    # Normalizing ratings
    ranking = [(tot/similariy_sum[movieid],movieid) for movieid,tot in total.items()]
    ranking.sort()
    ranking.reverse()
    
    # Getting movie titles against the movie ids.
    recommendations = [get_movie_title_(movieid) for score,movieid in ranking]
    return recommendations[:10]
print(get_recommendation_(32))

['Invisible Man, The ', 'Creature From the Black Lagoon, The ', 'Hellraiser ', 'Almost Famous ', 'Way of the Gun, The ', 'Shane ', 'Naked Gun 2 1/2: The Smell of Fear, The ', "Kelly's Heroes ", 'Official Story, The ', 'Everything You Always Wanted to Know About Sex ']
