In [1]:
import numpy as np
import numpy.ma as ma
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
data_dir = 'C:/Users/a1381/Desktop/UIUC/Python/Project/Recommender_system/ml-latest-small/'

ratings_file = os.path.join(data_dir, 'ratings.csv')
movies_file = os.path.join(data_dir, 'movies.csv')

ratings = pd.read_csv(ratings_file)
movies = pd.read_csv(movies_file)

In [3]:
print("movie shape: ", movies.shape)
movies.tail()

movie shape:  (9125, 3)


Unnamed: 0,movieId,title,genres
9120,162672,Mohenjo Daro (2016),Adventure|Drama|Romance
9121,163056,Shin Godzilla (2016),Action|Adventure|Fantasy|Sci-Fi
9122,163949,The Beatles: Eight Days a Week - The Touring Y...,Documentary
9123,164977,The Gay Desperado (1936),Comedy
9124,164979,"Women of '69, Unboxed",Documentary


In [4]:
print("rating shape: ", ratings.shape)
ratings.tail()

rating shape:  (100004, 4)


Unnamed: 0,userId,movieId,rating,timestamp
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663
100003,671,6565,3.5,1074784724


In [5]:
print('{} users and {} movies'.format(len(pd.unique(ratings['userId'])), len(pd.unique(ratings['movieId']))))

671 users and 9066 movies


In [6]:
mv_lens = pd.merge(movies, ratings)
mv_lens.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,3.0,851866703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,9,4.0,938629179
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,13,5.0,1331380058
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.0,997938310
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,3.0,855190091


In [7]:
# Display the top 5 most commonly rated movies
mv_lens.title.value_counts().head()

Forrest Gump (1994)                          341
Pulp Fiction (1994)                          324
Shawshank Redemption, The (1994)             311
Silence of the Lambs, The (1991)             304
Star Wars: Episode IV - A New Hope (1977)    291
Name: title, dtype: int64

In [8]:
# To see whether there are NAs in the dataset
mv_lens.rating.isnull().sum()

0

In [9]:
# Make a new Data structure that holds the movie, number of ratings, and the average rating.
# agg   https://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.groupby.DataFrameGroupBy.agg.html
mv_stats = mv_lens.groupby('title').agg({'rating': [np.size, np.mean]}) # it can accept a list of functin e.g. [np.size, np.mean]

# Number of ratings to consider top movie
rating_count = 20

# Display most popular movies.
top_movies = mv_stats['rating']['size'] >= rating_count
mv_stats[top_movies].sort_values(by=('rating', 'mean'), ascending=False).head(10)

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"Godfather, The (1972)",200.0,4.4875
"Shawshank Redemption, The (1994)",311.0,4.487138
On the Waterfront (1954),29.0,4.448276
All About Eve (1950),38.0,4.434211
Ran (1985),26.0,4.423077
"African Queen, The (1951)",50.0,4.42
Roger & Me (1989),42.0,4.392857
"Maltese Falcon, The (1941)",62.0,4.387097
Band of Brothers (2001),22.0,4.386364
"Godfather: Part II, The (1974)",135.0,4.385185


In [10]:
mvrs = ratings.groupby(by='movieId').size().sort_values(ascending=False)
# Only consider those movies that are being rated over 20 times
tmp_ratings = ratings.ix[mvrs[mvrs > 20].index].dropna()
tmp_ratings.head()

Unnamed: 0_level_0,userId,movieId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
356,5.0,231.0,3.5,1163374000.0
296,4.0,2263.0,3.0,949896300.0
318,4.0,2791.0,5.0,949811500.0
593,8.0,457.0,4.5,1154400000.0
260,4.0,2005.0,5.0,949896100.0


In [11]:
tmp_df = tmp_ratings.pivot(index='userId', columns='movieId', values='rating')

In [12]:
the_data = tmp_df.applymap(lambda x: 1 if x > 3 else 0).as_matrix()
print('The matrix may look like this and the shape:', the_data.shape)
print('Row means users and column means movie \n1 mens the user has seen this movie before and love it')
the_data[1:10, 1:10]

The matrix may look like this and the shape: (136, 928)
Row means users and column means movie 
1 mens the user has seen this movie before and love it


array([[0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [13]:
def cosine_similarity(u, v):
    return(np.dot(u, v)/np.sqrt((np.dot(u, u) * np.dot(v, v))))

In [14]:
# Example of cosine similarity
a = np.array([1, 1, 1, 0, 0])
b = np.array([0, 0, 0, 1, 1])
c = np.array([0, 1, 0, 1, 1])

print('cosine similarity(a, b) = {0:4.3f}'.format(cosine_similarity(a, b)))
print('cosine similarity(a, c) = {0:4.3f}'.format(cosine_similarity(a, c)))
print('cosine similarity(b, c) = {0:4.3f}'.format(cosine_similarity(b, c)))
print('cosine similarity(a, a) = {0:4.3f}'.format(cosine_similarity(a, a)))

cosine similarity(a, b) = 0.000
cosine similarity(a, c) = 0.333
cosine similarity(b, c) = 0.816
cosine similarity(a, a) = 1.000


In [15]:
# user-movie matrix
x = the_data

# create a fake user
y = np.zeros(the_data.shape[1], dtype=np.int32)
y[6] = 1 ; y[10] = 1; y[15] = 1; y[64] = 1; y[136] = 1
y[180] = 1; y[230] = 1; y[339] = 1; y[622] = 1; y[703] = 1


# Add a special index column to map the row in the x matrix to the userIds
tmp_df.tmp_idx = np.array(range(x.shape[0]))

In [16]:
sims = np.apply_along_axis(cosine_similarity, 1, x, y)
# Return the maximum of an array or maximum along an axis, ignoring any NaNs
mx = np.nanmax(sims)

# Find the best matching user
usr_idx = np.where(sims==mx)[0][0]

# Print the first thirty reviews of test user and matched user.
print('Fake user:\n', y[:30])
print('The most highly correlated user:\n', x[usr_idx, :30])
print('Similarity between them \n {}'.format(cosine_similarity(y, x[usr_idx])))

# Now we subtract the vectors
# (any negative value is a movie to recommend)
mov_vec = y - x[usr_idx]

# We want a mask aray, so we zero out any recommended movie.
mov_vec[mov_vec >= 0] = 1
mov_vec[mov_vec < 0] = 0


Fake user:
 [0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
The most highly correlated user:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
Similarity between them 
 0.18257418583505536


In [17]:
# Print out the number of movies recommended.
print('{0} Movie Recommendations for User = {1}'.format(mov_vec[mov_vec == 0].shape[0], 
                                                        tmp_df[tmp_df.tmp_idx == usr_idx].index[0]))

2 Movie Recommendations for User = 43.0


In [18]:
# Get the columns (movieIds) for the current user
mov_ids = tmp_df[tmp_df.tmp_idx == usr_idx].columns

In [19]:
# Now make a masked array to find movies to recommend
# values are the movie ids, mask is the movies the most
# similar user liked.

ma_mov_idx = ma.array(mov_ids, mask = mov_vec)
mov_idx = ma_mov_idx[~ma_mov_idx.mask]

In [20]:
# Now make a DataFrame of the moves of interest and display

mv_df = movies.ix[movies.movieId.isin(mov_idx)].dropna()

print(60*'-')

for movie in mv_df.title.values:
    print(movie)

print(60*'-', end='\n\n')

------------------------------------------------------------
Braveheart (1995)
Jurassic Park (1993)
------------------------------------------------------------



# Model-based

In [21]:
combined_movies_data = pd.merge(ratings, movies, on='movieId')
combined_movies_data.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama
1,7,31,3.0,851868750,Dangerous Minds (1995),Drama
2,31,31,4.0,1273541953,Dangerous Minds (1995),Drama
3,32,31,4.0,834828440,Dangerous Minds (1995),Drama
4,36,31,3.0,847057202,Dangerous Minds (1995),Drama
5,39,31,3.0,832525157,Dangerous Minds (1995),Drama
6,73,31,3.5,1255591860,Dangerous Minds (1995),Drama
7,88,31,3.0,1239755559,Dangerous Minds (1995),Drama
8,96,31,2.5,1223256331,Dangerous Minds (1995),Drama
9,110,31,4.0,840100695,Dangerous Minds (1995),Drama


In [22]:
mat = combined_movies_data.pivot_table('rating', 'userId', 'title', fill_value=0)
mat.head(10)

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
2,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
3,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
4,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
5,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
6,0.0,0.0,0,0.0,0.0,0.0,0.0,4.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
7,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
8,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
9,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
10,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0


In [23]:
mat.shape

(671, 9064)

In [24]:
# Prepare user and movie matrix
X_u = mat.values.T
X_m = mat.values

In [25]:
# For here, we choose 12 components (it's arbitrary)
svd = TruncatedSVD(n_components=12, random_state=17)

result_u = svd.fit_transform(X_u)
result_m = svd.fit_transform(X_m)
print('Shape of user matrix:{}\nShape of movie matrix:{}'.format(result_u.shape, result_m.shape))

Shape of user matrix:(9064, 12)
Shape of movie matrix:(671, 12)


In [26]:
# For here we just use features form user matrix
corr_mat = np.corrcoef(result_u)

In [27]:
movies_names = combined_movies_data.title
movie_list = list(movies_names)

# Try to find out which movies are correlated with movie Dangerous Minds (1995)
target = movie_list.index('Dangerous Minds (1995)')

In [28]:
cor_target = corr_mat[target]
cor_target.shape
cor_target

array([ 1.        ,  0.25887328,  0.0182122 , ...,  0.41584853,
       -0.05987688,  0.13845601])

In [29]:
# Set the threshold between 0.9 and 1.0
list(movies_names[(cor_target < 1.0) & (cor_target > 0.9)])

['Deer Hunter, The (1978)', 'Ben-Hur (1959)', 'Braveheart (1995)']