#### USER BASED COLLABORATIVE FILTERING & RECOMMENDATION

In [1]:
import numpy as np
import numpy.ma as ma
import pandas as pd
# We do this to ignore several specific Pandas warnings
import warnings
warnings.filterwarnings("ignore")
#https://docs.scipy.org/doc/numpy-1.15.0/reference/maskedarray.generic.html

In [3]:
ratings = pd.read_csv('ratings.csv')
movies= pd.read_csv('movies.csv')

ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [6]:
movies
#len(movies)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
ratings.head()
#len(ratings)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
movie_lens=pd.merge(movies,ratings)
len(movie_lens)

100004

In [6]:
movie_lens.head(50)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,3.0,851866703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,9,4.0,938629179
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,13,5.0,1331380058
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.0,997938310
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,3.0,855190091
5,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,20,3.5,1238729767
6,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,23,3.0,1148729853
7,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,26,5.0,1360087980
8,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,30,4.0,944943070
9,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,37,4.0,981308121


In [7]:
# 10 most commonly rated movies
movie_lens.title.value_counts().head(10)

Forrest Gump (1994)                          341
Pulp Fiction (1994)                          324
Shawshank Redemption, The (1994)             311
Silence of the Lambs, The (1991)             304
Star Wars: Episode IV - A New Hope (1977)    291
Jurassic Park (1993)                         274
Matrix, The (1999)                           259
Toy Story (1995)                             247
Schindler's List (1993)                      244
Terminator 2: Judgment Day (1991)            237
Name: title, dtype: int64

In [8]:
# Make a new Data structure that holds the movie, number of ratings, and the average rating.
#generating rating and size together. Movies got at least 25 feed backs.
mov_stats = movie_lens.groupby('title').agg({'rating': [np.size, np.mean]})

In [9]:
mov_stats.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"""Great Performances"" Cats (1998)",2.0,1.75
$9.99 (2008),3.0,3.833333
'Hellboy': The Seeds of Creation (2004),1.0,2.0
'Neath the Arizona Skies (1934),1.0,0.5
'Round Midnight (1986),2.0,2.25


In [10]:
# Display most popular movies.
top_movies = mov_stats['rating']['size'] >= 25
mov_stats[top_movies].sort_values(by=('rating', 'mean'), ascending=False).head(20)

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"Godfather, The (1972)",200.0,4.4875
"Shawshank Redemption, The (1994)",311.0,4.487138
On the Waterfront (1954),29.0,4.448276
All About Eve (1950),38.0,4.434211
Ran (1985),26.0,4.423077
"African Queen, The (1951)",50.0,4.42
Roger & Me (1989),42.0,4.392857
"Maltese Falcon, The (1941)",62.0,4.387097
"Godfather: Part II, The (1974)",135.0,4.385185
It Happened One Night (1934),25.0,4.38


In [11]:
# Collaborative Filtering for Users
# Make a pivot table containing ratings indexed by user id and movie id
df = ratings.pivot(index='userId', columns='movieId', values='rating')

In [12]:
df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,


In [15]:
data = df.applymap(lambda x: 1 if x > 3 else 0).as_matrix()
print(data.shape)

(671, 9066)


In [16]:
data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [17]:
"""
We should generate a new pivot table, 
first grouping the movies together, 
then sorting into descending order, and selecting only those movies 
that have been reviewed by multiple people
"""
#one review with scoring of five stars is not as good as 100 reviews with scoring 4.4 stars.
mvrs = ratings.groupby('movieId').size().sort_values(ascending=False)
tmp_ratings = ratings.ix[mvrs[mvrs > 25].index].dropna()
tmp_df = tmp_ratings.pivot(index='userId', columns='movieId', values='rating')

ddata = tmp_df.applymap(lambda x: 1 if x > 3 else 0).as_matrix()
print(ddata.shape)

(116, 773)


In [18]:
# Define the Cosine Similarity function

def cosine_similarity(u, v):
    return(np.dot(u, v)/np.sqrt((np.dot(u, u) * np.dot(v, v))))

In [19]:
a = np.array([1, 1, 1, 0, 0])
b = np.array([0, 0, 0, 1, 1])
c = np.array([0, 1, 0, 1, 1])

print('cosine similarity(a, b) = {0:4.3f}'.format(cosine_similarity(a, b)))
print('cosine similarity(a, c) = {0:4.3f}'.format(cosine_similarity(a, c)))
print('cosine similarity(b, c) = {0:4.3f}'.format(cosine_similarity(b, c)))

print('cosine similarity(a, a) = {0:4.3f}'.format(cosine_similarity(a, a)))


cosine similarity(a, b) = 0.000
cosine similarity(a, c) = 0.333
cosine similarity(b, c) = 0.816
cosine similarity(a, a) = 1.000


In [43]:
# The user-movie matrix

x = ddata

# Make a fake user
y = np.zeros(ddata.shape[1], dtype=np.int32)
y[6] = 1 ; y[10] = 1; y[15] = 1; y[64] = 1; y[136] = 1
y[180] = 1; y[230] = 1; y[339] = 1; y[622] = 1; y[703] = 1

# Add a special index column to map the row in the x matrix to the userIds
tmp_df.tmp_idx = np.array(range(x.shape[0]))

In [44]:
# Compute similarity, find maximum value
sims = np.apply_along_axis(cosine_similarity, 1, x, y)
mx = np.nanmax(sims)

# Find the best matching user
usr_idx = np.where(sims==mx)[0][0]

# 15 reviews of test user and matched user.
print(y[:15])
print(x[usr_idx, :15])

print('\nCosine Similarity(y, x[{0:d}]) = {1:4.3f}' \
      .format(usr_idx, cosine_similarity(y, x[usr_idx])), end='\n\n')

# Now we subtract the vectors
# (any negative value is a movie to recommend)
mov_vec = y - x[usr_idx]
#print(mov_vec[40:60])
# We want a mask aray, so we zero out any recommended movie.
mov_vec[mov_vec >= 0] = 1
mov_vec[mov_vec < 0] = 0

print(mov_vec[40:60])
# if it is underlize 0, we score to 0.
# if it is over 1, we round to 1.

[0 0 0 0 0 0 1 0 0 0 1 0 0 0 0]
[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]

Cosine Similarity(y, x[14]) = 0.115

[1 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1 1 1 1 1]


In [42]:
i, = np.where(x[14] == 1)
i

array([  6,  44,  48,  50,  51,  53,  61,  62,  64,  81,  90, 100, 111,
       113, 115, 116, 121, 136, 140, 149, 155, 191, 195, 196, 198, 200,
       201, 203, 204, 205, 206, 210, 213, 216, 217, 219, 220, 225, 230,
       237, 238, 239, 242, 252, 254, 257, 260, 277, 280, 282, 285, 286,
       297, 310, 314, 316, 322, 323, 324, 332, 333, 340, 341, 348, 353,
       365, 367, 411, 415, 421, 436, 443, 450, 458, 463, 469, 475, 481,
       488, 489, 491, 501, 502, 505, 543, 546, 548, 556, 564, 569, 570,
       575, 581, 582, 586, 588, 592, 600, 601, 604, 614, 619, 641, 652,
       659, 664, 679, 683, 696, 702, 704, 709, 711, 713, 722, 731, 736,
       743, 751, 768])

In [31]:
# Print out the number of movies we will recommend.
print('\n{0} Movie Recommendations for User = {1}' \
      .format(mov_vec[mov_vec == 0].shape[0], 
              tmp_df[tmp_df.tmp_idx == usr_idx].index[0]))



116 Movie Recommendations for User = 15.0


In [32]:
# Get the columns (movieIds) for the current user
mov_ids = tmp_df[tmp_df.tmp_idx == usr_idx].columns

In [33]:

# Now make a masked array to find movies to recommend
# values are the movie ids, mask is the movies the most
# similar user liked.

ma_mov_idx = ma.array(mov_ids, mask = mov_vec)
mov_idx = ma_mov_idx[~ma_mov_idx.mask]

In [34]:
# Now make a DataFrame of the moves of interest and display

mv_df = movies.ix[movies.movieId.isin(mov_idx)].dropna()



for movie in mv_df.title.values[:10]:
    
    print(movie)
    
print(60*'-', end='\n\n')

Clerks (1994)
Exotica (1994)
Forget Paris (1995)
Hoop Dreams (1994)
Star Wars: Episode IV - A New Hope (1977)
Léon: The Professional (a.k.a. The Professional) (Léon) (1994)
Pulp Fiction (1994)
Red Rock West (1992)
Fugitive, The (1993)
Executive Decision (1996)
------------------------------------------------------------



In [36]:
from sklearn.model_selection import train_test_split

x, y = ddata, range(ddata.shape[0])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state=42)

# Add an index into the user-movie DataFrame for the movies that are in the
# user-movie matrix.
tmp_df.tmp_idx = np.array(y)

#### MULTIPLE RECOMMENDATIONS

In [69]:
#Iterate through each user in test set.
for idx, user in enumerate(x_test):
    
    # Compute similarity, find maximum value
    sims = np.apply_along_axis(cosine_similarity, 1, x_train, user)
    mx = np.nanmax(sims)
    
    # If maximum value is a real value    
    if mx > 0:
        
        # Find the index in the similarity matrix with maximum value
        train_idx = np.where(sims==mx)[0][0]
        
        # Now we subtract the vectors 
        # (any negative value is a movie to recommend)
        mov_vec = user - x_train[train_idx]
        
        # We make a mask aray, so we zero out any recommended movie.
        mov_vec[mov_vec >= 0] = 1
        mov_vec[mov_vec < 0] = 0
        
        # We use the fact that y_train has the indices into the original
        # temporary data frame

        user_idx = tmp_df[tmp_df.tmp_idx == y_train[train_idx]]

        # State how many movies are being recommend for this user id
        print('{0} Movie Recommendations for User = {1}' \
              .format(mov_vec[mov_vec == 0].shape[0], \
                      tmp_df[tmp_df.tmp_idx == y_test[idx]].index[0]))
        
        print(60*'-')
        # Now make a masked array to find movies to recommend
        # values are the movie ids, mask is the movies the most
        # similar user liked.
        ma_mov_idx = ma.array(user_idx.columns, mask = mov_vec)
        mov_idx = ma_mov_idx[~ma_mov_idx.mask]
        
        # Now make a DataFrame of the moves of interest and display
        mv_df = movies.ix[movies.movieId.isin(mov_idx)].dropna()
        for movie in mv_df.title.values:
            print(movie)
            
        print(60*'-', end='\n\n')

119 Movie Recommendations for User = 408.0
------------------------------------------------------------
Casino (1995)
Clerks (1994)
Exotica (1994)
Forget Paris (1995)
Hoop Dreams (1994)
Star Wars: Episode IV - A New Hope (1977)
Léon: The Professional (a.k.a. The Professional) (Léon) (1994)
Pulp Fiction (1994)
Three Colors: White (Trzy kolory: Bialy) (1994)
Red Rock West (1992)
Fugitive, The (1993)
Executive Decision (1996)
Blade Runner (1982)
Thirty-Two Short Films About Glenn Gould (1993)
True Romance (1993)
War Room, The (1993)
Batman (1989)
Rock, The (1996)
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)
She's the One (1996)
Charade (1963)
When We Were Kings (1996)
Sex, Lies, and Videotape (1989)
Star Wars: Episode V - The Empire Strikes Back (1980)
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
Aliens (1986)
Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966)
Clockwork Orange, A (1971)
Apocalypse Now 

  after removing the cwd from sys.path.
  


Recap: First we developed the algorithm to make a recommendation for a single user. To do this, we first create a fake user, by selecting several movies as favorable (effectively we simply make a new user vector). Given this new vector, we compute the cosine similarity between this new user and all users in our reduced data user-movie matrix. We identify the user who is most similar by selecting the row in the user-movie matrix with the highest cosine similarity, extract the movies favorably rated by this particular user, remove any that have already been rated by our fake user, and display the results.

To simplify the identification of the correct movie title, we add a new column to the DataFrame that holds the joined rating and movie data to map between movieId in our user-movie matrix and the movieID used in the original data.