# Movie Recommendation System

### 1\. Load and pre-precess data

In [None]:
import pandas as pd
import os

data_file_path = os.getenv('HOME') + '/workplace/Miniprojects/Movie_Recommendation/data/ratings.dat'
data_cols = ['user_id', 'movie_id', 'rating']
data = pd.read_csv(data_file_path, sep='::', usecols=[0,1,2], names=data_cols, engine='python')
orginal_data_size = len(data)
data.head()

In [2]:
# delete data with ratings below 3
data = data[data['rating']>=3]
filtered_data_size = len(data)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')


orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [3]:
# load the datasets with movie names
movies_file_path = os.getenv('HOME') + '/workplace/Miniprojects/Movie_Recommendation/data/movies.dat'
movies_cols = ['movie_id', 'movie_name', 'genres']
movies = pd.read_csv(movies_file_path, sep='::', names=movies_cols, engine='python')
movies.head()

Unnamed: 0,movie_id,movie_name,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# add movie names to the data
data = data.merge(movies, how='left', on='movie_id').drop(columns=['genres'])
data.head()

Unnamed: 0,user_id,movie_id,rating,movie_name
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975)
1,1,661,3,James and the Giant Peach (1996)
2,1,914,3,My Fair Lady (1964)
3,1,3408,4,Erin Brockovich (2000)
4,1,2355,5,"Bug's Life, A (1998)"


### 2\. Explore data

In [5]:
# number of users and movies
num_user, num_movie = data['user_id'].nunique(), data['movie_id'].nunique()
print(f'number of users: {num_user}, number of movies: {num_movie}')

number of users: 6039, number of movies: 3628


In [6]:
# most popular movies
movie_count = data.groupby('movie_name')['user_id'].count()
movie_count.sort_values(ascending=False).head(30)

movie_name
American Beauty (1999)                                   3211
Star Wars: Episode IV - A New Hope (1977)                2910
Star Wars: Episode V - The Empire Strikes Back (1980)    2885
Star Wars: Episode VI - Return of the Jedi (1983)        2716
Saving Private Ryan (1998)                               2561
Terminator 2: Judgment Day (1991)                        2509
Silence of the Lambs, The (1991)                         2498
Raiders of the Lost Ark (1981)                           2473
Back to the Future (1985)                                2460
Matrix, The (1999)                                       2434
Jurassic Park (1993)                                     2413
Sixth Sense, The (1999)                                  2385
Fargo (1996)                                             2371
Braveheart (1995)                                        2314
Men in Black (1997)                                      2297
Schindler's List (1993)                                  22

### 3\. Add my own data

In [7]:
def data_update(user_name, movie_list):
    global data
    n = len(movie_list)
    my_playlist = pd.DataFrame({'user_id': [user_name]*n, 'movie_name': list(movie_list.keys()), 
                                'rating': list(movie_list.values())}).merge(
        movies, how='left', on='movie_name').drop(columns=['genres'])
    if not data.isin({'user_id':[user_name]})['user_id'].any():  
        data = data.append(my_playlist)

my_list = {'While You Were Sleeping (1995)':5, 'Terminator 2: Judgment Day (1991)':4,
           'Titanic (1997)':4, 'Parent Trap, The (1998)':5, 'Matrix, The (1999)':5}       

data_update('Theo', my_list)

In [8]:
data.tail(15)

Unnamed: 0,user_id,movie_id,rating,movie_name
836468,6040,2021,3,Dune (1984)
836469,6040,2022,5,"Last Temptation of Christ, The (1988)"
836470,6040,2028,5,Saving Private Ryan (1998)
836471,6040,1080,4,Monty Python's Life of Brian (1979)
836472,6040,1089,4,Reservoir Dogs (1992)
836473,6040,1090,3,Platoon (1986)
836474,6040,1094,5,"Crying Game, The (1992)"
836475,6040,562,5,Welcome to the Dollhouse (1995)
836476,6040,1096,4,Sophie's Choice (1982)
836477,6040,1097,4,E.T. the Extra-Terrestrial (1982)


### 4\. Build CSR Matrix

In [9]:
user_to_idx = {v:k for k,v in enumerate(data['user_id'].unique())}
movie_to_idx = {v:k for k,v in enumerate(data['movie_id'].unique())}
name_to_idx = {v:k for k,v in enumerate(data['movie_name'].unique())}

In [10]:
def data_indexing():
    temp_user_data = data['user_id'].map(user_to_idx.get).dropna()
    if len(temp_user_data) == len(data):
        print('user_id column indexing OK!!')
        data['user_id'] = temp_user_data
    else:
        print('user_id column indexing Fail!!')

    temp_movie_data = data['movie_id'].map(movie_to_idx.get).dropna()
    if len(temp_movie_data) == len(data):
        print('movie column indexing OK!! \n')
        data['movie_id'] = temp_movie_data
    else:
        print('movie column indexing Fail!! \n')

In [11]:
print(data)

data_indexing()

print(data)

   user_id  movie_id  rating                              movie_name
0        1      1193       5  One Flew Over the Cuckoo's Nest (1975)
1        1       661       3        James and the Giant Peach (1996)
2        1       914       3                     My Fair Lady (1964)
3        1      3408       4                  Erin Brockovich (2000)
4        1      2355       5                    Bug's Life, A (1998)
..     ...       ...     ...                                     ...
0     Theo       339       5          While You Were Sleeping (1995)
1     Theo       589       4       Terminator 2: Judgment Day (1991)
2     Theo      1721       4                          Titanic (1997)
3     Theo      2059       5                 Parent Trap, The (1998)
4     Theo      2571       5                      Matrix, The (1999)

[836483 rows x 4 columns]
user_id column indexing OK!!
movie column indexing OK!! 

    user_id  movie_id  rating                              movie_name
0         0      

In [12]:
from scipy.sparse import csr_matrix

csr_data = csr_matrix((data.rating, (data.user_id, data.movie_id)), 
                      shape=(data['user_id'].nunique(), data['movie_id'].nunique()))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

### 5\. Build ALS Model

In [13]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [14]:
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)
als_model.fit(csr_data.T)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




### 6\. Get prediction score from the system

In [15]:
def prediction_score(user, movie):
    user_vector, movie_vector = als_model.user_factors[user_to_idx[user]], als_model.item_factors[name_to_idx[movie]] 
    return np.dot(user_vector, movie_vector)

In [16]:
prediction_score('Theo', 'Titanic (1997)')

0.5774534

In [17]:
prediction_score('Theo', 'Clueless (1995)')

0.056908254

### 7\. Find movies that are similar to favorite movie

In [18]:
idx_to_name = {v:k for k,v in name_to_idx.items()}

def get_similar_movie(movie_name, N=10):
    # will drop the top pick because it always return the inputted movie
    similar_movie = als_model.similar_items(name_to_idx[movie_name], N=N+1) 
    similar_movie = [(idx_to_name[i[0]],i[1]) for i in similar_movie]
    return pd.DataFrame(similar_movie[1:], index=range(1,N+1), columns=['movie_name', 'similarity']).merge(
        movies, how='left', on='movie_name').drop(columns=['movie_id'])

In [19]:
get_similar_movie('While You Were Sleeping (1995)', 15)

Unnamed: 0,movie_name,similarity,genres
0,My Best Friend's Wedding (1997),0.821211,Comedy|Romance
1,Sabrina (1995),0.813601,Comedy|Romance
2,You've Got Mail (1998),0.78259,Comedy|Romance
3,Sleepless in Seattle (1993),0.765284,Comedy|Romance
4,"Truth About Cats & Dogs, The (1996)",0.734693,Comedy|Romance
5,Pretty Woman (1990),0.715697,Comedy|Romance
6,I.Q. (1994),0.687633,Comedy|Romance
7,Fools Rush In (1997),0.66786,Comedy|Romance
8,Notting Hill (1999),0.646262,Comedy|Romance
9,French Kiss (1995),0.645604,Comedy|Romance


### 8\. Get recommendation from the system that user will like

In [20]:
def movie_recommendation(user, N=20):
    movie_recommended = als_model.recommend(user_to_idx[user], csr_data, N=N, filter_already_liked_items=True)
    movie_recommended = [(idx_to_name[i[0]],i[1]) for i in movie_recommended]
    return pd.DataFrame(movie_recommended, index=range(1,N+1), columns=['movie_name', 'score']).merge(
        movies, how='left', on='movie_name').drop(columns=['movie_id'])

In [21]:
movie_recommendation('Theo', 30)

Unnamed: 0,movie_name,score,genres
0,Jurassic Park (1993),0.359388,Action|Adventure|Sci-Fi
1,Men in Black (1997),0.343828,Action|Adventure|Comedy|Sci-Fi
2,"Terminator, The (1984)",0.283614,Action|Sci-Fi|Thriller
3,Total Recall (1990),0.261715,Action|Adventure|Sci-Fi|Thriller
4,You've Got Mail (1998),0.245169,Comedy|Romance
5,Star Wars: Episode IV - A New Hope (1977),0.24015,Action|Adventure|Fantasy|Sci-Fi
6,Alien (1979),0.240015,Action|Horror|Sci-Fi|Thriller
7,Sleepless in Seattle (1993),0.229046,Comedy|Romance
8,"Sixth Sense, The (1999)",0.219925,Thriller
9,Speed (1994),0.217038,Action|Romance|Thriller


In [22]:
def movie_explain(user, movie):
    explain = als_model.explain(user_to_idx[user], csr_data, itemid=name_to_idx[movie])
    explain = [(idx_to_name[i[0]], i[1]) for i in explain[1]]
    return pd.DataFrame(explain, index=range(1,len(explain)+1), columns=['movie_name', 'score']).merge(
        movies, how='left', on='movie_name').drop(columns=['movie_id'])

In [23]:
movie_explain('Theo','Jurassic Park (1993)')

Unnamed: 0,movie_name,score,genres
0,Terminator 2: Judgment Day (1991),0.205665,Action|Sci-Fi|Thriller
1,"Matrix, The (1999)",0.121103,Action|Sci-Fi|Thriller
2,Titanic (1997),0.034162,Drama|Romance
3,While You Were Sleeping (1995),0.0061,Comedy|Romance
4,"Parent Trap, The (1998)",-0.012813,Children's|Drama
