<a href="https://colab.research.google.com/github/zhukuixi/Udacity_DataScientistNanoDegree/blob/main/Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/zhukuixi/Udacity_DataScientistNanoDegree

Cloning into 'Udacity_DataScientistNanoDegree'...
remote: Enumerating objects: 283, done.[K
remote: Counting objects: 100% (106/106), done.[K
remote: Compressing objects: 100% (75/75), done.[K
remote: Total 283 (delta 55), reused 69 (delta 29), pack-reused 177[K
Receiving objects: 100% (283/283), 135.38 MiB | 16.17 MiB/s, done.
Resolving deltas: 100% (111/111), done.
Updating files: 100% (46/46), done.


# 1.Data Preparation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import progressbar

%matplotlib inline

# Read in the datasets
# Both files sit in the same folder of the cloned repository and use '::' as the
# field separator, which is why the python parsing engine is requested.
DATA_DIR = '/content/Udacity_DataScientistNanoDegree/MovieTweet/data'

movies = pd.read_csv(f'{DATA_DIR}/original_movies.dat',
                     delimiter='::',
                     header=None,
                     names=['movie_id', 'movie', 'genre'],
                     dtype={'movie_id': object}, engine='python')

reviews = pd.read_csv(f'{DATA_DIR}/original_ratings.dat',
                      delimiter='::',
                      header=None,
                      names=['user_id', 'movie_id', 'rating', 'timestamp'],
                      dtype={'movie_id': object, 'user_id': object, 'timestamp': object},
                      engine='python')

# Reduce the size of the reviews dataset.
# .loc slices by label and includes the end label, so rows 0..100000 (100001 rows) are kept.
# .copy() detaches the subset from the full table so that columns can be added to it later
# without chained-assignment warnings.
reviews = reviews.loc[:100000, :].copy()

## 1.1 Data Exploration


In [None]:
# Show the first rows of each table, each followed by a blank spacer
for frame in (movies, reviews):
    print(frame.head())
    print("\n")

# Fraction of missing values per column, movies first and reviews second
print(movies.isna().mean())
print("\n")
print(reviews.isna().mean())

  movie_id                                              movie  \
0  0000008      Edison Kinetoscopic Record of a Sneeze (1894)   
1  0000010                La sortie des usines Lumière (1895)   
2  0000012                      The Arrival of a Train (1896)   
3       25  The Oxford and Cambridge University Boat Race ...   
4  0000091                         Le manoir du diable (1896)   

               genre  
0  Documentary|Short  
1  Documentary|Short  
2  Documentary|Short  
3                NaN  
4       Short|Horror  


  user_id  movie_id  rating   timestamp
0       1   0114508       8  1381006850
1       2   0208092       5  1586466072
2       2   0358273       9  1579057827
3       2  10039344       5  1578603053
4       2   6751668       9  1578955697


movie_id    0.000000
movie       0.000000
genre       0.007018
dtype: float64


user_id      0.0
movie_id     0.0
rating       0.0
timestamp    0.0
dtype: float64


In [None]:
# Headline statistics of the two tables, keyed by a human-readable description
rating_col = reviews['rating']
# One row per individual genre label (the genre column holds '|'-separated labels)
genre_labels = movies['genre'].str.split("|").explode()

dict_sol1 = {
    'The number of movies in the dataset': len(movies),
    'The number of ratings in the dataset': len(reviews),
    'The number of different genres': genre_labels.nunique(),
    'The number of unique users in the dataset': reviews['user_id'].nunique(),
    'The number missing ratings in the reviews dataset': sum(pd.isna(rating_col)),
    'The average rating given across all ratings': rating_col.mean(),
    'The minimum rating given across all ratings': rating_col.min(),
    'The maximum rating given across all ratings': rating_col.max(),
}
dict_sol1

{'The number of movies in the dataset': 35479,
 'The number of ratings in the dataset': 100001,
 'The number of different genres': 28,
 'The number of unique users in the dataset': 8022,
 'The number missing ratings in the reviews dataset': 0,
 'The average rating given across all ratings': 7.397666023339767,
 'The minimum rating given across all ratings': 0,
 'The maximum rating given across all ratings': 10}

## 1.2 Data Cleaning

We need to pull some additional relevant information out of the existing columns.  

For each of the datasets, there are a couple of cleaning steps we need to take care of:  

Movies  
- Pull the date from the title and create new column  
- Dummy the date column with 1's and 0's for each century of a movie (1800's, 1900's, and 2000's)  
- Dummy column the genre with 1's and 0's for each genre  

Reviews  
- Create a date out of time stamp  



In [None]:
# The release year is taken from the four characters inside the trailing "(YYYY)" of the title.
movies['date'] = movies['movie'].str[-5:-1]
# One indicator column per century label ("1800's", "1900's", "2000's"), derived from the
# first two digits of the year. dtype=int keeps the indicators as 0/1 integers on every
# pandas version (newer releases default to booleans), matching the integer genre columns
# that are added to movies_new afterwards.
dummy_time = pd.get_dummies(movies['date'].str[:2] + "00's", dtype=int)
movies_new = pd.concat([movies, dummy_time], axis=1)

In [None]:
# Collect every distinct genre label found in the '|'-separated genre strings
# (movies without a genre are skipped).
total_genres = set(movies_new['genre'].dropna().str.split("|").explode())

def getCategory(x,g):
  '''
  INPUT
  x - the '|'-separated genre string of one movie, or a missing value
  g - the genre label to look for

  OUTPUT
  1 if g is one of the labels listed in x, otherwise 0 (also 0 when x is missing)
  '''
  if pd.isna(x):
    return 0
  # Compare against the individual labels: a plain substring test would count
  # "Music" as present in "Musical".
  return 1 if g in x.split("|") else 0

# Add one 0/1 indicator column per genre.
# Iterating in sorted order gives the same column order on every run; the iteration
# order of a set of strings can change from one Python session to the next.
for g in sorted(total_genres):
  movies_new[g] = movies_new['genre'].map(lambda x:getCategory(x,g))




In [None]:
# Preview the movie table with the added date, century and genre indicator columns
movies_new.head()

Unnamed: 0,movie_id,movie,genre,date,1800's,1900's,2000's,Family,Fantasy,Drama,...,Short,War,Talk-Show,Adventure,Adult,Reality-TV,Mystery,Comedy,Horror,Documentary
0,8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short,1894,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,10,La sortie des usines Lumière (1895),Documentary|Short,1895,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,12,The Arrival of a Train (1896),Documentary|Short,1896,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,25,The Oxford and Cambridge University Boat Race ...,,1895,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,91,Le manoir du diable (1896),Short|Horror,1896,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [None]:
from datetime import datetime
# Convert the epoch-second timestamps (stored as strings) to datetimes in one vectorized call.
# With unit='s' the values are read as seconds since the Unix epoch in UTC, so the result
# is the same on every machine regardless of its local timezone setting.
reviews['date'] = pd.to_datetime(reviews['timestamp'].astype('int64'), unit='s')
# reviews_new is another name for the same frame, not a copy
reviews_new = reviews
reviews_new.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,date
0,1,114508,8,1381006850,2013-10-05 21:00:50
1,2,208092,5,1586466072,2020-04-09 21:01:12
2,2,358273,9,1579057827,2020-01-15 03:10:27
3,2,10039344,5,1578603053,2020-01-09 20:50:53
4,2,6751668,9,1578955697,2020-01-13 22:48:17


# 2.Knowledge Based Recommendation
Here we implement a recommendation system based on the most popular items.

## 2.1 Part I: How To Find The Most Popular Movies?    
For this part, we have a single task. The task is that no matter the user, we need to provide a list of the recommendations based on simply the most popular items.    

For this task, we will consider what is "most popular" based on the following criteria:  

- A movie with the highest **average rating** is considered best.
- With ties, movies that have **more ratings** are better.
- A movie must have a **minimum of 5 ratings** to be considered among the best movies.
- If movies are tied in both their average rating and number of ratings, the movie with the **most recent rating** ranks higher.
  
With these criteria, the goal for this notebook is to take a user_id and provide back the n_top recommendations. Use the function below as the scaffolding that will be used for all the future recommendations as well.
  
Before you implement your code for popular_recommendations function, we will provide a helper function, called create_ranked_df. This helper function transforms movies and reviews dataframes into a ranked_movies dataframe of movies that are sorted by the highest average rating & time and have more than 4 ratings.

In [None]:
# This helper function transforms `movies` and `reviews` dataframes
# into a `ranked_movies` dataframe of movies that are sorted
# by the highest average rating & time and have more than 4 ratings.

def create_ranked_df(movies, reviews, min_ratings=5):
    '''
    INPUT
    movies - the movies dataframe; must contain a 'movie_id' column
    reviews - the reviews dataframe; must contain 'movie_id', 'rating' and 'date' columns
    min_ratings - (int) the minimum number of ratings a movie needs to be kept (default 5)

    OUTPUT
    ranked_movies - the movies dataframe indexed by movie_id, extended with the columns
                    'avg_rating', 'num_ratings' and 'last_rating', sorted by highest
                    average rating, then most ratings, then most recent rating, and
                    restricted to movies with at least min_ratings ratings
    '''

    # Aggregate only the columns that are needed, in a single pass over the reviews:
    # mean rating, number of ratings and date of the latest rating for each movie.
    rating_stats = reviews.groupby('movie_id').agg(
        avg_rating=('rating', 'mean'),
        num_ratings=('rating', 'count'),
        last_rating=('date', 'max'),
    )

    # Attach the statistics to the movie table; movies without any review get NaN here
    movie_recs = movies.set_index('movie_id').join(rating_stats)

    # Best first: average rating, then number of ratings, then recency of the last rating
    ranked_movies = movie_recs.sort_values(['avg_rating', 'num_ratings', 'last_rating'], ascending=False)

    # Drop movies with too few ratings (this also removes the unrated movies, whose count is NaN)
    ranked_movies = ranked_movies[ranked_movies['num_ratings'] >= min_ratings]

    return ranked_movies

# Build the ranked movie table once; the recommendation functions below take it as an argument
ranked_movies = create_ranked_df(movies_new, reviews_new)

In [None]:
def popular_recommendations(user_id, n_top, ranked_movies):
    '''
    INPUT:
    user_id - the user_id of the individual you are making recommendations for
              (not used: the popularity ranking is the same for every user)
    n_top - an integer of the number recommendations you want back
    ranked_movies - a dataframe of movies already sorted from best to worst,
                    with the titles in a 'movie' column

    OUTPUT:
    top_movies - a list of the n_top recommended movies by movie title in order best to worst
    '''

    # The frame is already ranked, so the leading titles are the recommendations
    best_titles = ranked_movies['movie'][:n_top]
    return list(best_titles)

In [None]:
# Top 20 movies recommended for id 1
# (the user id has no effect on the result: the ranking is purely popularity based)
recs_20_for_1 = popular_recommendations(1,20,ranked_movies)
recs_20_for_1

['Be Somebody (2016)',
 'Doctor Zhivago (1965)',
 'Taare Zameen Par (2007)',
 'Coldplay: A Head Full of Dreams (2018)',
 'City Lights (1931)',
 'Nema-ye Nazdik (1990)',
 'The Lord of the Rings: The Return of the King (2003)',
 'Tarzan (1999)',
 'Mimi wo sumaseba (1995)',
 'Drishyam (2015)',
 '12 Angry Men (1957)',
 'The Shawshank Redemption (1994)',
 'La meglio gioventù (2003)',
 "It's a Wonderful Life (1946)",
 'The Lord of the Rings: The Two Towers (2002)',
 'The Sound of Music (1965)',
 'Hotaru no haka (1988)',
 'Terminator 2: Judgment Day (1991)',
 'Hiroshima mon amour (1959)',
 'Aladdin (1992)']

## 2.2 Part II: Adding Filters    
Now that you have created a function to give back the n_top movies, let's make it a bit more robust. Add arguments that will act as filters for the movie year and genre.  

Use the cells below to adjust your existing function to allow for year and genre arguments as lists of strings. Then your ending results are filtered to only movies within the lists of provided years and genres (as or conditions). If no list is provided, there should be no filter applied.
  
You can adjust other inputs as necessary to retrieve the final results you are looking for!  

In [None]:
def popular_recs_filtered(user_id, n_top, ranked_movies, years=None, genres=None):
    '''
    INPUT:
    user_id - the user_id of the individual you are making recommendations for
              (not used: the popularity ranking is the same for every user)
    n_top - an integer of the number recommendations you want back
    ranked_movies - a pandas dataframe of the already ranked movies based on avg rating, count, and time;
                    needs a 'movie' column, a 'date' column and one indicator column per genre
    years - a list of strings with years of movies; None applies no year filter
    genres - a list of strings with genres of movies; None applies no genre filter

    OUTPUT:
    top_movies - a list of the n_top recommended movies by movie title in order best to worst
    '''

    # Start by keeping every row; each supplied filter then narrows the selection.
    # A positional boolean array is used so that the mask is always a valid row selector,
    # including when neither filter is given.
    keep = np.ones(len(ranked_movies), dtype=bool)

    # Step 1: filter movies based on year and genre
    if years is not None:
        keep &= ranked_movies['date'].isin(years).values
    if genres is not None:
        # "or" across genres: a movie qualifies if it carries at least one of them
        keep &= (ranked_movies[genres].sum(axis=1) > 0).values

    # Step 2: create top movies list
    top_movies = ranked_movies.loc[keep, 'movie'].iloc[:n_top].tolist()

    return top_movies



In [None]:
# Top 20 movies recommended for id 1 with years=['2015', '2016', '2017', '2018'], genres=['History']
recs_20_for_1_filtered = popular_recs_filtered(1,20,ranked_movies,years=['2015', '2016', '2017', '2018'], genres=['History'])
# Top 10 movies recommended for id 70000 with no year filter but genres=['History', 'News']
recs_10_for_70000_filtered = popular_recs_filtered(70000,10,ranked_movies, genres=['History','News'])

# Show both filtered recommendation lists
print(recs_20_for_1_filtered)
print(recs_10_for_70000_filtered)

['Taeksi woonjunsa (2017)', 'Ayla: The Daughter of War (2017)', 'Hacksaw Ridge (2016)', 'They Shall Not Grow Old (2018)', 'Straight Outta Compton (2015)', 'Hidden Figures (2016)', '13th (2016)', 'Little Boy (2015)', 'Under sandet (2015)', 'Hotel Mumbai (2018)', 'Darkest Hour (2017)', 'Kono sekai no katasumi ni (2016)', 'Bridge of Spies (2015)', 'Woman in Gold (2015)', 'The Birth of a Nation (2016)', 'The Big Short (2015)', 'Dunkirk (2017)', 'Victoria &amp; Abdul (2017)', 'Anthropoid (2016)', 'Truth (2015)']
['Hotel Rwanda (2004)', "Schindler's List (1993)", 'Amadeus (1984)', 'Gone with the Wind (1939)', 'Lawrence of Arabia (1962)', 'Braveheart (1995)', 'Barry Lyndon (1975)', 'Gandhi (1982)', 'Taeksi woonjunsa (2017)', 'Before the Flood (2016)']


# 3.Collaborative Filtering Recommendation
Neighborhood-based method
- Get the user-item matrix
- Get the neighborhood using euclidean distance
- Recommend movies watched and liked by neighbors in neighborhood but
  have not been watched by the user


## Get the user-item matrix

In [None]:
# Keep only the columns needed for the user-item matrix.
# astype with a column mapping returns a new frame, so the integer conversion of user_id
# is never written into a slice of `reviews`.
user_items = reviews[['user_id', 'movie_id', 'rating']].astype({'user_id': 'int'})
user_items.head()


In [None]:
# Create user-by-item matrix
# Rows are user ids, columns are movie ids, cells hold the rating.
# pivot_table aggregates with the mean by default, so repeated ratings of the same movie
# by the same user would be averaged; user/movie pairs without a rating are NaN.
user_by_movie = user_items.pivot_table(index='user_id',columns='movie_id',values='rating')


In [None]:
# Create a dictionary with users and corresponding movies seen

def create_user_movie_dict():
    '''
    INPUT: None (reads the global user_by_movie matrix)
    OUTPUT: movies_seen - a dictionary where each key is a user_id (int) and the value
                          is the list of movie_ids that user has a rating for

    Creates the movies_seen dictionary
    '''

    movies_seen = {}
    for user_id, ratings_row in user_by_movie.iterrows():
        # A movie counts as seen when its cell in the user's row is not NaN
        movies_seen[int(user_id)] = list(ratings_row.dropna().index)

    return movies_seen


def movies_watched(user_id):
    '''
    INPUT:
    user_id - the user_id of an individual as int
    OUTPUT:
    movies - the movie_ids stored for this user in the global movies_seen dictionary
             (raises KeyError for an unknown user_id)
    '''

    return movies_seen[user_id]

# Build the lookup once; movies_watched reads it as a global
movies_seen = create_user_movie_dict()

In [None]:
# Remove individuals who have watched 2 or fewer movies - don't have enough data to make recs

def create_movies_to_analyze(movies_seen, lower_bound=2):
    '''
    INPUT:
    movies_seen - a dictionary where each key is a user_id and the value is an array of movie_ids
    lower_bound - (an int) a user must have more movies seen than the lower bound to be added to the movies_to_analyze dictionary

    OUTPUT:
    movies_to_analyze - a dictionary where each key is a user_id and the value is an array of movie_ids

    The output is the same as movies_seen except that users who have seen
    lower_bound or fewer movies are left out.
    '''

    return {
        user: watched
        for user, watched in movies_seen.items()
        if len(watched) > lower_bound
    }

# Default lower_bound=2: only users with more than two rated movies are kept
movies_to_analyze = create_movies_to_analyze(movies_seen)

## Compute User Distance

In [None]:
def compute_euclidean_dist(user1, user2):
    '''
    INPUT
    user1 - int user_id
    user2 - int user_id
    OUTPUT
    the euclidean distance between the ratings of user1 and user2 over the movies
    both have rated, or NaN when they have no movie in common

    Reads the global movies_to_analyze dictionary and user_by_movie matrix.
    '''

    common_movie = list(set(movies_to_analyze[user1]) & set(movies_to_analyze[user2]))

    # Without any shared movie there is nothing to compare. Returning 0 here would make
    # two unrelated users look identical, so the distance is reported as undefined.
    if not common_movie:
        return np.nan

    # Two rows (user1, user2), one column per shared movie
    ratings = np.asarray(user_by_movie.loc[[user1, user2], common_movie], dtype=float)
    dist = np.sqrt(((ratings[0] - ratings[1]) ** 2).sum())
    return dist

In [None]:
# Compute the euclidean distance between users
# Here, for simplicity, only the first 50 users of movies_to_analyze are compared

unique_user = list(movies_to_analyze.keys())[:50]
pair_rows = []
for pos, user1 in enumerate(unique_user):
  # Each unordered pair is computed once ...
  for user2 in unique_user[pos + 1:]:
    dist = compute_euclidean_dist(user1, user2)
    # ... and stored in both directions, so every user appears in the user1 column
    pair_rows.append([user1, user2, dist])
    pair_rows.append([user2, user1, dist])

df_dist = pd.DataFrame(pair_rows, columns=['user1','user2','eucl_dist'])


## Build Recommendation Engine

In [None]:
def find_closest_neighbors(user):
    '''
    INPUT:
        user - (int) the user_id of the individual you want to find the closest users
    OUTPUT:
        closest_neighbors - a list of the id's of the other users sorted from closest to farthest away

    Reads the global df_dist table (columns user1, user2, eucl_dist).
    '''
    # Exclude the user explicitly instead of dropping the first sorted row:
    # when df_dist holds no (user, user) pair, the first row is the nearest neighbor.
    is_neighbor_row = (df_dist['user1'] == user) & (df_dist['user2'] != user)
    closest_neighbors = list(df_dist[is_neighbor_row].sort_values('eucl_dist')['user2'])
    return closest_neighbors



def movies_liked(user_id, min_rating=7):
    '''
    INPUT:
    user_id - the user_id of an individual as int
    min_rating - the lowest rating that still counts as a "like" rather than a "dislike"
    OUTPUT:
    movies_liked - a list of the movie_ids the user has rated with min_rating or higher

    Reads the global user_items table.
    '''
    by_this_user = user_items['user_id'] == user_id
    rated_high_enough = user_items['rating'] >= min_rating

    return user_items.loc[by_this_user & rated_high_enough, 'movie_id'].tolist()


def movie_names(movie_ids):
    '''
    INPUT
    movie_ids - a list of movie_ids
    OUTPUT
    movies - a list of the titles of those movies, in the order in which they
             appear in the global movies table (not in the order of movie_ids);
             ids missing from the table are ignored
    '''
    wanted = movies['movie_id'].isin(movie_ids)
    return movies.loc[wanted, 'movie'].tolist()


def make_recommendations(user, num_recs=10):
    '''
    INPUT:
        user - (int) a user_id of the individual you want to make recommendations for
        num_recs - (int) number of movies to return
    OUTPUT:
        recommendations - a list of movie titles, closest neighbor's movies first - if there are
                          "num_recs" recommendations return this many, otherwise return the total
                          number of recommendations available for the "user", which may just be
                          an empty list
    '''
    # movies_to_analyze stores movie ids, so the "already seen" check has to be done on
    # ids before they are translated into titles.
    seen_ids = set(movies_to_analyze[user])

    recommendations = []
    for neigh in find_closest_neighbors(user):
        unseen_ids = [movie_id for movie_id in movies_liked(neigh) if movie_id not in seen_ids]
        for title in movie_names(unseen_ids):
            # A list keeps the neighbor order; skip titles that an earlier neighbor already added
            if title not in recommendations:
                recommendations.append(title)
        # Stop as soon as enough titles have been collected
        if len(recommendations) >= num_recs:
            break

    return recommendations[:num_recs]

def all_recommendations(num_recs=10):
    '''
    INPUT
        num_recs (int) the (max) number of recommendations for each user
    OUTPUT
        all_recs - a dictionary where each key is a user_id from the global unique_user list
                   and the value is the list of movie titles recommended for that user
    '''
    return {user_id: make_recommendations(user_id, num_recs) for user_id in unique_user}

# Up to 10 recommendations for every user in unique_user
all_recs = all_recommendations(10)

In [None]:
# Consistency checks: the stored recommendations must equal a fresh call for the same user.
# all_recs only contains the users listed in unique_user, so looking up a user outside
# that list raises KeyError before the comparison is made.
assert all_recs[100] == make_recommendations(100), "Oops!  Your recommendations for user 100 didn't match ours."
assert all_recs[1001] == make_recommendations(1001), "Oops!  Your recommendations for user 1001 didn't match ours."
assert all_recs[1024] == make_recommendations(1024), "Oops! Looks like your solution for user 1024 didn't match ours."

# Content based Recommendation

In collaborative filtering, we removed users who had watched 2 or fewer movies to keep the system robust. However, this leaves many users without any recommendations. We therefore turn to content-based recommendation to increase the user coverage of the recommendations.

## Use matrix multiplication to get the movie similarity matrix


In [None]:
# Feature matrix: one row per movie, one column per century / genre indicator.
# The integer dtype is requested explicitly so that the result is a numeric array even
# when some indicator columns are stored as booleans and others as integers.
movies_content = np.array(movies_new.set_index('movie_id').drop(['movie','genre','date'],axis=1), dtype=int)

In [None]:
# Movie-by-movie similarity: entry (i, j) is the dot product of the indicator rows of
# movies i and j. NOTE: this is a dense n_movies x n_movies array.
dot_prod_movies = movies_content.dot(movies_content.T)
# Lookup table from movie_id to movie title
movie_id_mapping = dict(zip(movies_new['movie_id'], movies_new['movie']))

For each user, we will perform the following:

i. For each movie, find the movies that are most similar that the user hasn't seen.

ii. Continue through the available, rated movies until 10 recommendations or until there are no additional movies.

In [None]:
def find_similar_movies(movie_id):
    '''
    INPUT
    movie_id - a movie_id
    OUTPUT
    similar_movies - a list with the titles of every movie whose similarity to movie_id
                     equals the highest value found in that movie's row of dot_prod_movies

    Raises IndexError when movie_id is not present in movies_new.
    '''
    # Row position of the requested movie (first match)
    matches = np.where(movies_new['movie_id'] == movie_id)[0]
    similarity = dot_prod_movies[matches[0]]

    # Positions of all movies that reach the top similarity score
    top_positions = np.where(similarity == similarity.max())[0]

    return list(movies_new['movie'].iloc[top_positions])


def get_movie_names(movie_ids):
    '''
    INPUT
    movie_ids - a list of movie_ids
    OUTPUT
    movies - a list of movie titles, one per entry of movie_ids and in the same order,
             looked up in the global movie_id_mapping (KeyError for an unknown id)

    '''
    titles = []
    for movie_id in movie_ids:
        titles.append(movie_id_mapping[movie_id])
    return titles

def make_recs(num_recs=10):
    '''
    INPUT
    num_recs - (int) the maximum number of recommendations per user (default 10)
    OUTPUT
    recs - a dictionary with keys of the user and values of the recommendations
           (a list of at most num_recs movie titles the user has not rated)

    Works on the users in the global unique_user list and reads the global
    user_items table and movie_id_mapping dictionary.
    '''

    recs = {}

    # Create the progressbar
    bar = progressbar.ProgressBar(maxval=len(unique_user)+1, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()

    for cnter, u in enumerate(unique_user, start=1):
      bar.update(cnter)

      # The user's movies, highest rated first, so recommendations start from favorites
      movie_seen_id = user_items.query('user_id==@u').sort_values('rating',ascending=False)['movie_id']
      movie_seen_name = {movie_id_mapping[str(movie_id)] for movie_id in movie_seen_id}

      # A list keeps the order in which titles were found
      recommended = []
      for movie_id in movie_seen_id:
        for title in find_similar_movies(movie_id):
          if title not in movie_seen_name and title not in recommended:
            recommended.append(title)
        # One movie can contribute several titles at once, so test for "at least"
        if len(recommended) >= num_recs:
          break

      recs[u] = recommended[:num_recs]

    bar.finish()
    return recs



In [None]:
# Content-based recommendations for every user in unique_user
recs = make_recs()

