# MovieLens and Recommender System

### System I: Recommendation Based on Genres

In [5]:
import numpy as np
import pandas as pd
import streamlit as st


In [6]:
import warnings
warnings.filterwarnings('ignore')

#### Load the data

In [49]:
ratings = pd.read_csv('ratings.dat', sep='::', engine = 'python', header=None)
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']

movies = pd.read_csv('movies.dat', sep='::', engine = 'python',
                     encoding="ISO-8859-1", header = None)
movies.columns = ['MovieID', 'Title', 'Genres']

users = pd.read_csv(f'users.dat', sep='::', engine = 'python', header = None)
users.columns = ['UserID', 'Gender', 'Age', 'Occupation', 'Zipcode']

rating_matrix = pd.read_csv('Rmat.csv', sep=',')

In [8]:
rating_merged = ratings.merge(movies, left_on = 'MovieID', right_on = 'MovieID')
#rating_merged


### How recommendations are generated by Genre?

* We are going to recommend movies based on highly-rated by users.

* `How to handle movies with multiple genres?` We are denormalizing genre + Movie ID. By doing so, that Movie becomes a recommendation candidate / recall set of both the Genres.

* `A scenario to be addressed:` How do we come up with a score that has Rating & Number of Ratings embedded inside.

* Our idea is, to keep the Rating schema simple. Add a dimnishing factor. And this can shrink the Movies with less ratings more. And perform less shrinkage on movies with more ratings.

* Here is a scheme, we are coming with: 
    * (`avg_rating_of_the_movie * rating_count_of_the_movie` + `min_rating_of_all_movies * avg_rating_count_of_all_movies`) / (`rating_count_of_the_movie + avg_rating_count_of_all_movies`)


#### Weighted Rating computed for each Movie

In [9]:
movie_rating = rating_merged[['MovieID', 'Rating']].groupby("MovieID").agg(['mean', 'count']).droplevel(0, axis=1).reset_index()

movie_rating.rename(columns={"mean": "Rating", "count": "Rating_count"}, inplace=True)


avg_rating_count = movie_rating['Rating_count'].mean() 
#avg_rating = (movie_rating['Rating'] * movie_rating['Rating_count']).sum() / movie_rating['Rating_count'].sum()
#avg_rating = 2.5
avg_rating = movie_rating['Rating'].min()

movie_rating['Weighted_Rating'] = (movie_rating['Rating'] * movie_rating['Rating_count'] + avg_rating * avg_rating_count)  / (movie_rating['Rating_count'] + avg_rating_count)



#### Data frame is built that has Movies with Genres & Weighted Rating

In [10]:
movie_with_rating = movies.join(movie_rating.set_index('MovieID'), how='left', on="MovieID")

movie_with_rating['Weighted_Rating'].fillna(value=avg_rating, inplace=True)

#movie_with_rating.sort_values(by='Rating_count', ascending=False)[0:30]

In [11]:
genre_movie_ratings = movie_with_rating.copy()
genre_movie_ratings['Genres'] = genre_movie_ratings['Genres'].str.split('|')
genre_movie_ratings = genre_movie_ratings.explode('Genres')
#genre_movie_ratings

### Find movie by genre

In [12]:
def get_all_genre():
    genres = genre_movie_ratings['Genres'].unique()
    
    return genres

In [13]:
def find_top_movies_by_genre(genre, n=10):
    top_movies = genre_movie_ratings[genre_movie_ratings['Genres'] == genre]


    top_movies = top_movies.sort_values(by='Weighted_Rating', ascending=False)
    
    top_movies = top_movies[0:n]
    return top_movies



In [14]:
get_all_genre()

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [15]:
find_top_movies_by_genre(genre='Drama', n=10)

Unnamed: 0,MovieID,Title,Genres,Rating,Rating_count,Weighted_Rating
315,318,"Shawshank Redemption, The (1994)",Drama,4.554558,2227.0,4.170345
847,858,"Godfather, The (1972)",Drama,4.524966,2223.0,4.143341
523,527,Schindler's List (1993),Drama,4.510417,2304.0,4.142327
2789,2858,American Beauty (1999),Drama,4.317386,3428.0,4.075268
589,593,"Silence of the Lambs, The (1991)",Drama,4.351823,2578.0,4.034177
1959,2028,Saving Private Ryan (1998),Drama,4.337354,2653.0,4.029195
1178,1196,Star Wars: Episode V - The Empire Strikes Back...,Drama,4.292977,2990.0,4.020348
604,608,Fargo (1996),Drama,4.254676,2513.0,3.939032
900,912,Casablanca (1942),Drama,4.412822,1669.0,3.937765
1176,1193,One Flew Over the Cuckoo's Nest (1975),Drama,4.390725,1725.0,3.931993


### System II: Recommendation Based on IBCF

In [16]:
def get_random_movie_set(n=10):
    movie_set = movies.sample(n)
    return movie_set

get_random_movie_set()

Unnamed: 0,MovieID,Title,Genres
2053,2122,Children of the Corn (1984),Horror|Thriller
1274,1294,M*A*S*H (1970),Comedy|War
470,474,In the Line of Fire (1993),Action|Thriller
3477,3546,What Ever Happened to Baby Jane? (1962),Drama|Thriller
1147,1163,Mina Tannenbaum (1994),Drama
2637,2706,American Pie (1999),Comedy
1803,1872,Go Now (1995),Drama
1310,1330,April Fool's Day (1986),Comedy|Horror
3301,3370,Betrayed (1988),Drama|Thriller
925,937,Love in the Afternoon (1957),Comedy|Romance


### Building Similarity Matrix

* Filtering top 30 in Similarity Matrix is implemented, but disabled.

* As we had difficulty matching the results of Similarity matrix with the rest of the Students group, we will be performing the filtering during the Prediction time



In [17]:
def build_similarity_matrix_v2():
    #rating_matrix = ratings.pivot_table(index="UserID", columns="MovieID", values="Rating")
    rating_matrix = pd.read_csv('Rmat.csv', sep=',')

    normalized_rating_matrix = rating_matrix.subtract(rating_matrix.mean(axis=1), axis='rows')

    cardinality_df = (~normalized_rating_matrix.isna()).astype('int')
    cardinality_df = cardinality_df.T
    cardinality_matrix = cardinality_df @ cardinality_df.T
    
    normalized_rating_matrix = normalized_rating_matrix.T
    normalized_rating_matrix = normalized_rating_matrix.fillna(0)

    nr = normalized_rating_matrix @ normalized_rating_matrix.T
    #print(nr)

    squared_normalized_rating_matrix = ((normalized_rating_matrix**2) @ (normalized_rating_matrix!=0).T)
    squared_normalized_rating_matrix = squared_normalized_rating_matrix.apply(np.vectorize(np.sqrt))
    dr = squared_normalized_rating_matrix * squared_normalized_rating_matrix.T
    #print(dr)
    
    cosine_distance = nr/dr
    S = (1 + cosine_distance)/2
    #print(S)
    
    np.fill_diagonal(S.values, np.nan)

    S[cardinality_matrix<3] = None
    #print(S)
    
    #S[S.rank(axis=1, ascending=False)>30] = None
    #print(S)
    return S

In [18]:
S = build_similarity_matrix_v2()

#### Display the pairwise similarity values from the S matrix for the following specified movies: “m1”, “m10”, “m100”, “m1510”, “m260”, “m3212”. 

In [19]:
idx = np.array(["m1", "m10", "m100", "m1510", "m260", "m3212"])
S.loc[idx, idx]

Unnamed: 0,m1,m10,m100,m1510,m260,m3212
m1,,0.512106,0.392,,0.741597,
m10,0.512106,,0.547458,,0.534349,
m100,0.392,0.547458,,,0.329694,
m1510,,,,,,
m260,0.741597,0.534349,0.329694,,,
m3212,,,,,,


### myIBCF - Custom implementation for Item Based Collaborative filtering

* Computes the item scores based on User's recent ratings & returns the top n items.

* If the number of recommendations are below threshold, results are backfilled with Genre based recommendation model, built for System 1.

In [69]:
def myIBCF(S, w, n=10):
    S = S.copy()
    S = S.fillna(0)

    w = w.copy()
    identity = (~w.isna()).astype(int)
    w = w.fillna(0)

    reco_movies = w.dot(S) / identity.dot(S)
    reco_movies = reco_movies.sort_values(ascending=False)[0:n]
    
    reco_movies = reco_movies.dropna()
    
    if reco_movies.size < n:
        print("Backfilling from Genre based recommendations")        
        backfill_count = n - reco_movies.size
        random_genre = np.random.choice(get_all_genre())
        backfill_df = find_top_movies_by_genre(genre=random_genre, n=backfill_count)
        
        backfill_movies = pd.Series(data=backfill_df["Weighted_Rating"].values, 
                                    index=("m" +backfill_df["MovieID"].astype(str)).values)
        reco_movies = pd.concat([reco_movies, backfill_movies], axis=0)
    
    return reco_movies
    

#### Test cases for `myIBCF` 

In [74]:
user_rating = rating_matrix.loc["u1181"].copy()
user_rating
print(myIBCF(S, user_rating))

m749     4.526559
m3899    4.526066
m1039    4.000000
m3288    3.850019
m3232    3.726343
m853     3.682413
m729     3.671729
m2129    3.593696
m3126    3.535438
m53      3.529714
Name: u1181, dtype: float64


In [52]:
user_rating = rating_matrix.loc["u1351"].copy()
user_rating
print(myIBCF(S, user_rating))

m404     5.000000
m3373    5.000000
m2869    5.000000
m2623    5.000000
m1877    5.000000
m744     5.000000
m853     5.000000
m2934    4.940172
m3166    4.835657
m1532    4.822379
Name: u1351, dtype: float64


In [57]:
row = S.iloc[0, :]
user_rating = row.copy()
user_rating[:] = np.nan
user_rating["m1613"] = 5
user_rating["m1755"] = 4

print(myIBCF(S, user_rating))

m1661    5.0
m3715    5.0
m3224    5.0
m1366    5.0
m61      5.0
m3567    5.0
m234     5.0
m2729    5.0
m3296    5.0
m947     5.0
Name: m1, dtype: float64


##### Test for Backfill - where there is no recommendations from IBCF model

In [72]:
row = S.iloc[0, :]
user_rating = row.copy()
user_rating[:] = np.nan

print(myIBCF(S, user_rating))


Backfilling from Genre based recommendations
m919     3.806998
m1       3.784963
m3114    3.750569
m1097    3.649978
m34      3.505333
m2355    3.463899
m3751    3.393537
m1073    3.373508
m588     3.324033
m364     3.305719
dtype: float64
