In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import itertools
import warnings
warnings.filterwarnings("ignore")

In [2]:
class DataHandler:
    
    """
    class just to get data
    """
    def __init__(self, verbose = False):
        self.verbose = verbose
        self.get_data()
        
        
    def get_data(self):
        if self.verbose:
            print('Getting ratings matrix')
        self.ratings_data = pd.read_csv('data.csv')
        
        #Ratings matrix
        if self.verbose:
            print('Getting ratings data')
        ratings = pd.read_csv('ml-1m/ratings.dat', sep ='::', header = None)
        ratings.columns =['UserID', 'MovieID', 'Rating', 'Timestamp']
        self.ratings = ratings
        
        #Movies matrix
        if self.verbose:
            print('Getting movies data')
        movies = pd.read_csv('ml-1m/movies.dat', sep ='::', on_bad_lines ='skip', encoding_errors='ignore', header= None)
        movies.columns = ['MovieID', 'Title', 'Genres']
        self.movies = movies
        
        #Users matrix
        if self.verbose:
            print('Getting user data')
        users = pd.read_csv('ml-1m/users.dat', sep ='::', on_bad_lines ='skip', encoding_errors='ignore', header= None)
        users.columns = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']
        self.users = users

In [3]:
DataClass = DataHandler()

## System 1

Taking the mean ranking of all movies in a certain genre is obviously the easiest way, but it is too elementary and makes the assumption that all rankings are the same. We use a few different ranking metrics and gather them to form a final ranking metric. <br>

**1) Reliability Adjusted Mean Ratings** <br>
We will implement a weighted-ranking system where not all users rankings will be weighted the same. Some users might have more "accurate" rankings. <br>

**Computation:** <br>
a) Get the ratings matrix for users and movies <br>
b) For each user rating, find the difference between the user's rating and the mean rating <br>
c) Square the difference and use the inverse of this squared difference as the reliability score

**2) Outperformance Score** <br>
Rarely-weighted high scoring titles might show up if we used any Mean/Median rating metric. We need to give some weight to the the more times a title is ranked. 

**Computation:** <br>
a) Given that the median rating is 3, we calculate the median-adjusted rating by subtracting the rating by 3.<br>
b) We then take the sum of the median-adjusted ratings and then calibrate the ranking <br>

**3) Number of Rating 5** <br>
We also need to reward titles that do not have that many rankings but have a a lot of 5 rankings.  


**Aggregation** <br>
The rankings from metric 1, 2 and 3 are averaged out and movies are ranked according to the average rank.

In [6]:
class System1:
    
    def __init__(self, dataclass):
        self.dataclass = dataclass
        
        self._init_calibration()
        self.run_all()
        
    def _init_calibration(self):
        self.generate_genre_mappings()
        self.generate_user_reliability()
        
    def generate_genre_mappings(self):
        #Get the movies dataframe
        movies = self.dataclass.movies.copy()
        movies['Genres']=movies['Genres'].str.split('|')
        movies=movies.explode('Genres')

        #Create genre mapping
        genre_mappings = defaultdict(list)
        for genre, movieid in zip(movies['Genres'], movies['MovieID']):
            genre_mappings[genre].append(movieid)
        self.genre_mappings = genre_mappings
        
    def generate_user_reliability(self):
        ratings_matrix = self.dataclass.ratings_data.copy()
        ratings_err = ratings_matrix - ratings_matrix.mean()
        user_weights = 1/(ratings_err**2).mean(axis = 1)
        self.user_weights = user_weights
        
    def generate_topmovies(self, genre_chosen):
        #Get the relevant genre movies
        movie_titles = self.genre_mappings[genre_chosen]
        all_movies = self.dataclass.movies.copy()
        genre_movies = all_movies.loc[all_movies['MovieID'].isin(set(movie_titles))]
        
        #1) Ranking 1 metric
        #Get the valid movie ratings
        movie_cols = ('m' + genre_movies['MovieID'].astype(str)).to_list()
        movie_cols = [col for col in movie_cols if col in set(self.dataclass.ratings_data.columns)]
        genre_movie_ratings = self.dataclass.ratings_data[movie_cols]
        valid_movie_ratings = (~genre_movie_ratings.isna()).astype(int)
        
        #Get the ratings and then adjust it by reliability scores
        sum_ratings = np.dot(genre_movie_ratings.T.fillna(0), self.user_weights)
        normalize_factor = np.dot(valid_movie_ratings.T, self.user_weights)
        reliability_adj_ratings = sum_ratings/normalize_factor

        #Get the genre rankings
        ranking_1 = pd.DataFrame()
        ranking_1['moviecol'] = movie_cols
        ranking_1['MovieID'] = ranking_1['moviecol'].str[1:].astype(int)
        ranking_1['rating'] = reliability_adj_ratings
        ranking_1['ranking_1'] = ranking_1['rating'].rank(ascending = False)
        
        
        #Ranking 2
        genre_ratings = self.dataclass.ratings
        genre_ratings = genre_ratings.loc[genre_ratings['MovieID'].isin(set(genre_movies['MovieID']))].copy()
        genre_ratings['adj_rating'] = genre_ratings['Rating'] - 3

        ranking_2 = pd.DataFrame(genre_ratings.groupby('MovieID')['adj_rating'].sum()).reset_index()
        ranking_2['ranking_2'] = ranking_2['adj_rating'].rank(ascending = False)
        
        #Ranking 3
        ranking_3 = pd.DataFrame(genre_ratings.loc[genre_ratings['Rating'] == 5].groupby('MovieID').count()['UserID']).reset_index()
        ranking_3 = ranking_3.rename(columns = {'UserID': 'count'})
        ranking_3['ranking_3'] = ranking_3['count'].rank(ascending = False)
        


        #map the movie names
        genre_movies = pd.merge(genre_movies, ranking_1[['MovieID', 'ranking_1']], on = 'MovieID')
        genre_movies = pd.merge(genre_movies, ranking_2[['MovieID', 'ranking_2']], on = 'MovieID')
        genre_movies = pd.merge(genre_movies, ranking_3[['MovieID', 'ranking_3']], on = 'MovieID')
        
        ranking_cols = [col for col in genre_movies.columns if 'ranking_' in col]
        genre_movies['meanranking'] =genre_movies[ranking_cols].mean(axis = 1)
        genre_movies = genre_movies.sort_values('meanranking', ascending= True)
        
        return genre_movies
    
    def run_all(self):
        #Top 10 movies are now hashed in top10movies_by_genre
        self.top10movies_by_genre = {}
        
        for genre in self.genre_mappings.keys():
            top_movies = self.generate_topmovies(genre)
            top_movies = top_movies.iloc[:10]
            self.top10movies_by_genre[genre] = top_movies
            

In [7]:
system1_handler = System1(DataClass)

In [8]:
system1_handler.top10movies_by_genre['Animation']

Unnamed: 0,MovieID,Title,Genres,ranking_1,ranking_2,ranking_3,meanranking
31,1148,"Wrong Trousers, The (1993)",Animation|Comedy,1.0,4.0,3.0,2.666667
0,1,Toy Story (1995),Animation|Children's|Comedy,7.0,1.0,1.0,3.0
85,3114,Toy Story 2 (1999),Animation|Children's|Comedy,6.0,2.0,2.0,3.333333
19,745,"Close Shave, A (1995)",Animation|Comedy|Thriller,2.0,8.0,5.0,5.0
68,2355,"Bug's Life, A (1998)",Animation|Children's|Comedy,18.0,3.0,6.0,9.0
96,3751,Chicken Run (2000),Animation|Children's|Comedy,15.0,6.0,7.0,9.333333
9,595,Beauty and the Beast (1991),Animation|Children's|Musical,12.0,11.0,9.5,10.833333
17,720,Wallace & Gromit: The Best of Aardman Animatio...,Animation,3.0,18.0,14.0,11.666667
36,1282,Fantasia (1940),Animation|Children's|Musical,14.0,12.0,9.5,11.833333
34,1223,"Grand Day Out, A (1992)",Animation|Comedy,4.0,17.0,16.0,12.333333


## System 2

In [286]:
def generate_similarity_matrix(DataClass):
    #Get the demeaned matrix
    ratings_matrix = DataClass.ratings_data.copy()
    demeaned_matrix = ratings_matrix.sub(ratings_matrix.mean(axis=1), axis=0)
    relevant_movies = [int(col[1:]) for col in demeaned_matrix.columns]
    
    #Get the set of common users
    ratings = DataClass.ratings.copy()
    ratings = ratings.loc[ratings['MovieID'].isin(set(relevant_movies))]
    
    #Calculate the matrix
    matrix = pd.DataFrame(columns = demeaned_matrix.columns, index = demeaned_matrix.columns)
    for movie_col in demeaned_matrix.columns:
        curr_rel_movie_cols = demeaned_matrix.loc[demeaned_matrix[movie_col].dropna().index]
        curr_movie_col = demeaned_matrix[movie_col].dropna()
        other_movie_cols = curr_rel_movie_cols[[col for col in curr_rel_movie_cols.columns if col != movie_col]]
        #filter for movie_cols >= 3
        filtered_movie_columns = other_movie_cols.columns[np.where((~other_movie_cols.isna()).sum() >= 3)[0]]
        other_movie_cols = other_movie_cols[filtered_movie_columns]

        #Numerator
        numerator = np.dot(other_movie_cols.fillna(0).T, curr_movie_col)
        #Deminominator for other cols
        denominator_other = np.sqrt(np.sum(other_movie_cols**2))
        #Deminominator for current col
        ##inidicator matrix to only select current_col rows that are relevant
        indicator_matrix = (~other_movie_cols.isna()).astype(int)
        denominator_current = np.sqrt(np.dot(indicator_matrix.T, curr_movie_col**2))
        to_fill = numerator/(denominator_other*denominator_current)
        matrix.loc[movie_col, to_fill.index] = to_fill
        
    #Calculate similarity matrix
    similarity_matrix = 0.5 + 0.5 *matrix
    similarity_ranking = similarity_matrix.rank(ascending = False, axis = 1)
    less_than_30 = (similarity_ranking <= 30).astype(int).replace(0, np.nan)
    final_matrix = similarity_matrix*less_than_30
    return final_matrix, similarity_matrix

In [287]:
final_matrix, similarity_matrix = generate_similarity_matrix(DataClass)

In [153]:
test_cols = ['m1', 'm10', 'm100', 'm1510', 'm260', 'm3212']
final_matrix.loc[test_cols, test_cols]

Unnamed: 0,m1,m10,m100,m1510,m260,m3212
m1,,,,,,
m10,,,,,,
m100,,,,,,
m1510,,,,,,
m260,,,,,,
m3212,,,,,,


In [16]:
similarity_matrix.loc[test_cols, test_cols]

Unnamed: 0,m1,m10,m100,m1510,m260,m3212
m1,,0.512106,0.392,,0.741148,
m10,0.512106,,0.547458,,0.534334,
m100,0.392,0.547458,,,0.329694,
m1510,,,,,,
m260,0.741148,0.534334,0.329694,,,
m3212,,,,,,


In [288]:
#final_matrix.to_csv('final_simmatrix.csv')

In [167]:
#similarity_matrix.to_csv('before_simmatrix.csv')

## myIBCF Function

The code will return the top 10 movies with non-NA ratings. If there are only 8 non-NA rated movies, only 8 will be returned. <br>

**Provide a method to suggest additional movies that have not been rated by this user.**
For users where their movie ratings do not have a valid recommendation via myIBCF, we implement something different. We first get the similarity scores with other users. We then get the 10 most similar users based on the cosine similarity metric. With this metric, we then get the movie ratings of these 10 users and find the top 10 movies based on the user similarity and the movie rankings

In [551]:
class System2:
    
    def __init__(self, DataClass):
        self.matrix = pd.read_csv('final_simmatrix.csv', index_col = 0)
        self.dataclass = DataClass
    
    def myIBCF(self, movie_ratings):
        rated_movies = movie_ratings.dropna()
        relevant_movie_ratings = self.matrix[rated_movies.index]
        
        denominator = relevant_movie_ratings.sum(axis = 1)
        numerator = (relevant_movie_ratings * movie_ratings).sum(axis = 1)
        score = (numerator/denominator)
        score = score.loc[~score.index.isin(rated_movies.index)]
        score = score.sort_values(ascending = False).iloc[:10].dropna()
        if len(score) == 0:
            return self.recommend_other_movies(movie_ratings)
        return score
    
    def recommend_other_movies(self, moving_ratings):
        """
        Back-up is here where we find the users that are most similar to current ratings
        """
        norm_all = np.sqrt(np.square(self.dataclass.ratings_data).sum(axis=1))
        norm_current = np.sqrt(np.square(movie_ratings).sum())

        cosine_sim = np.dot(self.dataclass.ratings_data.fillna(0), movie_ratings.fillna(0))/(norm_all * norm_current)
        cosine_sim = cosine_sim.loc[cosine_sim!= 0]
        cosine_sim = 1 - cosine_sim
        cosine_sim = cosine_sim.sort_values(ascending = False).iloc[:10]

        current_movie_waitings = self.dataclass.ratings_data.loc[cosine_sim.index].T
        new_weights = (~current_movie_waitings.isna()).astype(int)*cosine_sim
        new_weights = new_weights.div(new_weights.sum(axis=1), axis=0)

        best_movies = (current_movie_waitings*new_weights).sum(axis = 1).sort_values(ascending = False).iloc[:10]
        best_movies = best_movies.loc[~best_movies.index.isin(set(moving_ratings.dropna().index))]
        return best_movies

In [552]:
system2_handler = System2(DataClass)

In [553]:
movie_ratings_u1181 = DataClass.ratings_data.loc['u1181']
movie_ratings_u1351 = DataClass.ratings_data.loc['u1351']

movie_ratings_hypothetical = pd.Series(np.nan, DataClass.ratings_data.loc['u1181'].index)
movie_ratings_hypothetical.loc['m1613'] = 5
movie_ratings_hypothetical.loc['m1755'] = 4

In [554]:
best_movies_u1181 = system2_handler.myIBCF(movie_ratings_u1181)
best_movies_u1351 = system2_handler.myIBCF(movie_ratings_u1351)
best_movies_hypothetical = system2_handler.myIBCF(movie_ratings_hypothetical)

#### User U1181

In [555]:
best_movies_u1181

m3732    5.000000
m749     4.526559
m3899    4.526066
m249     4.000000
m1039    4.000000
m504     4.000000
m1914    4.000000
m2082    4.000000
m1253    4.000000
m337     4.000000
dtype: float64

#### User 1351

In [556]:
best_movies_u1351

m1871    5.0
m1514    5.0
m1901    5.0
m2061    5.0
m2127    5.0
m985     5.0
m744     5.0
m1877    5.0
m2000    5.0
m2028    5.0
dtype: float64

#### Hypothetical User

In [557]:
best_movies_hypothetical

m1017    5.0
m3269    5.0
m765     5.0
m74      5.0
m691     5.0
m592     5.0
m46      5.0
m2846    5.0
m338     5.0
m3258    5.0
dtype: float64

In [558]:
movie_ratings_test = pd.Series(np.nan, DataClass.ratings_data.loc['u1181'].index)
#movie_ratings_hypothetical.loc['m1613'] = 5
movie_ratings_test.loc['m3621'] = 4

In [559]:
best_movies_test = system2_handler.myIBCF(movie_ratings_test)

In [560]:
best_movies_test

m3250    5.0
m1197    5.0
m2540    5.0
m3760    5.0
m3188    5.0
m2028    5.0
m849     5.0
m3327    5.0
m3753    5.0
m442     5.0
dtype: float64