In [1]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# UserItemData

In [23]:
class UserItemData:
    def __init__(self, path, start_date=None, end_date=None, min_ratings=0):
        self.data = pd.read_csv(path, sep="\t", encoding="latin-1")
        self.filter_by_date(start_date, end_date)
        self.filter_by_ratings(min_ratings)
    
    def get_date(self):
        data = self.data
        return pd.to_datetime((data.date_year*10000+data.date_month*100+data.date_day).apply(str),format='%Y%m%d')  
    
    def filter_by_date(self, start_date, end_date):
        if not start_date and not end_date:
            return
        
        dates = self.get_date()
        
        if start_date:
            s_date_vals = start_date.split(".")
            s_year = s_date_vals[2]
            s_month = s_date_vals[1]
            s_day = s_date_vals[0]

            start_date = pd.Timestamp(f"{s_year}-{s_month}-{s_day}")
            self.data = self.data[dates >= start_date]

        if end_date:
            e_date_vals = end_date.split(".")
            e_year = e_date_vals[2]
            e_month = e_date_vals[1]
            e_day = e_date_vals[0]

            end_date = pd.Timestamp(f"{e_day}-{e_month}-{e_year}")
            self.data = self.data[dates <= end_date]
    
    # count the frequency of a movieID
    def filter_by_ratings(self, min_ratings):
        self.data["ratingsNum"] = self.data.groupby('movieID')['rating'].transform('count')
        self.data = self.data[self.data["ratingsNum"] >= min_ratings]
        
    def nratings(self):
        return len(self.data.index)
    
    def add_ratings(self, new_data):
        rows = []
        for items in new_data:
            user_id, movie_id, rating = items
            new_row = {}
            
            new_row["userID"] = user_id
            new_row["movieID"] = movie_id
            new_row["rating"] = rating
            
            new_row["date_day"] = "13"
            new_row["date_month"] = "12"
            new_row["date_year"] = "2009"
            new_row["date_hour"] = "13"
            new_row["date_minute"] = "13"
            new_row["date_second"] = "13"
            
            rows.append(new_row)
        
        my_df = pd.DataFrame.from_dict(rows, orient='columns')
        self.data = self.data.append(my_df)

In [11]:
uim = UserItemData('data/user_ratedmovies.dat')
print(uim.nratings())

uim = UserItemData('data/user_ratedmovies.dat', start_date = '12.1.2007', end_date='16.2.2008', min_ratings=100)
print(uim.nratings())

855598
73657


In [3]:
class MovieData():
    def __init__(self, path):
        self.data = pd.read_table(path, encoding='latin-1')

    def get_title(self, ajdi):
        row = self.data[self.data["id"] == ajdi]
        return str(row["title"].item())
    
    def get_all_movies(self):
        return self.data["id"].unique()
    
    def get_genre_info(self, path):
        self.genres_table = pd.read_table(path, encoding='latin-1')
        keep = ["id", "title"]
        
        filtered_data = self.data[keep].set_index('id')
        genres = self.genres_table.groupby('movieID')['genre'].apply(list)        
        return filtered_data.merge(genres.to_frame(), left_index=True, right_index=True)

In [6]:
md = MovieData('data/movies.dat')
print(md.get_title(1))

Toy story


# RandomPredictor

In [8]:
class RandomPredictor:
    def __init__(self, min_rating, max_rating):
        self.min_rating = min_rating
        self.max_rating = max_rating

    def fit(self, X):
        self.uim = X

    def predict(self, user_id):
        md = MovieData('data/movies.dat')

        data = self.uim.data[self.uim.data["userID"] == user_id]
        keep = ["movieID", "rating"]
        data = data[keep]

        all_movies = md.get_all_movies()
        temp_user_table = pd.DataFrame(
            columns=["movieID", "rating"])
        temp_user_table["movieID"] = all_movies

        user_table = pd.concat([temp_user_table, data])
        user_table = user_table.drop_duplicates(
            subset=["movieID"], keep="last")
        user_table = user_table.fillna(
            value=random.randint(self.min_rating, self.max_rating))
        user_table = user_table.set_index("movieID")

        return user_table.to_dict()["rating"]

In [9]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rp.fit(uim)
pred = rp.predict(78)
print(type(pred))
items = [1, 3, 20, 50, 100]
for item in items:
    print("Film: {}, ocena: {}".format(md.get_title(item), pred[item]))

<class 'dict'>
Film: Toy story, ocena: 4.0
Film: Grumpy Old Men, ocena: 4.0
Film: Money Train, ocena: 4.0
Film: The Usual Suspects, ocena: 4.0
Film: City Hall, ocena: 4.0


# Recommender

In [4]:
class Recommender: 
    def __init__(self, pred):
        self.pred = pred
        
        
    def fit(self, X): 
        self.uim = X 
        self.pred.fit(X)
    
    
    def recommend(self, user_id, n=10, rec_seen=True):
        slovar = self.pred.predict(user_id)
        watched = self.uim.data[self.uim.data["userID"] == user_id].movieID.values
        top_rated = sorted(slovar, key=slovar.get, reverse=True)
        
        if not rec_seen:
            top_rated = [x for x in top_rated if x not in watched]
        
        return[(key, slovar[key]) for key in top_rated[:n]]
    
    
    def recommend_for_val(self, user_id, n):
        user_avg_score = self.pred.user_mean_ratings[user_id]
        preds = self.pred.predict(user_id)
        temp_df = self.pred.uim.data
        user_movies = set(temp_df[temp_df['userID'] == user_id]['movieID'])
        
        for movie in user_movies:
            preds.pop(movie, None)
        
        result = {key: value for key, value in preds.items() if value >= user_avg_score}
        return sorted(result.items(), key=lambda x: x[1], reverse=True)[:n]
    
    
    def calc_p(self, user_id, n):
        recommended = self.recommend_for_val(user_id, n)
        recommended = set([key for key, value in recommended])
        
        fil = self.filtered_td['userID'] == user_id
        watched = set(self.filtered_td[fil]['movieID'].values)
        
        intersection = set.intersection(recommended, watched)
        
        if len(recommended):
            precision = len(intersection) / len(recommended)
        else:
            precision = 0
        
        recall = len(intersection) / len(watched)
        return precision, recall
        
    
    def calc_m(self, pred, real):
        mse = 0
        mae = 0

        for movie in real:
            mse += (real[movie] - pred[movie])**2
            mae += abs(real[movie] - pred[movie])
        
        return mse, mae, len(real)
        
    
    def evaluate(self, test_data, n):
        self.test_data = test_data.data[['userID', 'movieID','rating']]
        train_uids = np.unique(self.pred.uim.data['userID'].values)
        train_mids = set(self.pred.data['movieID'].values)
        
        self.filtered_td = self.test_data[self.test_data['movieID'].isin(train_mids)]
        test_uids = np.unique(self.filtered_td['userID'].values)
        
        users = np.intersect1d(test_uids, train_uids)

        self.test_ratings = {}
        for user in users:
            mask = (self.filtered_td['userID'] == user)
            self.test_ratings[user] = dict(zip(self.filtered_td[mask]['movieID'].values, self.filtered_td[mask]['rating'].values))
        
        
        mse = 0
        mae = 0
        size = 0
        precision = 0
        recall = 0
        for user in users:
            pred =  self.pred.predict(user)
            real = self.test_ratings[user]
            mse_, mae_, n_ = self.calc_m(pred, real)
            mse += mse_
            mae += mae_
            size += n_
            
            precision_, recall_ = self.calc_p(user, n)
            precision += precision_
            recall += recall_
        
        mse = mse / size
        mae = mae / size
        precision /= size
        recall /= size
        f1 = (2 * precision * recall) / (precision + recall)
             
        return mse, mae, precision, recall, f1

In [6]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000, end_date='1.1.2008')
rp = SlopeOnePredictor()
rec = Recommender(rp)
rec.fit(uim)

uim_test = UserItemData('data/user_ratedmovies.dat', min_ratings=200, start_date='2.1.2008')
mse, mae, precision, recall, f = rec.evaluate(uim_test, 20)
print(mse, mae, precision, recall, f)

0.7284717901938453 0.6305249422642825 0.10034102028289822 0.14149272901587545 0.11741557853882702


In [11]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: Toy story, ocena: 5.0
Film: Jumanji, ocena: 5.0
Film: Grumpy Old Men, ocena: 5.0
Film: Waiting to Exhale, ocena: 5.0
Film: Father of the Bride Part II, ocena: 5.0


# AveragePredictor

In [13]:
class AveragePredictor:
    def __init__(self, b):
        self.b = b
    
    def get_avg(self, data, movie_id):
        n = data[data["movieID"] == movie_id]["rating"].values
        vs = n.sum()
        n = len(n)
        return (vs + self.b * self.g_avg) / (n + self.b)

    def fit(self, X):
        self.data = X.data        
        
    def predict(self, user_id):
        result = {}
        self.g_avg = self.data["rating"].mean()
        
        for movie_id in set(self.data["movieID"].values):
            result[movie_id] = self.get_avg(self.data, movie_id)
        return result


In [14]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = AveragePredictor(0)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: Brother Minister: The Assassination of Malcolm X, ocena: 5.0
Film: Synthetic Pleasures, ocena: 5.0
Film: Gabbeh, ocena: 5.0
Film: Storefront Hitchcock, ocena: 5.0
Film: Ko to tamo peva, ocena: 5.0


In [15]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = AveragePredictor(100)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: The Usual Suspects, ocena: 4.225944245560473
Film: The Godfather: Part II, ocena: 4.146907937910189
Film: Cidade de Deus, ocena: 4.116538340205236
Film: The Dark Knight, ocena: 4.10413904093503
Film: 12 Angry Men, ocena: 4.103639627096175


# ViewsPredictor

In [16]:
class ViewsPredictor:
    def fit(self, X):
        self.data = X.data

    def predict(self, user_id):
        self.data["ratingsNum"] = self.data.groupby('movieID')['rating'].transform('count')
        result = self.data.drop_duplicates(subset=['movieID'], keep='first')
        
        keep=["movieID", "ratingsNum"]
        result = result[keep]
        result = result.set_index("movieID")
        
        return result.to_dict()["ratingsNum"]

In [17]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = ViewsPredictor()
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 1576
Film: The Lord of the Rings: The Two Towers, ocena: 1528
Film: The Lord of the Rings: The Return of the King, ocena: 1457
Film: The Silence of the Lambs, ocena: 1431
Film: Shrek, ocena: 1404


# STDPredictor

In [18]:
class STDPredictor:
    def __init__(self, n):
        self.n = n
    
    def filter_by_ratings(self):
        self.data["ratingsNum"] = self.data.groupby('movieID')['rating'].transform('count')
        self.data = self.data[self.data["ratingsNum"] >= self.n]
    
    def fit(self, X):
        self.data = X.data
        self.filter_by_ratings()
        
    def predict(self, user_id):
        return self.data.groupby("movieID")["rating"].std().to_dict()

In [23]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = STDPredictor(100)
rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)

for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Film: Plan 9 from Outer Space, ocena: 1.3449520951495717
Film: The Passion of the Christ, ocena: 1.281493459525735
Film: The Texas Chainsaw Massacre, ocena: 1.235349321908819
Film: Jackass Number Two, ocena: 1.2189769976366684
Film: White Chicks, ocena: 1.1899581424297319


# ItemBasedPredictor

In [13]:
from scipy.spatial import distance
import itertools

# values are a bit different but the results are the same so it's fineee
class ItemBasedPredictor:
    def __init__(self, min_values=0, threshold=0):
        self.min_values = min_values
        self.threshold = threshold
    
    
    def similarity(self, p1, p2):
        users1 = self.data[self.data["movieID"] == p1]["userID"].values
        users2 = self.data[self.data["movieID"] == p2]["userID"].values
        c_users = np.intersect1d(users1, users2)
        
        if len(c_users) < self.min_values:
            return 0
        
        table = self.user_ratings_w[self.user_ratings_w.index.isin(c_users)]
        m1 = table[p1]
        m2 = table[p2]
        
        sim = 1 - distance.cosine(m1, m2)
        
        if sim <= self.threshold:
            return 0
        return sim
    
    
    # get weight
    def get_w(self, x, user_id, means):
            return x - means[user_id]
    
    
    def sort(self, num):
        return sorted(self.sims.items(), key=lambda x: x[1], reverse=True)[:num]
       
        
    def fit(self, X):
        self.data = X.data
        keep = ['userID', 'movieID' ,'rating']
        self.data = self.data[keep]
         
        users = self.data.userID
        movies = np.unique(self.data.movieID.values)

        #         p1     p2    p3
        # u1      x      x     x
        # u2      x      x     x
        # u3      x      x     x
        self.user_ratings = self.data.pivot_table(index=['userID'], columns=['movieID'], values='rating')
    
        df_temp = self.data[['userID', 'rating']]
        self.user_mean_ratings = pd.DataFrame.to_dict(df_temp.groupby('userID').mean())['rating']
        self.user_ratings_w = self.user_ratings.apply(lambda row: self.get_w(row, row.name, self.user_mean_ratings), axis=1)
        
        self.sim_table = pd.DataFrame(index=movies, columns=movies)
        self.sim_table[:] = 0
        self.sims = {}
        for pair in itertools.combinations(movies, 2):
            m1, m2 = pair
            sim = self.similarity(m1, m2)
            
            self.sims[(m1, m2)] = sim
            self.sims[(m2, m1)] = sim
            self.sim_table.loc[m1, m2] = sim
            self.sim_table.loc[m2, m1] = sim
    
    
    def trim_key(self, key, item):
        if key[0] == item:
            return key[1]
        return key[0]
    
    
    def similarItems(self, item, n):
        pairs = dict((self.trim_key(key, item), value) for key, value in self.sims.items() if item in key)
        return sorted(pairs.items(), key=lambda x: x[1], reverse=True)[:n]
            
            
    def predict(self, user_id):
        df = self.data
        movies_rated = df[df["userID"] == user_id]
        filtered_sm = self.sim_table.loc[np.in1d(self.sim_table.index.values, movies_rated),]
        mask = np.in1d(self.user_ratings_w.columns.values, movies_rated)
        filtered_r = self.user_ratings_w.loc[user_id, mask].values
        
        weighted_ratings = filtered_sm.apply(lambda column: 
                                            np.sum(column * filtered_r) / np.sum(column) + self.user_mean_ratings[user_id] 
                                            if np.sum(column) else self.user_mean_ratings[user_id],
                                            axis=0)
        
        new_ratings = weighted_ratings.to_dict()
    
        movies_rated = df[df['userID'] == user_id][['movieID','rating']]
        predictions = dict(zip(movies_rated.movieID, movies_rated.rating))
        for key, rating in new_ratings.items():
            if key not in movies_rated:
                predictions[key] = rating
                
        return predictions

In [16]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = ItemBasedPredictor()
rec = Recommender(rp)
rec.fit(uim)
print("Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716): ", rp.similarity(1580, 2716))
print("Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527): ", rp.similarity(1580, 527))
print("Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780): ", rp.similarity(1580, 780))

Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716):  0.23395523176756639
Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527):  0
Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780):  0.4246612584468762


In [17]:
print("Predictions for 78: ")
rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Predictions for 78: 
Film: Shichinin no samurai, ocena: 4.3557347903101595
Film: The Usual Suspects, ocena: 4.354681728067836
Film: The Silence of the Lambs, ocena: 4.335305303472516
Film: Sin City, ocena: 4.2786871668991004
Film: Monsters, Inc., ocena: 4.2175811369435205
Film: The Incredibles, ocena: 4.2070985832817485
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.152792107348347
Film: Batman Begins, ocena: 4.146413806700199
Film: Die Hard, ocena: 4.125915602232819
Film: Rain Man, ocena: 4.071535242958551
Film: The Lord of the Rings: The Return of the King, ocena: 4.020237449257013
Film: A Beautiful Mind, ocena: 4.015142490064839
Film: Good Will Hunting, ocena: 4.0092808069228205
Film: The Lord of the Rings: The Two Towers, ocena: 3.941476305095594
Film: Indiana Jones and the Last Crusade, ocena: 3.796976496378924


In [38]:
sort_sims = rp.sort(20)
for movies, sim in sort_sims:
    print(f"Film1: {md.get_title(movies[0])}, Film2: {md.get_title(movies[1])}, podobnost: {sim}")

Film1: The Lord of the Rings: The Two Towers, Film2: The Lord of the Rings: The Return of the King, podobnost: 0.8439842148481417
Film1: The Lord of the Rings: The Return of the King, Film2: The Lord of the Rings: The Two Towers, podobnost: 0.8439842148481417
Film1: The Lord of the Rings: The Fellowship of the Ring, Film2: The Lord of the Rings: The Two Towers, podobnost: 0.8231885401761888
Film1: The Lord of the Rings: The Two Towers, Film2: The Lord of the Rings: The Fellowship of the Ring, podobnost: 0.8231885401761888
Film1: The Lord of the Rings: The Fellowship of the Ring, Film2: The Lord of the Rings: The Return of the King, podobnost: 0.8079374897442497
Film1: The Lord of the Rings: The Return of the King, Film2: The Lord of the Rings: The Fellowship of the Ring, podobnost: 0.8079374897442497
Film1: Kill Bill: Vol. 2, Film2: Kill Bill: Vol. 2, podobnost: 0.737234022438103
Film1: Kill Bill: Vol. 2, Film2: Kill Bill: Vol. 2, podobnost: 0.737234022438103
Film1: Star Wars, Film2: S

In [39]:
rec_items = rp.similarItems(4993, 10)
print('Filmi podobni "The Lord of the Rings: The Fellowship of the Ring": ')
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Filmi podobni "The Lord of the Rings: The Fellowship of the Ring": 
Film: The Lord of the Rings: The Two Towers, ocena: 0.8231885401761888
Film: The Lord of the Rings: The Return of the King, ocena: 0.8079374897442497
Film: Star Wars: Episode V - The Empire Strikes Back, ocena: 0.2396194307349645
Film: Star Wars, ocena: 0.21965586527074066
Film: The Matrix, ocena: 0.21515552706880237
Film: Raiders of the Lost Ark, ocena: 0.1994427670634501
Film: The Usual Suspects, ocena: 0.18321188451910753
Film: Blade Runner, ocena: 0.16399681315410275
Film: Schindler's List, ocena: 0.16105905138148702
Film: Monty Python and the Holy Grail, ocena: 0.15780453798519134


In [26]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = ItemBasedPredictor()
rec = Recommender(rp)

uim.add_ratings([[1, 589, 3], [1, 2446, 5], [1, 2810, 5], [1, 3717, 2.5], [1, 2012, 3], [1, 160, 2],
                [1, 2013, 1], [1, 1394, 3.5], [1, 1396, 3.5], [1, 1408, 3.5], [1, 8924, 4], 
                [1, 8949, 4.5], [1, 8958, 4], [1, 8961, 4.5]])

rec.fit(uim)
print("Predictions for me: ")
rec_items = rec.recommend(1, n=10, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Predictions for me: 
Film: Finding Nemo, ocena: 3.4855061079202
Film: Monsters, Inc., ocena: 3.484138943119527
Film: Toy story, ocena: 3.480419951075355
Film: Le fabuleux destin d'Amélie Poulain, ocena: 3.4700922537364396
Film: Shrek, ocena: 3.4694129326243957
Film: Raiders of the Lost Ark, ocena: 3.4679712622505265
Film: Eternal Sunshine of the Spotless Mind, ocena: 3.467773158719824
Film: Star Wars: Episode V - The Empire Strikes Back, ocena: 3.4677368405739952
Film: The Lord of the Rings: The Return of the King, ocena: 3.4677040329714743
Film: Star Wars, ocena: 3.4667180503370916


# Slope One Predictor

In [5]:
class SlopeOnePredictor():
    def fit(self, X):
        self.uim = X
        self.data = X.data
        self.movies = np.unique(X.data.movieID.values)
        
        df_temp = self.data[['userID', 'rating']]
        self.user_mean_ratings = pd.DataFrame.to_dict(df_temp.groupby('userID').mean())['rating']
    
    
    def get_diff(self, m1, m2):
        users_1 = self.data[self.data["movieID"] == m1]["userID"].values
        users_2 = self.data[self.data["movieID"] == m2]["userID"].values
        c_users = np.intersect1d(users_1, users_2)
        
        vector1 = self.data[(self.data["movieID"] == m1) & (self.data["userID"].isin(c_users))].rating
        vector2 = self.data[(self.data["movieID"] == m2) & (self.data["userID"].isin(c_users))].rating
        vector1 = vector1.reset_index(drop=True)
        vector2 = vector2.reset_index(drop=True)
        
        difference = vector1.subtract(vector2)
        return difference.sum() / len(c_users)
    
    
    def get_n(self, m1, m2):
        users_1 = self.data[self.data["movieID"] == m1]["userID"].values
        users_2 = self.data[self.data["movieID"] == m2]["userID"].values
        c_users = np.intersect1d(users_1, users_2)
        
        return len(c_users)
    
    
    def predict(self, user):
        userHasSeen = self.data[self.data["userID"] == user]["movieID"].values
        
        predictions = {}
        for movie in self.movies:
            predictions[movie] = self.__predictOne(user, movie, userHasSeen)
        
        return predictions
    
    
    def __predictOne(self, user, movie, userHasSeen):
        pred = sum([(self.data[(self.data["movieID"] == otherM) & (self.data["userID"] == user)].iloc[0]["rating"] + self.get_diff(movie,otherM)) * self.get_n(movie,otherM) for otherM in userHasSeen[userHasSeen != movie]])
        sumOfAllN = sum([self.get_n(movie,otherM) for otherM in userHasSeen[userHasSeen != movie]])
        
        if sumOfAllN == 0:
            return self.data[self.data["userID"] == user].rating.mean()
        
        return pred / sumOfAllN

In [7]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = SlopeOnePredictor()
rec = Recommender(rp)
rec.fit(uim)

print("Predictions for 78: ")
rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Predictions for 78: 
Film: The Usual Suspects, ocena: 4.325079182263173
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.155293229840448
Film: The Lord of the Rings: The Return of the King, ocena: 4.153135076202185
Film: The Silence of the Lambs, ocena: 4.127978169643881
Film: Shichinin no samurai, ocena: 4.119790444913598
Film: The Lord of the Rings: The Two Towers, ocena: 4.083325894849594
Film: Indiana Jones and the Last Crusade, ocena: 3.9670398355464194
Film: The Incredibles, ocena: 3.9664496674557546
Film: Good Will Hunting, ocena: 3.963362387354114
Film: Sin City, ocena: 3.942619137615212
Film: Batman Begins, ocena: 3.9375326640077017
Film: A Beautiful Mind, ocena: 3.9140940935239508
Film: Rain Man, ocena: 3.9107819079644943
Film: Monsters, Inc., ocena: 3.8819375978658006
Film: Finding Nemo, ocena: 3.8807711131654794


# Hybrid Predictor

In [14]:
class HybridPredictor:
    def __init__(self, slope_one, item_based, avg_pred):
        self.so = slope_one
        self.ib = item_based
        self.ap = avg_pred
    
    def fit(self, X):
        self.so.fit(X)
        self.ib.fit(X)
        self.ap.fit(X)
        
    def predict(self, user_id):
        so_data = self.so.predict(user_id)
        ib_data = self.ib.predict(user_id)
        ap_data = self.ap.predict(user_id)
        
        movies = so_data.keys()
        preds = {}
        
        for movie in movies:
            preds[movie] = (so_data[movie] + ib_data[movie] + ap_data[movie])/3
        
        return preds

In [16]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)

ib = ItemBasedPredictor()
so = SlopeOnePredictor()
ap = AveragePredictor(100)
hybrid = HybridPredictor(so, ib, ap)

rec = Recommender(hybrid)
rec.fit(uim)

rec_items = rec.recommend(78, n=15, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Film: The Usual Suspects, ocena: 4.311278518974733
Film: Shichinin no samurai, ocena: 4.173534033030131
Film: The Silence of the Lambs, ocena: 4.166374348258444
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 4.1247457059145285
Film: The Lord of the Rings: The Return of the King, ocena: 4.083025349250652
Film: Sin City, ocena: 4.04150141849935
Film: The Incredibles, ocena: 4.03517179295511
Film: The Lord of the Rings: The Two Towers, ocena: 4.013769116650408
Film: Batman Begins, ocena: 3.996277322964873
Film: Monsters, Inc., ocena: 3.9850683879684383
Film: Good Will Hunting, ocena: 3.963985125214793
Film: Rain Man, ocena: 3.9468100652778144
Film: Die Hard, ocena: 3.946727709564506
Film: A Beautiful Mind, ocena: 3.9404428687612474
Film: Indiana Jones and the Last Crusade, ocena: 3.9017846201129402


# Matrix Factorization Predictor

In [17]:
class MatrixFactorizationPredictor:
    def fit(self, X):
        rating_df = X.data
        self.ratings_table = X.data.pivot(index = 'userID', columns = 'movieID', values = 'rating').fillna(0)
        ratings_table = self.ratings_table
        
        # demean
        matrix = ratings_table.to_numpy()
        user_ratings_mean = np.mean(matrix, axis = 1)
        matrix = matrix - user_ratings_mean.reshape(-1, 1)
        
        # single value decomposition
        u, sigma, vh = np.linalg.svd(matrix, full_matrices=False)
        sigma = np.diag(sigma) # to ease multiplication
        
        pred_ratings = np.dot(np.dot(u, sigma), vh) + user_ratings_mean.reshape(-1, 1)
        self.pred_df = pd.DataFrame(pred_ratings, columns = ratings_table.columns)
            
    def predict(self, user_id):
        pred_df = self.pred_df
        ratings_table = self.ratings_table
        
        return pred_df.iloc[user_id].to_dict()

In [19]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = MatrixFactorizationPredictor()

rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Film: Austin Powers: The Spy Who Shagged Me, ocena: 5.000000000000003
Film: The Lord of the Rings: The Two Towers, ocena: 4.999999999999997
Film: The Silence of the Lambs, ocena: 4.499999999999995
Film: The Incredibles, ocena: 8.881784197001252e-15
Film: Mrs. Doubtfire, ocena: 4.773959005888173e-15


# Running on GPU

I failed this miserably

In [11]:
    def similarity(data, user_ratings_w, p1, p2, min_values, threshold):
        users1 = data[data["movieID"] == p1]["userID"].values
        users2 = data[data["movieID"] == p2]["userID"].values
        c_users = np.intersect1d(users1, users2)
        
        if len(c_users) < min_values:
            return 0
        
        table = user_ratings_w[user_ratings_w.index.isin(c_users)]
        m1 = table[p1]
        m2 = table[p2]
        
        sim = 1 - distance.cosine(m1, m2)
        
        if sim <= threshold:
            return 0
        return sim

In [12]:
def get_similarities(data, user_ratings, user_ratings_w, min_values=0, threshold=0):       
        users = data.userID
        movies = np.unique(data.movieID.values)

        sims = {}
        sim_table = pd.DataFrame(index=movies, columns=movies)
        for pair in itertools.combinations(movies, 2):
            m1, m2 = pair
            sim = similarity(data, user_ratings, m1, m2, min_values, threshold)
            
            sims[(m1, m2)] = sim
            sims[(m2, m1)] = sim
            sim_table.loc[m1, m2] = sim
            sim_table.loc[m2, m1] = sim
        
        return sims

In [13]:
from numba import jit

@jit
def get_similarities_jit(data, user_ratings, user_ratings_w, min_values=0, threshold=0):       
        users = data.userID
        movies = np.unique(data.movieID.values)

        sims = {}
        sim_table = pd.DataFrame(index=movies, columns=movies)
        for pair in itertools.combinations(movies, 2):
            m1, m2 = pair
            sim = similarity(data, user_ratings, m1, m2, min_values, threshold)
            
            sims[(m1, m2)] = sim
            sims[(m2, m1)] = sim
            sim_table.loc[m1, m2] = sim
            sim_table.loc[m2, m1] = sim
        
        return sims

In [14]:
def get_w(x, user_id, means):
        return x - means[user_id]

# Neural Network Predictor

In [48]:
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path

class NN_Model(keras.Model):
    def __init__(self, num_users, num_movies, embedding_size, **kwargs):
        super(NN_Model, self).__init__(**kwargs)
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = layers.Embedding(num_users, 1)
        self.movie_embedding = layers.Embedding(
            num_movies,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.movie_bias = layers.Embedding(num_movies, 1)

    def call(self, inputs):
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        movie_vector = self.movie_embedding(inputs[:, 1])
        movie_bias = self.movie_bias(inputs[:, 1])
        dot_user_movie = tf.tensordot(user_vector, movie_vector, 2)
        # Add all the components (including bias)
        x = dot_user_movie + user_bias + movie_bias
        # The sigmoid activation forces the rating to between 0 and 1
        return tf.nn.sigmoid(x)

    
    
class NN_Predictor():
    def __init__(self, movies_df):
            self.movies_df = movies_df.data
        
        
        
    def fit(self, X):
        self.data = X.data
        data = self.data
        
        
    
    def predict(self, user_id):
        data = self.data
        movie_df = self.movies_df
        
        user_ids = data["userID"].unique().tolist()
        user2user_encoded = {x: i for i, x in enumerate(user_ids)}
        userencoded2user = {i: x for i, x in enumerate(user_ids)}

        movie_ids = data["movieID"].unique().tolist()
        movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
        movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}

        data["user"] = data["userID"].map(user2user_encoded)
        data["movie"] = data["movieID"].map(movie2movie_encoded)

        num_users = len(user2user_encoded)
        num_movies = len(movie_encoded2movie)
        data["rating"] = data["rating"].values.astype(np.float32)

        # min and max ratings will be used to normalize the ratings later
        min_rating = min(data["rating"])
        max_rating = max(data["rating"])

        data = data.sample(frac=1, random_state=42)
        x = data[["user", "movie"]].values

        # normalize
        y = data["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values

        # train on 90% of the data and test on 10%.
        train_indices = int(0.9 * data.shape[0])
        x_train, x_val, y_train, y_val = (
            x[:train_indices],
            x[train_indices:],
            y[:train_indices],
            y[train_indices:],
        )

        model = NN_Model(num_users, num_movies, 50)
        model.compile(
            loss=tf.keras.losses.BinaryCrossentropy(), optimizer=keras.optimizers.Adam(lr=0.001)
        )

        history = model.fit(
            x=x_train,
            y=y_train,
            batch_size=64,
            epochs=5,
            verbose=1,
            validation_data=(x_val, y_val),
        )
        
        movies_watched_by_user = data[data.userID == user_id]
        
        movies_not_watched = movie_df[
            ~movie_df["id"].isin(movies_watched_by_user.movieID.values)]["id"]
        
        movies_not_watched = list(
        set(movies_not_watched).intersection(set(movie2movie_encoded.keys()))
        )
        
        movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched]
        user_encoder = user2user_encoded.get(user_id)
        
        user_movie_array = np.hstack(
            ([[user_encoder]] * len(movies_not_watched), movies_not_watched)
        )
        
        ratings = model.predict(user_movie_array).flatten()
        top_ratings_indices = ratings.argsort()[::-1]
        recommended_movie_ids = [
            movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices                       
        ]
        
        recommended_movies = movie_df[movie_df["id"].isin(recommended_movie_ids)]
        predictions = dict(zip(top_ratings_indices, ratings))
        
        return predictions

In [50]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = NN_Predictor(md)

rec = Recommender(rp)
rec.fit(uim)

print()
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Film: Tom and Huck, ocena: 0.8419322967529297
Film: Copycat, ocena: 0.7930474877357483
Film: Sudden Death, ocena: 0.7905702590942383
Film: Yao a yao yao dao waipo qiao, ocena: 0.7840534448623657
Film: Dracula: Dead and Loving It, ocena: 0.7831463813781738


# Word2Vec
<b>NOTE</b> I trimmed the dataset to 1,000,000 items because it would take a very long time to use the entire thing.

In [6]:
from tqdm import tqdm
from gensim.models import Word2Vec

class Word2VecPredictor:
    def __init__(self):
        orders = pd.read_csv('cart_data/orders.csv')
        prior = pd.read_csv('cart_data/order_products__prior.csv')
        products = pd.read_csv('cart_data/products.csv')
        
        _data = pd.merge(prior, products, on = ['product_id','product_id'])
        self.data = pd.merge(_data, orders, on=['order_id','order_id'])[["order_id", "product_id", "product_name", "user_id"]]
        
        # shorten for testing purposes, there are no null vlaues in dataset
        self.data = self.data.iloc[:1000000]
        self.data["product_id"] = self.data["product_id"].astype(str)
        # 206209 users in total
        self.users = self.data["user_id"].unique().tolist()
                

        
    def fit(self):
        df = self.data
            
        # get 90% of users to train model
        random.shuffle(self.users)
        users_train = [self.users[i] for i in range(round(0.9*len(self.users)))]
            
        # split into train and test set
        self.train = df[df['user_id'].isin(users_train)]
        self.test = df[~df['user_id'].isin(users_train)]
        train = self.train
        test = self.test
            
        # lists to capture purchase history of the customers
        # got tqdm because there's a lot of data and i wanna see the loop progress 'n stuff
        purchases_train = []
        for i in tqdm(users_train):
            temp = train[train["user_id"] == i]["product_id"].tolist()
            purchases_train.append(temp)
                
        purchases_test = []
        for i in tqdm(test['user_id'].unique()):
            temp = test[test["user_id"] == i]["product_id"].tolist()
            purchases_test.append(temp)
                
        self.model = self.build_model(purchases_train, purchases_test)
            
            
            
    def build_model(self, purchases_train, purchases_test):
        model = Word2Vec(window = 10, sg = 1, hs = 0,
            negative = 10, # for negative sampling
            alpha=0.03, min_alpha=0.0007,
            seed = 14)

        model.build_vocab(purchases_train, progress_per=200)

        model.train(purchases_train, total_examples = model.corpus_count, 
                            epochs=10, report_delay=1)
            
        model.init_sims(replace=True)
        return model
    
    
    
    def vec_predict(self, v, n):
        products = self.train[["product_id", "product_name"]]
        products.drop_duplicates(inplace=True, subset='product_id', keep="last")

        # create product-ID and product-description dictionary
        products_dict = products.groupby('product_id')['product_name'].apply(list).to_dict()
        
        # extract n most similar products for the input vector
        return self.model.wv.similar_by_vector(v, topn = n)
    
    
    
    def read_product(self, product_id):
        return self.data[self.data["product_id"] == product_id].iloc[0]["product_name"]
    
    
    
    def predict(self, product_id, n=5):
        product_id = str(product_id)
        preds = self.vec_predict(str(product_id), n)
        preds = list(preds)
        preds_dict = {}
                
        for pred in preds:
            preds_dict[self.read_product(pred[0])] = pred[1]
        
        return preds_dict

In [10]:
pred = Word2VecPredictor()
pred.fit()

print(f"Recommendations for {pred.read_product('28985')}")
print(pred.predict("28985"))

100%|███████████████████████████████████████████████████████████████████████████| 19654/19654 [00:36<00:00, 537.96it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2184/2184 [00:01<00:00, 1926.88it/s]


Recommendations for Michigan Organic Kale
{'Limes': 0.8007930517196655, 'Large Lemon': 0.7838623523712158, 'Organic Red Onion': 0.7832534909248352, 'Organic Strawberries': 0.7758601903915405, 'Extra Virgin Olive Oil': 0.7725867629051208}


# Clustering

In [90]:
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans

class ClusterPredictor:
    def fit(self, X):
        self.ratings = X.data[["movieID", "userID", "rating"]]
        self.matrix = self.ratings.pivot_table(index=['userID'], columns=['movieID'], values='rating').fillna(0)  
        cols = self.matrix.columns
        fjalor = {}
        
        # a column represents a movie
        for col in cols:
            fjalor[col] = pd.arrays.SparseArray(self.matrix[col])
        
        sparseFrame = pd.DataFrame(fjalor)
        sparse_ratings = csr_matrix(sparseFrame)
        pred_sparse_1 = KMeans(n_clusters=12, algorithm='full').fit_predict(sparse_ratings)
        
        # get clusters of users
        self.cluster = pd.concat([self.matrix.reset_index(), pd.DataFrame({'group': pred_sparse_1})], axis=1)  
    
    def get_user_cluster(self, user_id):
        return self.cluster[self.cluster["userID"] == user_id]["group"].item()
    
    def predict(self, user_id):
        cluster_id = self.get_user_cluster(user_id)
        user_cluster = self.cluster[self.cluster.group == cluster_id].drop(['group'], axis=1)
        user_cluster = user_cluster.set_index("userID")
        
        user_row = user_cluster[user_cluster.index == user_id]
        user_rated = user_row.loc[:, (user_row != 0).any(axis=0)].T # remove empty columns
        user_rated = user_rated.rename(columns={user_id: "rating"})
        
        user_rated = user_rated["rating"].to_dict()
        means = user_cluster.mean().to_dict()
        
        # user_rated replaces values in means
        recs = {**means, **user_rated}
        return recs

In [91]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = ClusterPredictor()

rec = Recommender(rp)
rec.fit(uim)
rec_items = rec.recommend(78, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Film: The Silence of the Lambs, ocena: 3.793103448275862
Film: The Usual Suspects, ocena: 3.6413793103448278
Film: Shichinin no samurai, ocena: 3.1724137931034484
Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 2.7344827586206897
Film: Die Hard, ocena: 2.679310344827586
Film: Rain Man, ocena: 2.5724137931034483
Film: The Fugitive, ocena: 2.537931034482759
Film: Men in Black, ocena: 2.503448275862069
Film: Gladiator, ocena: 2.4034482758620688
Film: Indiana Jones and the Last Crusade, ocena: 2.396551724137931
