In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
import os
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

In [26]:
!pip uninstall -y scikit-learn
!pip install scikit-learn==1.5.2  # This is below 1.6.0 as required by category-encoders

Found existing installation: scikit-learn 1.5.2
Uninstalling scikit-learn-1.5.2:
  Successfully uninstalled scikit-learn-1.5.2
Collecting scikit-learn==1.5.2
  Using cached scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Using cached scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
Installing collected packages: scikit-learn
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.5.2


In [27]:
credits = pd.read_csv('/kaggle/input/the-movies-dataset/credits.csv')
keywords = pd.read_csv('/kaggle/input/the-movies-dataset/keywords.csv')
links_small = pd.read_csv('/kaggle/input/the-movies-dataset/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

movies = pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv')
movies = movies.drop([19730, 29503, 35587])
movies['id'] = movies['id'].astype('int')
movies['genres'] = movies['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
movies['description'] = movies['overview']
movies['description'] = movies['description'].fillna('')

ratings_df = pd.read_csv('/kaggle/input/the-movies-dataset/ratings.csv')
#movies = movies[movies['id'].isin(links_small)]

  movies = pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv')


In [28]:
columns_take = ['genres', 'id', 'title', 'description']
all_columns = movies.columns
columns_drop = [column for column in all_columns if column not in columns_take]
movies = movies.drop(columns=columns_drop)

In [29]:
movies_df = movies.copy()
unique_movie_ids_in_ratings = ratings_df['movieId'].unique()
print(f"Number of unique movie IDs in ratings dataframe: {len(ratings_df['movieId'].unique())}")

# Get all movie IDs from movies dataframe
all_movie_ids_in_movies = set(movies_df['id'])
print(f"Number of movie IDs in movies dataframe: {len(all_movie_ids_in_movies)}")

# Find movie IDs that are in ratings but not in movies
missing_movie_ids = [movie_id for movie_id in unique_movie_ids_in_ratings 
                     if movie_id not in all_movie_ids_in_movies]

print(f"Number of movie IDs present in ratings but missing from movies: {len(missing_movie_ids)}")
print(f"Percentage of missing movies: {len(missing_movie_ids) / len(unique_movie_ids_in_ratings) * 100:.2f}%")


Number of unique movie IDs in ratings dataframe: 45115
Number of movie IDs in movies dataframe: 45433
Number of movie IDs present in ratings but missing from movies: 37550
Percentage of missing movies: 83.23%


In [30]:
import pandas as pd

movie_ids_in_ratings = set(ratings_df['movieId'].unique())
print(f"Number of unique movie IDs in ratings dataframe: {len(movie_ids_in_ratings)}")

filtered_movies_df = movies_df[movies_df['id'].isin(movie_ids_in_ratings)]
filtered_ratings_df = ratings_df[ratings_df['movieId'].isin(set(movies_df['id']))]

print(f"Number of movies with at least one rating: {len(filtered_movies_df)}")
print(f"Number of movies removed: {len(movies_df) - len(filtered_movies_df)}")
print(f"Percentage of original movies kept: {len(filtered_movies_df) / len(movies_df) * 100:.2f}%")
# filtered_movies_df.to_csv('filtered_movies.csv', index=False)
filtered_movies_df.head()

Number of unique movie IDs in ratings dataframe: 45115
Number of movies with at least one rating: 7569
Number of movies removed: 37894
Percentage of original movies kept: 16.65%


Unnamed: 0,genres,id,title,description
0,"[Animation, Comedy, Family]",862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,"[Adventure, Fantasy, Family]",8844,Jumanji,When siblings Judy and Peter discover an encha...
5,"[Action, Crime, Drama, Thriller]",949,Heat,"Obsessive master thief, Neil McCauley leads a ..."
9,"[Adventure, Action, Thriller]",710,GoldenEye,James Bond must unmask the mysterious head of ...
14,"[Action, Adventure]",1408,Cutthroat Island,"Morgan Adams and her slave, William Shaw, are ..."


In [31]:
filtered_ratings_df = filtered_ratings_df.drop(columns=['timestamp'])
filtered_ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
4,1,1246,5.0
5,1,1968,4.0


In [36]:
percentile = 0.05
user_rating_counts = filtered_ratings_df['userId'].value_counts()
num_top_users = int(np.ceil(len(user_rating_counts) * percentile))
top_users = user_rating_counts.nlargest(num_top_users).index.tolist()
filtered_df = filtered_ratings_df[filtered_ratings_df['userId'].isin(top_users)].copy()
filtered_df

Unnamed: 0,userId,movieId,rating
1711,24,2,3.0
1712,24,6,4.0
1713,24,16,3.0
1714,24,17,3.0
1715,24,21,4.0
...,...,...,...
26023494,270887,166643,5.0
26023501,270887,167738,4.0
26023509,270887,168712,5.0
26023510,270887,169864,5.0


In [37]:
num_users = filtered_df['userId'].nunique()
num_items = filtered_df['movieId'].nunique()

user_mapping = {id: idx for idx, id in enumerate(filtered_df['userId'].unique())}
item_mapping = {id: idx for idx, id in enumerate(filtered_df['movieId'].unique())}
#convert non-sequential user IDs to sequential indices for matrix factorization
filtered_df['user_idx'] = filtered_df['userId'].map(user_mapping)
filtered_df['item_idx'] = filtered_df['movieId'].map(item_mapping)

filtered_df.drop(columns=['userId', 'movieId'], inplace=True)

In [41]:
from surprise import Reader, Dataset, SVD, SlopeOne, accuracy
from surprise.model_selection import KFold
from surprise.model_selection import train_test_split
import pandas as pd

In [None]:
"""
SVD implementation for reference

class SVD(AlgoBase):

    The prediction :math:`\\hat{r}_{ui}` is set as:

    .. math::
        \\hat{r}_{ui} = \\mu + b_u + b_i + q_i^Tp_u

    If user :math:`u` is unknown, then the bias :math:`b_u` and the factors
    :math:`p_u` are assumed to be zero. The same applies for item :math:`i`
    with :math:`b_i` and :math:`q_i`.

    To estimate all the unknown, we minimize the following regularized squared
    error:

    .. math::
        \\sum_{r_{ui} \\in R_{train}} \\left(r_{ui} - \\hat{r}_{ui} \\right)^2 +
        \\lambda\\left(b_i^2 + b_u^2 + ||q_i||^2 + ||p_u||^2\\right)


    The minimization is performed by a very straightforward stochastic gradient
    descent:

    .. math::
        b_u &\\leftarrow b_u &+ \\gamma (e_{ui} - \\lambda b_u)\\\\
        b_i &\\leftarrow b_i &+ \\gamma (e_{ui} - \\lambda b_i)\\\\
        p_u &\\leftarrow p_u &+ \\gamma (e_{ui} \\cdot q_i - \\lambda p_u)\\\\
        q_i &\\leftarrow q_i &+ \\gamma (e_{ui} \\cdot p_u - \\lambda q_i)

    where :math:`e_{ui} = r_{ui} - \\hat{r}_{ui}`. These steps are performed
    over all the ratings of the trainset and repeated ``n_epochs`` times.
    Baselines are initialized to ``0``. User and item factors are randomly
    initialized according to a normal distribution, which can be tuned using
    the ``init_mean`` and ``init_std_dev`` parameters.

    learning rate :math:`\\gamma` and the
    regularization term :math:`\\lambda` can be different for each
    kind of parameter (see below). 

    Args:
        n_factors: The number of factors. Default is ``100``.
        n_epochs: The number of iteration of the SGD procedure. Default is
            ``20``.
        biased(bool): Whether to use baselines (or biases). See :ref:`note
            <unbiased_note>` above.  Default is ``True``.
        init_mean: The mean of the normal distribution for factor vectors
            initialization. Default is ``0``.
        init_std_dev: The standard deviation of the normal distribution for
            factor vectors initialization. Default is ``0.1``.
        lr_all: The learning rate for all parameters. Default is ``0.005``.
        reg_all: The regularization term for all parameters. Default is
            ``0.02``.
        lr_bu: The learning rate for :math:`b_u`. Takes precedence over
            ``lr_all`` if set. Default is ``None``.
        lr_bi: The learning rate for :math:`b_i`. Takes precedence over
            ``lr_all`` if set. Default is ``None``.
        lr_pu: The learning rate for :math:`p_u`. Takes precedence over
            ``lr_all`` if set. Default is ``None``.
        lr_qi: The learning rate for :math:`q_i`. Takes precedence over
            ``lr_all`` if set. Default is ``None``.
        reg_bu: The regularization term for :math:`b_u`. Takes precedence
            over ``reg_all`` if set. Default is ``None``.
        reg_bi: The regularization term for :math:`b_i`. Takes precedence
            over ``reg_all`` if set. Default is ``None``.
        reg_pu: The regularization term for :math:`p_u`. Takes precedence
            over ``reg_all`` if set. Default is ``None``.
        reg_qi: The regularization term for :math:`q_i`. Takes precedence
            over ``reg_all`` if set. Default is ``None``.
        random_state(int, RandomState instance from numpy, or ``None``):
            Determines the RNG that will be used for initialization. If
            int, ``random_state`` will be used as a seed for a new RNG. This is
            useful to get the same initialization over multiple calls to
            ``fit()``.  If RandomState instance, this same instance is used as
            RNG. If ``None``, the current RNG from numpy is used.  Default is
            ``None``.
        verbose: If ``True``, prints the current epoch. Default is ``False``.

    Attributes:
        pu(numpy array of size (n_users, n_factors)): The user factors (only
            exists if ``fit()`` has been called)
        qi(numpy array of size (n_items, n_factors)): The item factors (only
            exists if ``fit()`` has been called)
        bu(numpy array of size (n_users)): The user biases (only
            exists if ``fit()`` has been called)
        bi(numpy array of size (n_items)): The item biases (only
            exists if ``fit()`` has been called)


    def __init__(self, n_factors=100, n_epochs=20, biased=True, init_mean=0,
                 init_std_dev=.1, lr_all=.005,
                 reg_all=.02, lr_bu=None, lr_bi=None, lr_pu=None, lr_qi=None,
                 reg_bu=None, reg_bi=None, reg_pu=None, reg_qi=None,
                 random_state=None, verbose=False):

        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.biased = biased
        self.init_mean = init_mean
        self.init_std_dev = init_std_dev
        self.lr_bu = lr_bu if lr_bu is not None else lr_all
        self.lr_bi = lr_bi if lr_bi is not None else lr_all
        self.lr_pu = lr_pu if lr_pu is not None else lr_all
        self.lr_qi = lr_qi if lr_qi is not None else lr_all
        self.reg_bu = reg_bu if reg_bu is not None else reg_all
        self.reg_bi = reg_bi if reg_bi is not None else reg_all
        self.reg_pu = reg_pu if reg_pu is not None else reg_all
        self.reg_qi = reg_qi if reg_qi is not None else reg_all
        self.random_state = random_state
        self.verbose = verbose

        AlgoBase.__init__(self)

    def fit(self, trainset):

        AlgoBase.fit(self, trainset)
        self.sgd(trainset)

        return self

    def sgd(self, trainset):

        # for f in range(n_factors):
        #       for _ in range(n_iter):
        #           for u, i, r in all_ratings:
        #               err = r_ui - <p[u, :f+1], q[i, :f+1]>
        #               update p[u, f]
        #               update q[i, f]


        rng = get_rng(self.random_state)

        # user biases
        cdef double [::1] bu = np.zeros(trainset.n_users, dtype=np.double)
        # item biases
        cdef double [::1] bi = np.zeros(trainset.n_items, dtype=np.double)
        # user factors
        cdef double [:, ::1] pu = rng.normal(self.init_mean, self.init_std_dev, size=(trainset.n_users, self.n_factors))
        # item factors
        cdef double [:, ::1] qi = rng.normal(self.init_mean, self.init_std_dev, size=(trainset.n_items, self.n_factors))

        cdef int u, i, f
        cdef int n_factors = self.n_factors
        cdef bint biased = self.biased

        cdef double r, err, dot, puf, qif
        cdef double global_mean = self.trainset.global_mean

        cdef double lr_bu = self.lr_bu
        cdef double lr_bi = self.lr_bi
        cdef double lr_pu = self.lr_pu
        cdef double lr_qi = self.lr_qi

        cdef double reg_bu = self.reg_bu
        cdef double reg_bi = self.reg_bi
        cdef double reg_pu = self.reg_pu
        cdef double reg_qi = self.reg_qi

        if not biased:
            global_mean = 0

        for current_epoch in range(self.n_epochs):
            if self.verbose:
                print("Processing epoch {}".format(current_epoch))

            for u, i, r in trainset.all_ratings():
                # compute current error
                dot = 0  # <q_i, p_u>
                for f in range(n_factors):
                    dot += qi[i, f] * pu[u, f]
                err = r - (global_mean + bu[u] + bi[i] + dot)

                # update biases
                if biased:
                    bu[u] += lr_bu * (err - reg_bu * bu[u])
                    bi[i] += lr_bi * (err - reg_bi * bi[i])

                # update factors
                for f in range(n_factors):
                    puf = pu[u, f]
                    qif = qi[i, f]
                    pu[u, f] += lr_pu * (err * qif - reg_pu * puf)
                    qi[i, f] += lr_qi * (err * puf - reg_qi * qif)

        self.bu = np.asarray(bu)
        self.bi = np.asarray(bi)
        self.pu = np.asarray(pu)
        self.qi = np.asarray(qi)
"""

In [51]:
filtered_df

Unnamed: 0,user_idx,item_idx,rating
1711,0,0,3.0
1712,0,1,4.0
1713,0,2,3.0
1714,0,3,3.0
1715,0,4,4.0
...,...,...,...
26023494,13295,2203,5.0
26023501,13295,2146,4.0
26023509,13295,3187,5.0
26023510,13295,2512,5.0


In [50]:
filtered_df = filtered_df[['user_idx', 'item_idx', 'rating']]

In [52]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import KFold
from tqdm import tqdm

reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(filtered_df, reader)

kf = KFold(n_splits=5)
svd = SVD(n_factors=200, n_epochs=60, lr_all=0.02, reg_all=0.03)

for trainset, testset in tqdm(kf.split(data), total=kf.n_splits, desc="Cross-validation"):
    svd.fit(trainset)
    predictions = svd.test(testset)
    mse = accuracy.mse(predictions)
    mae = accuracy.mae(predictions)
    print(mse, mae)

Cross-validation:   0%|          | 0/5 [00:00<?, ?it/s]

MSE: 0.5910


Cross-validation:  20%|██        | 1/5 [04:56<19:46, 296.64s/it]

MAE:  0.5882
0.5909758988026823 0.5881515794019595
MSE: 0.5916


Cross-validation:  40%|████      | 2/5 [09:55<14:53, 297.80s/it]

MAE:  0.5879
0.5915613528229694 0.587873419586319
MSE: 0.5910


Cross-validation:  60%|██████    | 3/5 [14:46<09:49, 294.71s/it]

MAE:  0.5874
0.5910471960047652 0.5873560777983525
MSE: 0.5916


Cross-validation:  80%|████████  | 4/5 [19:51<04:58, 298.75s/it]

MAE:  0.5880
0.5915537262749918 0.5879622325619329
MSE: 0.5907


Cross-validation: 100%|██████████| 5/5 [24:47<00:00, 297.58s/it]

MAE:  0.5875
0.5906521719850558 0.5874622061131664





In [45]:
trainset, testset = train_test_split(data, test_size=0.15)

algo = SVD(n_factors = 200, n_epochs = 60, lr_all = 0.02, reg_all = 0.03)

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.mse(predictions)
accuracy.mae(predictions)


MSE: 1260545.7142
MAE:  758.5500


758.5499969513552