In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import torch
import os
import torch.optim as optim

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [6]:
def set_random_seed(state=1):
    gens = (np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for set_state in gens:
        set_state(state)

RANDOM_STATE = 42
set_random_seed(RANDOM_STATE)

# 1. Load dataset

In [7]:
ratings = pd.read_csv("../data/ratings.csv")
movies = pd.read_csv("../data/movies.csv")

# 2. Basic EDA

In [8]:
ratings[ratings.duplicated(subset = ['userId','movieId'])]

Unnamed: 0,userId,movieId,rating,timestamp


In [9]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [10]:
len(ratings['movieId'].unique())

9724

In [11]:
rated_movie_ids = ratings['movieId'].unique()

In [12]:
filtered_movies_df = movies[movies['movieId'].isin(rated_movie_ids)].copy()

# Optionally, you can check the number of rows after filtering
print("Number of movies before filtering:", movies.shape[0])
print("Number of movies after filtering:", filtered_movies_df.shape[0])

Number of movies before filtering: 9742
Number of movies after filtering: 9724


In [13]:
movies = filtered_movies_df

In [17]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [18]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [19]:
minmax = ratings.rating.min(), ratings.rating.max()
minmax

(0.5, 5.0)

In [20]:
len(movies['movieId'].unique())

9724

In [21]:
movies[movies.duplicated(subset = "movieId",keep = "first")]

Unnamed: 0,movieId,title,genres


In [22]:
def preview(ratings, n = 10):
    user_groups = ratings.groupby('userId')['rating'].count()
    top_users = user_groups.sort_values(ascending=False)[:n]

    movie_groups = ratings.groupby('movieId')['rating'].count()
    top_movies = movie_groups.sort_values(ascending=False)[:n]

    top = (ratings.join(top_users,rsuffix  = "_r", how = "inner", on = "userId").join(top_movies,rsuffix  = "_r", how= "inner", on = "movieId"))

    return pd.crosstab(top.userId,top.movieId,top.rating, aggfunc = sum)

preview(ratings)

  return pd.crosstab(top.userId,top.movieId,top.rating, aggfunc = sum)
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


movieId,110,260,296,318,356,480,527,589,593,2571
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
68,2.5,5.0,2.0,3.0,3.5,3.5,4.0,3.5,3.5,4.5
274,4.5,3.0,5.0,4.5,4.5,3.5,4.0,4.5,4.0,4.0
288,5.0,5.0,5.0,5.0,5.0,2.0,5.0,4.0,5.0,3.0
380,4.0,5.0,5.0,3.0,5.0,5.0,,5.0,5.0,4.5
414,5.0,5.0,5.0,5.0,5.0,4.0,4.0,5.0,4.0,5.0
448,,5.0,5.0,,3.0,3.0,,3.0,5.0,2.0
474,3.0,4.0,4.0,5.0,3.0,4.5,5.0,4.0,4.5,4.5
599,3.5,5.0,5.0,4.0,3.5,4.0,,4.5,3.0,5.0
606,3.5,4.5,5.0,3.5,4.0,2.5,5.0,3.5,4.5,5.0
610,4.5,5.0,5.0,3.0,3.0,5.0,3.5,5.0,4.5,5.0


# 3. Preparing the training data

In [25]:
def create_dataset(ratings):
    '''
    input : ratings dataframe
    
    return 
    feedback matrix : (num movies, num users)
    
    '''
    matrix =  ratings.pivot(index = "movieId", columns = "userId", values = "rating")
    matrix = matrix.fillna(0)
    return matrix

Y_df = create_dataset(ratings)
Y_df
    
    

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
R_df = (Y_df != 0).astype(int)
R_df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,0,1,0,1,0,0,0,...,1,0,1,1,1,1,1,1,1,1
2,0,0,0,0,0,1,0,1,0,0,...,0,1,0,1,1,0,0,1,0,0
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
193583,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
193585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
193587,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
tmean = np.mean(Y_df.loc[1,R_df.loc[1,:].astype(bool)])

In [28]:
tmean

3.9209302325581397


The collaborative filtering cost function is given by
$$J({\mathbf{x}^{(0)},...,\mathbf{x}^{(n_m-1)},\mathbf{w}^{(0)},b^{(0)},...,\mathbf{w}^{(n_u-1)},b^{(n_u-1)}})= \frac{1}{2}\sum_{(i,j):r(i,j)=1}(\mathbf{w}^{(j)} \cdot \mathbf{x}^{(i)} + b^{(j)} - y^{(i,j)})^2
+\underbrace{
\frac{\lambda}{2}
\sum_{j=0}^{n_u-1}\sum_{k=0}^{n-1}(\mathbf{w}^{(j)}_k)^2
+ \frac{\lambda}{2}\sum_{i=0}^{n_m-1}\sum_{k=0}^{n-1}(\mathbf{x}_k^{(i)})^2
}_{regularization}
\tag{1}$$
The first summation in (1) is "for all $i$, $j$ where $r(i,j)$ equals $1$" and could be written:

$$
= \frac{1}{2}\sum_{j=0}^{n_u-1} \sum_{i=0}^{n_m-1}r(i,j)*(\mathbf{w}^{(j)} \cdot \mathbf{x}^{(i)} + b^{(j)} - y^{(i,j)})^2
+\text{regularization}
$$



In [29]:
def cost_func(X, W, b, Y, R, lambda_):
    """
    Returns the cost for the content-based filtering
    Args:
      X (ndarray (num_movies,num_features)): matrix of item features (tensor)
      W (ndarray (num_users,num_features)) : matrix of user parameters (tensor)
      b (ndarray (1, num_users)            : vector of user parameters (tensor)
      Y (ndarray (num_movies,num_users)    : matrix of user ratings of movies (tensor)
      R (ndarray (num_movies,num_users)    : matrix, where R(i, j) = 1 if the i-th movies was rated by the j-th user
      lambda_ (float): regularization parameter
    Returns:
      J (float) : Cost
    """
    y_hat = torch.matmul(X,W.t()) + b 
    j = (y_hat - Y) * R
    J = 0.5* torch.sum(j**2) + lambda_/2 * (torch.sum(W**2) + torch.sum(X**2)) 
    return J

**Normalize ratings**\
We normalize the ratings to make the algorithm run faster and more efficient.

In [30]:
def normalizeRatings(Y, R):
    """
    Preprocess data by subtracting mean rating for every movie (every row).
    Only include real ratings R(i,j)=1.
    [Ynorm, Ymean] = normalizeRatings(Y, R) normalized Y so that each movie
    has a rating of 0 on average. Unrated moves then have a mean rating (0)
    Returns the mean rating in Ymean.
    """
    Ymean = (np.sum(Y*R,axis=1)/(np.sum(R, axis=1)+1e-12)).reshape(-1,1)
    Ynorm = Y - np.multiply(Ymean, R) 
    return(Ynorm, Ymean)

**Convert dataframe to tensor and initialize weight for users and movies matrix**

In [68]:


def prepare_data(Y_df,R_df, newRatings = None):
    Y_np = Y_df.to_numpy()
    R_np = R_df.to_numpy()
    if newRatings is not None:
        Y_np = np.c_[Y_df,newRatings]
        R_np = np.c_[R_np,(newRatings!=0).astype(int)]
            
    
    Ynorm,Ymean = normalizeRatings(Y_np,R_np)
    Ynorm = torch.tensor(Ynorm)
    Ymean = torch.tensor(Ymean)
    
    Rtensor = torch.tensor(R_np)
    return (Ynorm,Ymean, Rtensor)
def prepare_train(Y_df,R_df,newRatings =None, num_features = 10):
    num_movies = Y_df.shape[0]
    num_users = Y_df.shape[1]
    W = torch.randn((num_users if newRatings is None else num_users+1), num_features, dtype=torch.float64, requires_grad=True)
    X = torch.randn(num_movies, num_features, dtype=torch.float64, requires_grad=True)
    b = torch.randn(1, (num_users if newRatings is None else num_users+1), dtype=torch.float64, requires_grad=True)


    Ynorm,Ymean,R = prepare_data(Y_df,R_df,newRatings)
    return X,W,b,R, Ynorm, Ymean



## Add new user
Added new user that we would give recommendation

In [69]:
my_ratings =np.zeros(Y_df.shape[0])# initialize array for new user with length of num movies

my_ratings[929]  = 4   
my_ratings[246]  = 4   
my_ratings[2716] = 2   
my_ratings[1150] = 5   
my_ratings[382]  = 3   
my_ratings[366]  = 5   
my_ratings[622]  = 5   
my_ratings[988]  = 3   
my_ratings[2925] = 1   
my_ratings[2937] = 1   
my_ratings[793]  = 5  

In [70]:
my_rated = [i for i in range(len(my_ratings)) if my_ratings[i]>0]

print('\nNew user ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i]>0:
        print(f'Rated {my_ratings[i]} for {movies.loc[i,"title"]} movie - {i}')


New user ratings:

Rated 4.0 for New York Cop (Nyû Yôku no koppu) (1993) movie - 246
Rated 5.0 for Blink (1994) movie - 366
Rated 3.0 for Cowboy Way, The (1994) movie - 382
Rated 5.0 for Nutty Professor, The (1996) movie - 622
Rated 5.0 for Die Hard (1988) movie - 793
Rated 4.0 for Raging Bull (1980) movie - 929
Rated 3.0 for Koyaanisqatsi (a.k.a. Koyaanisqatsi: Life Out of Balance) (1983) movie - 988
Rated 5.0 for Romy and Michele's High School Reunion (1997) movie - 1150
Rated 2.0 for Fighting Seabees, The (1944) movie - 2716
Rated 1.0 for Bikini Beach (1964) movie - 2925
Rated 1.0 for Slumber Party Massacre III (1990) movie - 2937


In [None]:
Ymean

tensor([[3.9209],
        [3.4318],
        [3.2596],
        ...,
        [3.5000],
        [3.5000],
        [4.0000]], dtype=torch.float64)

# 4. Train the model

In [None]:
X,W,b,R,Ynorm,Ymean = prepare_train(Y_df,R_df,my_ratings,150)

In [None]:
opt = optim.Adam([X,W,b], lr = 1e-1 )

In [None]:
def train(iteration,lambda_):
    
    for iter in range(1, iteration+1):
        opt.zero_grad()
        cost_value = cost_func(X,W,b,Ynorm,R,lambda_)
        cost_value.backward()
        opt.step()
        if iter % 20 == 0:
            # .item() converts the tensor to a standard Python number for printing
            print(f"Training loss at iteration {iter}: {cost_value.item():0.1f}")
train(200,1)

Training loss at iteration 20: 7202.3
Training loss at iteration 40: 6345.4
Training loss at iteration 60: 5750.9
Training loss at iteration 80: 5331.1
Training loss at iteration 100: 5029.2
Training loss at iteration 120: 4808.1
Training loss at iteration 140: 4643.0
Training loss at iteration 160: 4517.5
Training loss at iteration 180: 4420.4
Training loss at iteration 200: 4343.9


# 5. Make Recommendations

In [None]:
pred = torch.matmul(X,W.t())+b
predm = pred + Ymean
my_pred = predm[:,-1]

idx = torch.argsort(my_pred,descending = True)

for i in range(16):
    j = idx[i].item()
    if j not in my_rated:
        print(f'Predicting rating {my_pred[j].item():0.2f} for movie {movies.loc[j,"title"]}')

print('\n\n Original vs Predicted ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i]>0:
        print(f'Original {my_ratings[i]}, Predicted {my_pred[i].item():0.2f} for movie {movies.loc[i,"title"]}')


Predicting rating 4.96 for movie Return of Martin Guerre, The (Retour de Martin Guerre, Le) (1982)
Predicting rating 4.96 for movie Belle époque (1992)
Predicting rating 4.95 for movie Night Porter, The (Portiere di notte, Il) (1974)
Predicting rating 4.95 for movie MatchMaker, The (1997)
Predicting rating 4.94 for movie Tightrope (1984)
Predicting rating 4.94 for movie Thin Line Between Love and Hate, A (1996)
Predicting rating 4.93 for movie Fog, The (2005)
Predicting rating 4.93 for movie The Fault in Our Stars (2014)
Predicting rating 4.93 for movie Skulls, The (2000)
Predicting rating 4.92 for movie Bachelor and the Bobby-Soxer, The (1947)
Predicting rating 4.92 for movie Silk Stockings (1957)
Predicting rating 4.92 for movie When a Woman Ascends the Stairs (Onna ga kaidan wo agaru toki) (1960)
Predicting rating 4.92 for movie Door in the Floor, The (2004)
Predicting rating 4.92 for movie Black Mass (2015)
Predicting rating 4.92 for movie Redbelt (2008)


 Original vs Predicted ra