# Matrix Factorization

In [1]:
import pandas as pd
import numpy as np

## Reading Ratings Data

In [2]:
ratings_df = pd.read_csv('https://raw.githubusercontent.com/surajdwivedi0307/UnsupervisedLearning/main/factorization/u.data',
                         sep = '\t')

In [3]:
ratings_df

Unnamed: 0,196,242,3,881250949
0,186,302,3,891717742
1,22,377,1,878887116
2,244,51,2,880606923
3,166,346,1,886397596
4,298,474,4,884182806
...,...,...,...,...
99994,880,476,3,880175444
99995,716,204,5,879795543
99996,276,1090,1,874795795
99997,13,225,2,882399156


In [4]:
ratings_df.columns = ['userid', 'movieid', 'rating', 'timestamp']

In [5]:
ratings_df

Unnamed: 0,userid,movieid,rating,timestamp
0,186,302,3,891717742
1,22,377,1,878887116
2,244,51,2,880606923
3,166,346,1,886397596
4,298,474,4,884182806
...,...,...,...,...
99994,880,476,3,880175444
99995,716,204,5,879795543
99996,276,1090,1,874795795
99997,13,225,2,882399156


In [6]:
len(ratings_df.userid.unique())

943

In [7]:
len(ratings_df.movieid.unique())

1682

## Reading the movies metadata

In [8]:
movies_df = pd.read_csv('https://raw.githubusercontent.com/manaranjanp/ISB_MLUL/main/factorization/u.item',
                        encoding = 'iso-8859-1',
                        sep = '|',
                        header = None,
                        usecols=[0, 1])

In [9]:
movies_df

Unnamed: 0,0,1
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [10]:
movies_df.columns = ['movieid', 'moviename']

## Creating user-movies ratings matrix

In [11]:
user_movies_df = ratings_df.pivot( index='userid',
                                 columns='movieid',
                                 values = "rating" ).reset_index(drop=True)
user_movies_df.index = ratings_df.userid.unique()

In [12]:
user_movies_df

movieid,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
186,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
22,4.0,,,,,,,,,2.0,...,,,,,,,,,,
244,,,,,,,,,,,...,,,,,,,,,,
166,,,,,,,,,,,...,,,,,,,,,,
298,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
936,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
930,5.0,,,,,,4.0,,,,...,,,,,,,,,,
920,,,,,,,,,,,...,,,,,,,,,,


### Matrix Factorization Methods

In [13]:
import numpy as np

def als_matrix_factorization(R, num_features, lambda_reg, iterations):
    """
    Perform matrix factorization using Alternating Least Squares (ALS) on the
    incomplete matrix R with NaN values and return the error.

    R: the input matrix with NaNs
    num_features: the number of latent features
    lambda_reg: the regularization parameter
    iterations: the number of iterations to perform
    """

    num_users, num_items = R.shape
    W = np.random.rand(num_users, num_features)
    H = np.random.rand(num_items, num_features).T

    mask = ~np.isnan(R)

    errors = []

    for _ in range(iterations):
        # Update W
        for i in range(num_users):
            H_i = H[:, mask[i, :]]
            R_i = R[i, mask[i, :]]
            W[i, :] = np.linalg.solve(H_i @ H_i.T + lambda_reg * np.eye(num_features), H_i @ R_i)

        # Update H
        for j in range(num_items):
            W_j = W[mask[:, j], :]
            R_j = R[mask[:, j], j]
            H[:, j] = np.linalg.solve(W_j.T @ W_j + lambda_reg * np.eye(num_features), W_j.T @ R_j)

        # Calculate the reconstruction error
        R_hat = W @ H
        error = np.nansum((R - R_hat)**2 * mask)
        errors.append(np.sqrt(error))

    return W, H.T, np.round(np.sqrt(errors), 4)

## Factorizing User-Movies Ratings Matrix

In [14]:
num_features = 20
lambda_reg = 0.1
iterations = 200

W, H, errors = als_matrix_factorization(user_movies_df.to_numpy(), num_features, lambda_reg, iterations)

print("W (User Feature Matrix):")
print(W)
print("\nH (Item Feature Matrix):")
print(H)

KeyboardInterrupt: 

In [None]:
errors

In [None]:
W.shape

In [None]:
H.shape

## Finding Similarity

In [None]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

movies_sim = 1 - pairwise_distances( H, metric="cosine" )
movies_sim_df = pd.DataFrame( movies_sim )

In [None]:
def get_similar_movies( movieid, topN = 5 ):
    movieidx = movies_df[movies_df.movieid == movieid].index[0]
    movies_df['similarity'] = movies_sim_df.iloc[movieidx]
    top_n = movies_df.sort_values( ["similarity"], ascending = False )[0:topN]
    return top_n

In [None]:
movies_sim_df

## Finding Similar Movies

In [None]:
movies_df[movies_df.movieid == 127]

In [None]:
get_similar_movies(127)

In [None]:
get_similar_movies(222)

In [None]:
get_similar_movies(88)

In [None]:
movies_df[movies_df.moviename.str.contains("Gump")]

In [None]:
get_similar_movies(82)