## Data preparation & Data exploration

In [11]:
!unzip /content/drive/MyDrive/ml-100k.zip

Archive:  /content/drive/MyDrive/ml-100k.zip
replace ml-100k/allbut.pl? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [12]:
import pandas as pd


In [13]:
ratings_df = pd.read_csv('/content/ml-100k/u.data', sep='\t', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'])

In [14]:
ratings_df

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [15]:
item_df = pd.read_csv('/content/ml-100k/u.item', sep='|', header=None, encoding='latin-1',
                      names=['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
                             'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
                             'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
                             'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])


In [16]:
item_df.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [17]:
item_df.shape

(1682, 24)

In [18]:
# Establish a mapping of Movie IDs to Movie Titles
movie_names = item_df.set_index('movie_id')['title'].to_dict()

# Determine the count of distinct users and movies in the dataset
unique_user_count = len(ratings_df['user_id'].unique())
unique_movie_count = len(item_df['movie_id'].unique())

# Provide insights into the dataset
print("Number of distinct users:", unique_user_count)
print("Number of distinct movies:", unique_movie_count)
print("The complete rating matrix consists of:", unique_user_count * unique_movie_count, 'elements.')
print('----------')
print("Total number of ratings recorded:", len(ratings_df))
# Compute the percentage of the matrix that has been populated
matrix_fill_percentage = (len(ratings_df) / (unique_user_count * unique_movie_count)) * 100
print(f"As a result, approximately {matrix_fill_percentage:.2f}% of the matrix is populated.")


Number of distinct users: 943
Number of distinct movies: 1682
The complete rating matrix consists of: 1586126 elements.
----------
Total number of ratings recorded: 100000
As a result, approximately 6.30% of the matrix is populated.


## Train & Test & Evaluate

In [19]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    '''
      The purpose of this class is to represent a Matrix Factorization model.
      It is designed to capture the latent factors (embeddings) of users and items
      in a way that allows the model to predict user-item interactions.

      The user_factors and item_factors embeddings are created as lookup tables.
      These embeddings represent users and items in a lower-dimensional space (n_factors),
      allowing the model to learn patterns and relationships in the data.

      The forward method is where the actual matrix multiplication happens.
      It takes user and item indices and computes the dot product of their embeddings,
      resulting in a prediction for the user-item interaction.

      The predict method is a convenient way to use the model for making predictions.
      It calls the forward method with user and item indices to generate predictions.
    '''
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()

        # User embeddings: Representing users in a lower-dimensional space
        self.user_factors = torch.nn.Embedding(n_users, n_factors)

        # Item embeddings: Representing items (e.g., movies) in a lower-dimensional space
        self.item_factors = torch.nn.Embedding(n_items, n_factors)

        # Initialize embeddings with small random values to start training
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        # Extract user and item indices from the input data
        users, items = data[:, 0], data[:, 1]

        # Matrix multiplication to estimate user-item interactions
        # This operation generates predictions for user-item ratings
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        # Convenience method for making predictions
        return self.forward(user, item)


In [20]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

class Loader(Dataset):
    '''

      The purpose of this class is to create a PyTorch DataLoader for training a model.
      It inherits from the Dataset class, allowing it to be used with PyTorch's DataLoader.

      The initialization method sets up the necessary data structures for mapping
      between original user and movie IDs and continuous indices.
      It also transforms the data into PyTorch tensors.

      The __getitem__ method returns a tuple of features (x) and target rating (y)
      for a given index, making the dataset compatible with PyTorch's DataLoader.

      The __len__ method returns the total number of ratings in the dataset,
      which is essential for iterating through the DataLoader during training.

    '''
    def __init__(self, ratings_df):
        # Store the ratings DataFrame
        self.ratings = ratings_df

        # Extract all unique user IDs and movie IDs
        users = ratings_df.user_id.unique()
        movies = ratings_df.movie_id.unique()

        # Producing new continuous IDs for users and movies

        # Map unique values to indices
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}

        # Obtain continuous ID for users and movies
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Replace original movie and user IDs with continuous indices
        self.ratings.movie_id = ratings_df.movie_id.apply(lambda x: self.movieid2idx[x])
        self.ratings.user_id = ratings_df.user_id.apply(lambda x: self.userid2idx[x])

        # Extract features (x) and target ratings (y) for training
        self.x = self.ratings.drop(['rating', 'timestamp'], axis=1).values
        self.y = self.ratings['rating'].values

        # Transform the data to PyTorch tensors (ready for torch models)
        self.x, self.y = torch.tensor(self.x), torch.tensor(self.y)

    def __getitem__(self, index):
        # Return a tuple of features (x) and target rating (y) for a given index
        return (self.x[index], self.y[index])

    def __len__(self):
        # Return the total number of ratings in the dataset
        return len(self.ratings)

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
train_data, test_data = train_test_split(ratings_df, test_size=0.2, random_state=42)

In [25]:
# The purpose of this section is to set up and configure the training process for the MatrixFactorization model.

# The model is instantiated, and information about its parameters is displayed.
# GPU is enabled if available to accelerate training.

# The Mean Squared Error (MSE) loss function and the ADAM optimizer are defined.

# DataLoader instances for both the training and testing sets are created.
# These DataLoader instances will be used to efficiently load batches of data during training and evaluation.

num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Create an instance of the MatrixFactorization model
model = MatrixFactorization(unique_user_count, unique_movie_count, n_factors=8)
print(model)

# Display the model's trainable parameters
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

# Enable GPU if available
if cuda:
    model = model.cuda()

# Define the Mean Squared Error (MSE) loss function
loss_fn = torch.nn.MSELoss()

# Set up the ADAM optimizer with a specified learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Create DataLoader instances for training and testing sets
# Note: The DataLoader is essential for efficiently iterating through the data during training
train_set, test_set = Loader(train_data), Loader(test_data)
train_loader = DataLoader(train_set, 128, shuffle=True)
test_loader = DataLoader(test_set, 128, shuffle=True)


Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(943, 8)
  (item_factors): Embedding(1682, 8)
)
user_factors.weight tensor([[0.0175, 0.0354, 0.0050,  ..., 0.0364, 0.0393, 0.0229],
        [0.0106, 0.0274, 0.0217,  ..., 0.0355, 0.0354, 0.0175],
        [0.0163, 0.0499, 0.0353,  ..., 0.0281, 0.0041, 0.0053],
        ...,
        [0.0234, 0.0091, 0.0122,  ..., 0.0195, 0.0363, 0.0415],
        [0.0044, 0.0013, 0.0065,  ..., 0.0314, 0.0336, 0.0291],
        [0.0057, 0.0320, 0.0240,  ..., 0.0336, 0.0050, 0.0489]])
item_factors.weight tensor([[0.0280, 0.0148, 0.0473,  ..., 0.0449, 0.0229, 0.0297],
        [0.0428, 0.0227, 0.0441,  ..., 0.0084, 0.0468, 0.0270],
        [0.0040, 0.0313, 0.0252,  ..., 0.0018, 0.0320, 0.0027],
        ...,
        [0.0171, 0.0198, 0.0355,  ..., 0.0054, 0.0179, 0.0225],
        [0.0245, 0.0188, 0.0498,  ..., 0.0251, 0.0400, 0.0473],
        [0.0370, 0.0261, 0.0231,  ..., 0.0228, 0.0370, 0.0462]])


In [27]:
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
         if cuda:
            x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            outputs = model(x)
            loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
            losses.append(loss.item())
            loss.backward()
            optimizer.step()
    print("Step: {}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

Step: 0 Loss: 1.0122457243919372
Step: 1 Loss: 0.9515471235275269
Step: 2 Loss: 0.9182203866004944
Step: 3 Loss: 0.8982041790008545
Step: 4 Loss: 0.8865507208824158
Step: 5 Loss: 0.8784835305213928
Step: 6 Loss: 0.8729431324005127
Step: 7 Loss: 0.869108350276947
Step: 8 Loss: 0.8663018528938293
Step: 9 Loss: 0.8639363633155823
Step: 10 Loss: 0.8624344347000122
Step: 11 Loss: 0.8608925952911377
Step: 12 Loss: 0.8602116648674011
Step: 13 Loss: 0.8592595704078674
Step: 14 Loss: 0.8584215323448181
Step: 15 Loss: 0.8576731983184814
Step: 16 Loss: 0.8572587082862854
Step: 17 Loss: 0.8566233036994934
Step: 18 Loss: 0.8561114504814148
Step: 19 Loss: 0.8555982717514038
Step: 20 Loss: 0.8554407277107239
Step: 21 Loss: 0.8552981014251709
Step: 22 Loss: 0.8546996611595153
Step: 23 Loss: 0.854435955619812
Step: 24 Loss: 0.8541381181716919
Step: 25 Loss: 0.8536113221168518
Step: 26 Loss: 0.8535857684135437
Step: 27 Loss: 0.8529739855766296
Step: 28 Loss: 0.8525595029830932
Step: 29 Loss: 0.851966186

In [28]:
model.eval()  # Set the model to evaluation mode

test_losses = []
with torch.no_grad():
    for x, y in test_loader:
        if cuda:
            x, y = x.cuda(), y.cuda()
        outputs = model(x)
        loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
        test_losses.append(loss.item())

average_test_loss = sum(test_losses) / len(test_losses)
print("Average Test Loss:", average_test_loss)

Average Test Loss: 2.1489775674358294


In [29]:
c = 0
uw = 0
iw = 0
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
        if c == 0:
          uw = param.data
          c +=1
        else:
          iw = param.data

user_factors.weight tensor([[ 0.7607,  1.5361,  0.9437,  ...,  0.5016,  0.6973,  0.8558],
        [ 0.6135,  0.6085,  0.7962,  ...,  0.9106,  0.8538,  0.7984],
        [-0.1347,  1.3452, -0.4280,  ...,  0.6075,  1.0871,  0.0536],
        ...,
        [ 0.3408,  0.9082,  1.2186,  ...,  0.6981,  0.9231,  1.0307],
        [-0.1854,  0.5670,  0.1655,  ...,  0.9257,  0.2514,  0.8772],
        [ 1.3655,  1.3807,  1.1379,  ...,  1.0150,  0.3242, -0.0611]],
       device='cuda:0')
item_factors.weight tensor([[ 0.3745, -0.1107,  0.5241,  ...,  0.5068, -0.4985,  1.0398],
        [ 0.2658,  0.7900,  0.6408,  ...,  0.4347,  1.3743,  0.5819],
        [ 0.3260,  0.6852,  0.9213,  ...,  0.1527,  1.0339,  0.0211],
        ...,
        [ 0.0171,  0.0198,  0.0355,  ...,  0.0054,  0.0179,  0.0225],
        [ 0.0245,  0.0188,  0.0498,  ...,  0.0251,  0.0400,  0.0473],
        [ 0.0370,  0.0261,  0.0231,  ...,  0.0228,  0.0370,  0.0462]],
       device='cuda:0')


In [30]:
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

In [31]:
from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)



In [32]:
len(kmeans.labels_)

1682

In [33]:
# The purpose of this section is to analyze movie clusters obtained from KMeans clustering.

# It iterates through each cluster and prints the top 10 movies in each cluster based on rating count.

# Analyzing movie clusters and their genres based on KMeans clustering results
# Iterate through each cluster (assuming there are 10 clusters)
for cluster in range(10):
    print("Cluster #{}".format(cluster))
    movs = []

    # Iterate through movie indices in the current cluster
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        try:
            # Convert the index back to the original movie ID
            movid = train_set.idx2movieid[movidx]

            # Count the number of ratings for the current movie
            rat_count = ratings_df.loc[ratings_df['movie_id'] == movid].count()[0]

            # Append movie name and rating count to the list
            movs.append((movie_names[movid], rat_count))
        except:
            pass

    # Display the top 10 movies in the cluster based on rating count
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])

Cluster #0
	 Broken Arrow (1996)
	 Volcano (1997)
	 Eraser (1996)
	 Devil's Advocate, The (1997)
	 Long Kiss Goodnight, The (1996)
	 My Best Friend's Wedding (1997)
	 Sleepers (1996)
	 Happy Gilmore (1996)
	 Batman Returns (1992)
	 Jungle2Jungle (1997)
Cluster #1
	 Star Wars (1977)
	 Return of the Jedi (1983)
	 Empire Strikes Back, The (1980)
	 Princess Bride, The (1987)
	 Monty Python and the Holy Grail (1974)
	 Men in Black (1997)
	 Alien (1979)
	 Aliens (1986)
	 Blade Runner (1982)
	 Chasing Amy (1997)
Cluster #2
	 Devil's Own, The (1997)
	 Top Gun (1986)
	 Lion King, The (1994)
	 Murder at 1600 (1997)
	 Sleepless in Seattle (1993)
	 Sabrina (1995)
	 Clear and Present Danger (1994)
	 Ghost (1990)
	 Pretty Woman (1990)
	 While You Were Sleeping (1995)
Cluster #3
	 Tales from the Hood (1995)
	 Lawnmower Man 2: Beyond Cyberspace (1996)
	 Amityville II: The Possession (1982)
	 Mr. Magoo (1997)
	 Vampire in Brooklyn (1995)
	 Robocop 3 (1993)
	 Mighty Morphin Power Rangers: The Movie (199