In [1]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.optim as optim
import torch.nn.functional as F

from tqdm import tqdm

import pandas as pd
import numpy as np

import os, time, random
from pathlib import Path
from collections import Counter
from typing import Callable, List
from functools import cache
import re
import sys


In [2]:
# Resolve the data folder relative to where the notebook is launched:
# it is expected to sit next to the notebook's directory.
cur_dir = Path.cwd()
data_dir = cur_dir.parent.joinpath('data')

In [3]:
list(data_dir.iterdir())

[PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/tag.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/movie.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/link.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/rating.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/movies.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/genome_tags.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/genome_scores.csv')]

## Exploring Movie df

In [4]:
movie_df = pd.read_csv(data_dir/'movie.csv')

In [5]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


In [6]:
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
movie_df.isna().sum() # no nulls

movieId    0
title      0
genres     0
dtype: int64

In [8]:
movie_df.duplicated().sum() #no duplicates

0

In [9]:
movie_df.title.nunique()

27262

In [10]:

# Calculate the value counts for each movie title
title_value_counts = movie_df['title'].value_counts()

# Filter titles that appear more than once
duplicate_titles = title_value_counts[title_value_counts > 1].index.tolist()

print(duplicate_titles)

['Aladdin (1992)', 'Johnny Express (2014)', 'Chaos (2005)', 'Hamlet (2000)', '20,000 Leagues Under the Sea (1997)', 'Darling (2007)', 'Casanova (2005)', 'Paradise (2013)', 'Beneath (2013)', 'Girl, The (2012)', 'Clear History (2013)', 'Emma (1996)', 'Offside (2006)', 'Blackout (2007)', 'Men with Guns (1997)', 'War of the Worlds (2005)']


Some titles appear more than once under different `movieId` values (16 titles out of 27,278 rows), but this has little effect on what follows.

The genres column separates genres with `|` and no spaces, so let's see which unique genres are present.

In [11]:
# Collect the distinct genre tokens that occur anywhere in the genres column.
# NOTE: the genres are re-joined with spaces and then split on whitespace, so the
# multi-word placeholder "(no genres listed)" ends up as three separate tokens.
all_genres = movie_df.genres.apply(lambda x : ' '.join(str(x).split('|'))).values.tolist() # replace each '|' separator with a space
all_genres = ' '.join(set(all_genres)).split() # join the distinct genre strings and break them into words
all_genres = set(all_genres)  # keep each word once

In [12]:
print(all_genres, len(all_genres))

{'Sci-Fi', 'IMAX', 'Action', 'War', 'Horror', 'Film-Noir', 'Children', 'listed)', 'Comedy', 'Adventure', '(no', 'Animation', 'Fantasy', 'Musical', 'Documentary', 'Mystery', 'Drama', 'Western', 'Crime', 'Thriller', 'Romance', 'genres'} 22


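There are 19 actual genres (counting IMAX) plus one placeholder, `(no genres listed)`, for movies without a genre. Because the strings were split on whitespace, that placeholder shows up as the three tokens '(no', 'genres' and 'listed)', which is why the set has 22 entries.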

In [13]:
# Build a 'tags' column: one lowercase sentence per movie made of its title
# followed by its genres, with the '|' separators turned into spaces.
movie_df['tags'] = (
    movie_df['title'] + ' ' + movie_df['genres'].map(lambda genres: str(genres).replace('|', ' '))
).str.lower()

In [14]:
# movie_df['Title'] = movie_df.title.apply(lambda x : str(x).split('(')[0])
# movie_df['Year'] = movie_df.title.apply(lambda x : (str(x).split('(')[-1]).strip(')'))

In [15]:
movie_df.sample(22)

Unnamed: 0,movieId,title,genres,tags
11585,50610,Beer League (2006),Comedy,beer league (2006) comedy
9849,32168,Or (a.k.a. My Treasure) (2004),Drama,or (a.k.a. my treasure) (2004) drama
18703,93139,Mega Shark vs. Crocosaurus (2010),Action|Adventure|Horror,mega shark vs. crocosaurus (2010) action adven...
26115,125571,The Court-Martial of Jackie Robinson,(no genres listed),the court-martial of jackie robinson (no genre...
10193,33901,Satan's Little Helper (2004),Comedy|Horror,satan's little helper (2004) comedy horror
12833,60524,August (2008),Drama,august (2008) drama
20286,99741,"Company You Keep, The (2012)",Thriller,"company you keep, the (2012) thriller"
8278,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,"incredibles, the (2004) action adventure anima..."
15085,76709,Spider-Man: The Ultimate Villain Showdown (2002),Animation,spider-man: the ultimate villain showdown (200...
22859,109290,The African (1983),Adventure|Comedy,the african (1983) adventure comedy


## Exploring User data

In [16]:
# Load only the three columns used below (user, movie, rating) from the ratings file.
user_df = pd.read_csv(data_dir/'rating.csv', usecols=['userId','movieId','rating'])

In [17]:
user_df.userId.max(), user_df.userId.min(), user_df.userId.nunique()

(138493, 1, 138493)

In [18]:
# These columns hold small values, so the 64-bit types pandas picks by default are
# wasteful; downcasting to 32-bit roughly halves the memory used by the ratings frame.
user_df = user_df.astype({'movieId': 'int32', 'userId': 'int32', 'rating': 'float32'})




In [19]:
user_df.userId.max(), user_df.userId.min(), user_df.userId.nunique()

(138493, 1, 138493)

In [20]:
user_df.shape #(20000263,3)

(20000263, 3)

In [21]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int32  
 1   movieId  int32  
 2   rating   float32
dtypes: float32(1), int32(2)
memory usage: 228.9 MB


In [22]:
user_df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


# Text Vectorization

In [23]:
def clean_text(x: str) -> str:
    """Strip punctuation from ``x`` and lowercase what remains.

    Every character that is neither a word character nor whitespace is removed.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', x)
    return without_punctuation.lower()

In [24]:
a = 'helo#$#@$#%$@%@#$ 44 sir'
clean_text(a)

'helo 44 sir'

In [25]:
class Vocabulary:
    """Two-way mapping between tokens and contiguous integer ids.

    Id 0 is reserved for the padding token ``<PAD>`` and id 1 for the unknown
    token ``<UNK>``. Caller-supplied special tokens follow from id 2, and words
    learned by :meth:`build_vocab` are appended after those.
    """

    def __init__(self, special_tokens: List[str]):
        """Create a vocabulary containing only the reserved and special tokens.

        Args:
            special_tokens: Extra tokens registered right after the reserved
                ones. ``None`` is treated as an empty list. Tokens that are
                already registered (duplicates, or the reserved names) are
                skipped so ids stay contiguous and ``<UNK>``/``<PAD>`` keep
                their fixed ids.
        """
        self.word_to_idx = {}
        self.idx_to_word = {}
        self.counter = Counter()

        self.UNK_TOKEN = '<UNK>'
        self.UNK = 1
        self.PAD_TOKEN = '<PAD>'
        self.PAD = 0

        self.word_to_idx[self.UNK_TOKEN] = self.UNK
        self.idx_to_word[self.UNK] = self.UNK_TOKEN

        self.word_to_idx[self.PAD_TOKEN] = self.PAD
        self.idx_to_word[self.PAD] = self.PAD_TOKEN

        self.vocab_size = 2
        for token in special_tokens or []:
            if token not in self.word_to_idx:
                self.add_word_to_vocab(token)

    def build_vocab(self, tokenized_data, max_tokens, min_freq):
        """Count the tokens in ``tokenized_data`` and register the selected ones.

        Args:
            tokenized_data: Iterable of token sequences (one per sentence).
            max_tokens: Upper bound on the total vocabulary size, reserved and
                special tokens included, or ``None`` for no bound. When set,
                words are added from most to least frequent until the bound is
                reached; nothing is added if the vocabulary is already at or
                above it.
            min_freq: Minimum number of occurrences a word needs to be added
                (``None`` disables the filter). Applied in both modes.
        """
        self.counter = Counter()
        for words in tokenized_data:
            self.counter.update(words)

        if max_tokens is not None:
            # most_common() yields words in descending frequency, so the first word
            # below min_freq means every remaining word is below it as well.
            for word, freq in self.counter.most_common():
                if self.vocab_size >= max_tokens:
                    break
                if min_freq is not None and freq < min_freq:
                    break
                if word not in self.word_to_idx:
                    self.add_word_to_vocab(word)
        else:
            # Without a size bound, ids follow the order of first appearance.
            for word, freq in self.counter.items():
                if (min_freq is None or freq >= min_freq) and word not in self.word_to_idx:
                    self.add_word_to_vocab(word)

    def add_word_to_vocab(self, word):
        """Register ``word`` under the next free id."""
        self.word_to_idx[word] = self.vocab_size
        self.idx_to_word[self.vocab_size] = word
        self.vocab_size += 1

    def __len__(self):
        """Return the number of registered tokens (reserved ones included)."""
        return self.vocab_size

In [26]:


class IntegerVectorizer:
    """Convert sentences to lists of integer token ids and back.

    Call :meth:`adapt` on a corpus to build the vocabulary, then call the
    instance (or :meth:`transform`) to encode sentences and
    :meth:`reverse_transform` to decode them.
    """

    def __init__(self, 
                 tokenizer: Callable[[str], List[str]] = None,
                 preprocessing_func: Callable[[str], str] = None,
                 max_tokens=None,
                 min_freq=1,
                 special_tokens: List[str] = None,
                 max_seq_length=None,
                 pad_to_max=False):
        """Configure the vectorizer.

        Args:
            tokenizer: Splits a preprocessed sentence into tokens; when omitted,
                sentences are split on whitespace.
            preprocessing_func: Applied to every whitespace-separated word that
                is not a special token, before tokenization.
            max_tokens: Passed to ``Vocabulary.build_vocab`` as the size bound.
            min_freq: Passed to ``Vocabulary.build_vocab`` as the frequency threshold.
            special_tokens: Tokens registered in the vocabulary up front and
                exempt from preprocessing; ``<UNK>`` and ``<PAD>`` are filtered
                out because the vocabulary reserves them.
            max_seq_length: Encoded sequences longer than this are truncated.
            pad_to_max: When true (and ``max_seq_length`` is set), shorter
                sequences are right-padded with the PAD id.
        """
        self.min_freq = min_freq
        self.max_tokens = max_tokens
        self.max_seq_length = max_seq_length
        self.tokenizer = tokenizer
        self.preprocessing_func = preprocessing_func
        self.reserved_tokens = ['<UNK>', '<PAD>']
        self.special_tokens = [token for token in special_tokens if token not in self.reserved_tokens] if special_tokens else []
        self.pad_to_max = pad_to_max

        self.vocab = Vocabulary(self.special_tokens)
        self.tokenized_data = []

    def adapt(self, data):
        """Build the vocabulary from ``data`` (a sentence or an iterable of sentences)."""
        self.tokenized_data = self.tokenize_data_generator(data)
        self.vocab.build_vocab(self.tokenized_data, self.max_tokens, self.min_freq)
        print('Vocab size:', len(self.vocab))

    def __call__(self, data, reverse=False):
        """Encode ``data``, or decode it when ``reverse`` is true."""
        if reverse:
            return self.reverse_transform(data)
        else:
            return self.transform(data)

    def preprocess_sentence(self, sentence):
        """Apply ``preprocessing_func`` word by word, leaving special tokens untouched."""
        if self.preprocessing_func:
            words = sentence.split()
            preprocessed_words = [self.preprocessing_func(word) if word not in self.special_tokens else word for word in words]
            return " ".join(preprocessed_words)
        return sentence

    def tokenize_data_generator(self, data):
        """Lazily yield the token list of each sentence in ``data``."""
        # A bare string would otherwise be iterated character by character.
        if isinstance(data, str):
            data = [data]
        for sentence in data:
            sentence = self.preprocess_sentence(sentence)
            words = self.tokenizer(sentence) if self.tokenizer else sentence.split()
            yield words

    def transform(self, data):
        """Encode sentences as id lists; out-of-vocabulary tokens map to the UNK id.

        Args:
            data: A sentence or an iterable of sentences.

        Returns:
            A list with one id list per sentence, truncated/padded according to
            ``max_seq_length`` and ``pad_to_max``.
        """
        # The generator is consumed by the loop below; the attribute is only kept
        # because earlier versions exposed it.
        self.tokenized_data = self.tokenize_data_generator(data)
        vectorized_data = []
        for sentence in self.tokenized_data:
            vectorized_sentence = [self.vocab.word_to_idx.get(word, self.vocab.UNK) for word in sentence]
            vectorized_sentence = self.adjust_sequence_length(vectorized_sentence)
            vectorized_data.append(vectorized_sentence)
        return vectorized_data

    def adjust_sequence_length(self, sequence):
        """Return ``sequence`` truncated or right-padded to ``max_seq_length``.

        The input list is never modified. Without ``max_seq_length`` the sequence
        is returned as is; padding only happens when ``pad_to_max`` is set.
        """
        if self.max_seq_length is not None:
            if len(sequence) < self.max_seq_length:
                if self.pad_to_max:
                    sequence = sequence + [self.vocab.PAD] * (self.max_seq_length - len(sequence))
            elif len(sequence) > self.max_seq_length:
                sequence = sequence[:self.max_seq_length]
        return sequence

    def reverse_transform(self, vectorized_data: List[List[int]]) -> List[str]:
        """Decode id lists back into space-joined sentences.

        PAD ids are dropped; ids that are not in the vocabulary decode to the
        UNK token instead of raising ``KeyError``.
        """
        original_data = []
        for vector in vectorized_data:
            sentence = [
                self.vocab.idx_to_word.get(idx, self.vocab.UNK_TOKEN)
                for idx in vector
                if idx != self.vocab.PAD
            ]
            original_data.append(" ".join(sentence))
        return original_data


In [27]:
# Small smoke test of IntegerVectorizer on a handful of sentences.
data = [
    "This is a sample sentence (23). <UNK>",
    "<START> Another example sentence. <END>",
    "<START> This is another sentence. <END>",
    '3 3 3 3 3 3 '
]
# NOTE(review): min_freq, max_tokens and special_tokens are defined here but never
# passed to IntegerVectorizer below, so the constructor defaults apply and
# '<START>'/'<END>' are cleaned like ordinary words (see 'start'/'end' in the output).
min_freq = 2
max_tokens = None
special_tokens = ['<START>', '<END>', '<UNK>','<PAD>']


vectorizer = IntegerVectorizer(preprocessing_func=clean_text, max_seq_length=10, pad_to_max=False)
vectorizer.adapt(data)
a = vectorizer(data)  # encode each sentence as a list of token ids
b = vectorizer.reverse_transform(a)  # decode the ids back to text


print("Original data:", data)
print("Vectorized data:", a)
print('reverse vec: ', b)

Vocab size: 14
Original data: ['This is a sample sentence (23). <UNK>', '<START> Another example sentence. <END>', '<START> This is another sentence. <END>', '3 3 3 3 3 3 ']
Vectorized data: [[2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 6, 12], [9, 2, 3, 10, 6, 12], [13, 13, 13, 13, 13, 13]]
reverse vec:  ['this is a sample sentence 23 unk', 'start another example sentence end', 'start this is another sentence end', '3 3 3 3 3 3']


In [28]:
def sizeof_fmt(num, suffix="B"):
    """Format ``num`` as a human-readable size using binary (1024-based) prefixes.

    Adapted from Fred Cirera's answer, https://stackoverflow.com/a/1094933/1870254.
    """
    prefixes = ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi")
    position = 0
    # Scale down by 1024 until the value fits under the current prefix
    # or the listed prefixes run out.
    while position < len(prefixes) and not (abs(num) < 1024.0):
        num /= 1024.0
        position += 1
    if position == len(prefixes):
        return "%.1f %s%s" % (num, "Yi", suffix)
    return "%3.1f %s%s" % (num, prefixes[position], suffix)
    

# Print the ten largest names in the current namespace, biggest first.
# Note: sys.getsizeof reports the size attributed to the object itself, not the
# size of other objects it refers to.
for name, size in sorted(
    ((name, sys.getsizeof(value)) for name, value in locals().items()),
    key=lambda x: -x[1],  # negate the size to sort in descending order
)[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                       user_df: 228.9 MiB
                      movie_df:  7.0 MiB
            title_value_counts:  2.5 MiB
                           _15:  5.9 KiB
                           _ii:  3.0 KiB
                          _i26:  3.0 KiB
                    all_genres:  2.2 KiB
                       Counter:  1.6 KiB
                    Vocabulary:  1.6 KiB
             IntegerVectorizer:  1.6 KiB


In [29]:
# Define a custom dataset class
class MovieRatingDataset(Dataset):
    """Dataset view over a ratings DataFrame.

    The frame must provide the columns ``userId``, ``tagvector`` (a sequence of
    token ids) and ``rating``.
    """

    def __init__(self, dataframe):
        """Keep a reference to ``dataframe``; nothing is copied."""
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of rows, i.e. the number of ratings."""
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        """Return ``(user_id, movie_tags, rating)`` for the row at position ``idx``.

        ``movie_tags`` is a ``torch.long`` tensor built from ``tagvector``; the
        other two values are returned as stored in the row.
        """
        # Look the row up once: every .iloc[idx] builds a fresh Series.
        row = self.dataframe.iloc[idx]
        movie_tags = torch.tensor(row['tagvector'], dtype=torch.long)
        return row['userId'], movie_tags, row['rating']


# Model

In [31]:
class RecommenderModel(nn.Module):
    """Rating predictor built from a user embedding and token embeddings.

    The user embedding is multiplied element-wise with the embedding of every
    token in the movie's tag sequence, the products are averaged over the token
    axis, and a linear layer maps the result to a single score.
    """

    def __init__(self, num_users, num_tokens, embedding_dim):
        """Create the embedding tables and the output layer.

        Args:
            num_users: Largest user id that will be looked up; the table gets
                ``num_users + 1`` rows so that id can be used directly as an index.
            num_tokens: Largest token id that will be looked up; the table gets
                ``num_tokens + 1`` rows.
            embedding_dim: Size of both embeddings.
        """
        super().__init__()

        self.user_embedding = nn.Embedding(num_users+1, embedding_dim)
        self.movie_embedding = nn.Embedding(num_tokens+1, embedding_dim)
        self.out = nn.Linear(embedding_dim,1)

    def forward(self, user_ids, movie_tags, debug=False):
        """Score each (user, tag sequence) pair.

        Args:
            user_ids: Integer tensor of shape ``(batch, 1)`` or ``(batch,)``.
            movie_tags: Integer tensor of shape ``(batch, seq_len)``; a 1-D tensor
                is treated as one token per sample.
            debug: When true, print the intermediate tensor shapes.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        user_ids = user_ids.to(torch.long)
        movie_tags = movie_tags.to(torch.long)

        # Without an explicit sequence axis the product below would either fail to
        # broadcast or broadcast across the batch, so add it for 1-D inputs.
        if user_ids.dim() == 1:
            user_ids = user_ids.unsqueeze(1)
        if movie_tags.dim() == 1:
            movie_tags = movie_tags.unsqueeze(1)

        user_emb = self.user_embedding(user_ids)       # (batch, 1, dim)
        movie_emb = self.movie_embedding(movie_tags)   # (batch, seq_len, dim)
        interaction = user_emb * movie_emb             # user vector broadcast over tokens
        # Every position is averaged, including any padding ids in movie_tags:
        # the embedding has no padding_idx.
        x = interaction.mean(dim=1)
        output = self.out(x)

        if debug:
            print('user_emb.shape: ',user_emb.shape)
            print('movie_emb.shape: ',movie_emb.shape)
            print('interaction.shape: ',interaction.shape)
            print('output.shape:',output.shape)

        return output

In [32]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [33]:
model = RecommenderModel(10, 20, 8)
model

RecommenderModel(
  (user_embedding): Embedding(11, 8)
  (movie_embedding): Embedding(21, 8)
  (out): Linear(in_features=8, out_features=1, bias=True)
)

In [34]:
test_out = model(torch.randint(1,10,(8,1)), torch.randint(1,20,(8,1)))
test_out.shape

torch.Size([8, 1])

# Training

In [35]:
# Fit the vocabulary on every movie's tag sentence
tag_sentences = movie_df['tags'].tolist()
vectorizer = IntegerVectorizer(preprocessing_func=clean_text, max_seq_length=10, pad_to_max=True)
vectorizer.adapt(tag_sentences)

# Encode all sentences in one transform call (instead of one call per row) and
# store each padded id list next to its movie.
movie_df['tagvector'] = pd.Series(vectorizer.transform(tag_sentences), index=movie_df.index)

Vocab size: 22693


In [36]:
# Attach each movie's tag vector to its ratings.
# validate='many_to_one' makes the merge fail loudly if movie_df ever contains a
# repeated movieId, which would otherwise silently duplicate rating rows.
user_df = user_df.merge(movie_df[['movieId','tagvector']], on='movieId', validate='many_to_one')

In [37]:
user_df.userId.max(), user_df.userId.min()

(138493, 1)

In [38]:
# Split the ratings into train and test sets.
# train_size (0.9) and test_size (0.001) are set independently and do not sum to 1,
# so the rows that land in neither split are left unused.
train_df, test_df = train_test_split(user_df[['userId','tagvector','rating', 'movieId']], train_size=.9, test_size=.001, random_state=2)
print(train_df.shape, test_df.shape)


(18000236, 4) (20001, 4)


In [58]:
# Mini-batch size shared by both loaders
batch_size = 32

# Training batches are reshuffled on every pass over the data
train_dataset = MovieRatingDataset(train_df)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Evaluation gains nothing from shuffling; a fixed order keeps the labels and
# predictions returned by inference() aligned with test_df from run to run.
test_dataset = MovieRatingDataset(test_df)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Sanity check: look at a single training batch
for batch in train_dataloader:
    user_ids, movie_tags, ratings = batch
    print("User IDs:", user_ids)
    print("Movie Tags:", movie_tags)
    print("Ratings:", ratings)
    break  # only print the first batch

User IDs: tensor([118432,  41859,  81172,  46179,  12887,  39382,  26227,  24269, 129775,
          3397, 137993,  51370, 121884, 134225,  52948,  26612, 112988,  43545,
         32340,  68625,  74122,  26425,  81984, 136204, 114713, 102242,  32865,
         19353,   5074, 135724,   4040,   7032], dtype=torch.int32)
Movie Tags: tensor([[  988,   989,   566,    21,   134,    26,     7,    14,     0,     0],
        [ 1745,    21,  1700,  1050,  1173,     8,     0,     0,     0,     0],
        [  724,   725,   726,   727,   421,    18,     0,     0,     0,     0],
        [  314,   569,   570,  1152,    21,  2091,  2092,  2053,  1320,    26],
        [ 5124,   667,  1722,    18,   126,    14,     0,     0,     0,     0],
        [  129,   341,   421,    26,    18,     0,     0,     0,     0,     0],
        [  991,   171,  1702,  1288,    26,     5,    62,    14,    28,     0],
        [ 1604,    13,  1464,     8,    18,     0,     0,     0,     0,     0],
        [  777,   281,  1073, 

In [40]:
user_df.userId.nunique(),movie_df.movieId.nunique(), len(vectorizer.vocab.word_to_idx)

(138493, 27278, 22693)

In [41]:
print('max userid value: ',user_df.userId.max())
print('unique userid: ',user_df.userId.nunique())


max userid value:  138493
unique userid:  138493


In [42]:
print('max vocab value: ',max(vectorizer.vocab.word_to_idx.values()))
print('vocab size: ',len(vectorizer.vocab))


max vocab value:  22692
vocab size:  22693


In [None]:
# Initialize the model, optimizer, and loss function.
# Raw user ids are used directly as embedding indices, so the table has to be sized
# by the largest id; the number of distinct users only matches it while ids are
# contiguous and start at 1.
num_users = int(user_df.userId.max())
num_tokens = len(vectorizer.vocab)  # number of token ids in the vocabulary
dim = 8  # embedding size
model = RecommenderModel(num_users, num_tokens, dim)
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_function = nn.MSELoss()

# Number of training epochs
num_epochs = 1

In [160]:
def train_model(model, dataloader, optimizer, loss_function, num_epochs=10, device='cpu', data_percent=1.0, steps_per_epoch=None):
    """Train ``model`` on batches of ``(user_ids, movie_tags, ratings)``.

    Args:
        model: Module called as ``model(user_ids, movie_tags)``.
        dataloader: Iterable of batches with a length; it is re-iterated from
            the start whenever it runs out.
        optimizer: Optimizer over the model's parameters.
        loss_function: Callable taking ``(predictions, ratings)``.
        num_epochs: Number of epochs to run.
        device: Device the model and batches are moved to.
        data_percent: Fraction of ``len(dataloader)`` used as the number of
            steps per epoch when ``steps_per_epoch`` is not given.
        steps_per_epoch: Exact number of steps per epoch; overrides
            ``data_percent``.

    Returns:
        A list with one dict per epoch holding ``epoch``, ``loss`` (mean batch
        loss) and ``accuracy`` (share of predictions within 0.5 of the rating).

    Raises:
        ValueError: If steps are requested but the dataloader has no batches.
    """
    model.to(device)
    model.train()  # inference() leaves the model in eval mode; switch back
    print(f'{model.__class__.__name__} Running on : {device}')

    data_size = int(data_percent * len(dataloader))
    steps = data_size if steps_per_epoch is None else steps_per_epoch
    if steps > 0 and len(dataloader) == 0:
        raise ValueError('dataloader yields no batches')

    # One iterator is shared by all epochs, so an epoch that uses only part of
    # the data continues where the previous one stopped.
    batches = iter(dataloader)
    history = []

    for epoch in range(num_epochs):
        total_loss = 0.0
        correct_predictions = 0
        total_samples = 0
        steps_done = 0

        epoch_progress = tqdm(range(steps), desc=f"Epoch [{epoch+1:2}/{num_epochs:2}]")

        # Iterating the tqdm wrapper already advances the bar; no manual update().
        for _ in epoch_progress:
            try:
                batch = next(batches)
            except StopIteration:
                # Restart from the loader itself: an exhausted iterator cannot be
                # rewound by calling iter() on it.
                batches = iter(dataloader)
                batch = next(batches)

            user_ids, movie_tags, ratings = batch

            user_ids = user_ids.view(-1, 1).to(device)
            movie_tags = movie_tags.to(device)

            optimizer.zero_grad()

            # Flatten to (batch,) explicitly; squeeze() would turn a batch of one
            # into a 0-d tensor.
            outputs = model(user_ids, movie_tags).reshape(-1)
            ratings = ratings.to(device).reshape(-1).to(outputs.dtype)

            loss = loss_function(outputs, ratings)

            loss.backward()
            optimizer.step()

            correct_predictions += torch.sum((outputs.detach() - ratings).abs() < 0.5).item()
            total_samples += len(ratings)
            total_loss += loss.item()
            steps_done += 1

            formatted_loss = f"{loss.item():.8f}"
            formatted_accuracy = f"{correct_predictions / total_samples:.8f}"

            epoch_progress.set_postfix({"Loss": formatted_loss, "Accuracy": formatted_accuracy})

        # Average over the steps that actually ran.
        average_loss = total_loss / steps_done if steps_done else float('nan')
        accuracy = correct_predictions / total_samples if total_samples else float('nan')
        history.append({'epoch': epoch + 1, 'loss': average_loss, 'accuracy': accuracy})

        print(f"Epoch [{epoch+1:2}/{num_epochs:2}] - Average Loss: {average_loss:.8f} - Average Accuracy: {accuracy:.8f}")
        print()

    return history


In [162]:
def inference(model, dataloader, device='cpu'):
    """Run ``model`` over ``dataloader`` and report MSE and MAE.

    Args:
        model: Module called as ``model(user_ids, movie_tags)``; it is switched
            to eval mode and moved to ``device``.
        dataloader: Iterable of ``(user_ids, movie_tags, ratings)`` batches.
        device: Device used for the forward passes.

    Returns:
        ``(labels, predictions)``: two flat lists with one entry per sample, in
        the order the dataloader produced them.
    """
    model.eval()
    model.to(device)

    labels = []
    predictions = []

    with torch.no_grad():
        for user_ids, movie_tags, ratings in dataloader:
            user_ids = user_ids.view(-1, 1).to(device)
            movie_tags = movie_tags.to(device)

            # Flatten to (batch,) so a batch of one is handled like any other.
            outputs = model(user_ids, movie_tags).reshape(-1)

            # The ratings are only collected, never used in a computation on
            # `device`, so they are not moved there first.
            labels.extend(ratings.cpu().numpy())
            predictions.extend(outputs.cpu().numpy())

    print('Mean Squared Error: ',mean_squared_error(labels, predictions))
    print('Mean Absolute Error: ',mean_absolute_error(labels, predictions))
    return labels, predictions


In [163]:
# train_model(model, train_dataloader,  optimizer, loss_function, num_epochs=1, device=device, data_percent=0.03, steps_per_epoch=None)

In [176]:
# Save the model weights
model_path = cur_dir.parent/'models'
model_path.mkdir(parents=True, exist_ok=True)  # torch.save does not create missing directories
torch.save(model.state_dict(), model_path/'model.pth')

/home/t/aproject/movie-recommender-system-collaborative-filtering/models


In [165]:
labels, predictions = inference(model, test_dataloader)

In [171]:
print('mean_squared_error: ',mean_squared_error(labels, predictions))
print('mean_absolute_error: ',mean_absolute_error(labels, predictions))

mean_squared_error:  0.8249118893361443
mean_absolute_error:  0.695049490650191
