In [142]:
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import os, time, random
from pathlib import Path
from collections import Counter
from typing import Callable, List
from functools import cache

In [143]:
cur_dir = Path('.').absolute()
data_dir=cur_dir.parent/ 'data'

In [144]:
list(data_dir.iterdir())

[PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/tag.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/movie.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/link.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/rating.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/movies.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/genome_tags.csv'),
 PosixPath('/home/t/aproject/movie-recommender-system-collaborative-filtering/data/genome_scores.csv')]

## Exploring Movie df

In [145]:
movie_df = pd.read_csv(data_dir/'movie.csv')

In [146]:
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


In [147]:
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [148]:
movie_df.isna().sum() # no nulls

movieId    0
title      0
genres     0
dtype: int64

In [149]:
movie_df.duplicated().sum() #no duplicates

0

In [150]:
movie_df.title.nunique()

27262

In [151]:

# Calculate the value counts for each movie title
title_value_counts = movie_df['title'].value_counts()

# Filter titles that appear more than once
duplicate_titles = title_value_counts[title_value_counts > 1].index.tolist()

print(duplicate_titles)

['Aladdin (1992)', 'Johnny Express (2014)', 'Chaos (2005)', 'Hamlet (2000)', '20,000 Leagues Under the Sea (1997)', 'Darling (2007)', 'Casanova (2005)', 'Paradise (2013)', 'Beneath (2013)', 'Girl, The (2012)', 'Clear History (2013)', 'Emma (1996)', 'Offside (2006)', 'Blackout (2007)', 'Men with Guns (1997)', 'War of the Worlds (2005)']


Some movies have multiple entries with different `movieid` , but it doesn't affect much 

so in genres column there seems to have no spaces bw genres, lets see want unique genres as there

In [152]:
all_genres = movie_df.genres.apply(lambda x : ' '.join(str(x).split('|'))).values.tolist() # split from |
all_genres = ' '.join(set(all_genres)).split() # join all strings and break them into words
all_genres = set(all_genres)  # make a set to find unique ones

In [153]:
print(all_genres, len(all_genres))

{'Thriller', 'Drama', 'listed)', 'War', 'Animation', 'Fantasy', 'Documentary', 'Crime', 'Action', 'IMAX', 'Romance', 'Western', 'Musical', 'Children', 'Comedy', 'genres', 'Sci-Fi', 'Adventure', '(no', 'Film-Noir', 'Horror', 'Mystery'} 22


there are 20 genres and 1 for movies with no genre (which is (no listed)) which is broken as '(no' and 'listed)'

In [154]:
#create a tag columns that contains all information about movie in a sentence
movie_df['tags'] = movie_df['title'] + ' ' + movie_df['genres'].apply(lambda x: ' '.join(str(x).split('|')))
movie_df['tags'] = movie_df['tags'].str.lower()

In [155]:
# movie_df['Title'] = movie_df.title.apply(lambda x : str(x).split('(')[0])
# movie_df['Year'] = movie_df.title.apply(lambda x : (str(x).split('(')[-1]).strip(')'))

In [156]:
movie_df.sample(22)

Unnamed: 0,movieId,title,genres,tags
1975,2059,"Parent Trap, The (1998)",Children|Comedy|Romance,"parent trap, the (1998) children comedy romance"
22506,107914,Fade To Black (1980),Comedy|Horror|Thriller,fade to black (1980) comedy horror thriller
4639,4735,Ghosts of Mars (2001),Horror|Sci-Fi|Thriller,ghosts of mars (2001) horror sci-fi thriller
17760,89230,Tooth & Nail (2007),Drama|Horror|Sci-Fi,tooth & nail (2007) drama horror sci-fi
19572,96923,2-Headed Shark Attack (2012),Comedy|Horror,2-headed shark attack (2012) comedy horror
3813,3906,Under Suspicion (2000),Crime|Thriller,under suspicion (2000) crime thriller
7960,8643,"Cinderella Story, A (2004)",Comedy|Romance,"cinderella story, a (2004) comedy romance"
26956,129532,Island (2011),Drama|Mystery|Thriller,island (2011) drama mystery thriller
27236,131122,Love Exposure (2007),Action|Comedy|Drama|Romance,love exposure (2007) action comedy drama romance
2926,3012,Battling Butler (1926),Comedy,battling butler (1926) comedy


## Exploraing User data

In [157]:
user_df = pd.read_csv(data_dir/'rating.csv', usecols=['userId','movieId','rating'])

In [158]:
user_df.userId.max(), user_df.userId.min(), user_df.userId.nunique()

(138493, 1, 138493)

In [159]:
# this columns are using too much precision for very low values, lowering the datatype precision
user_df['movieId'] = user_df['movieId'].astype('int32')
user_df['userId'] = user_df['userId'].astype('int32')
user_df['rating'] = user_df['rating'].astype('float32')
print()




In [160]:
user_df.userId.max(), user_df.userId.min(), user_df.userId.nunique()

(138493, 1, 138493)

In [161]:
user_df.shape #(20000263,3)

(20000263, 3)

In [162]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int32  
 1   movieId  int32  
 2   rating   float32
dtypes: float32(1), int32(2)
memory usage: 228.9 MB


In [163]:
user_df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [164]:
def clean_text(x: str) -> str:
    x = re.sub(r'[^\w\s]', '', x)  # Remove punctuation
    x = x.lower()  # Convert to lowercase
    return x    

In [165]:
class Vocabulary:
    def __init__(self, special_tokens: List[str]):
        self.word_to_idx = {}
        self.idx_to_word = {}
        self.counter = Counter()
        
        self.UNK_TOKEN = '<UNK>'
        self.UNK = 1
        self.PAD_TOKEN = '<PAD>'
        self.PAD = 0
        
        self.word_to_idx[self.UNK_TOKEN] = self.UNK
        self.idx_to_word[self.UNK] = self.UNK_TOKEN

        self.word_to_idx[self.PAD_TOKEN] = self.PAD
        self.idx_to_word[self.PAD] = self.PAD_TOKEN

        self.vocab_size = 2
        for idx, token in enumerate(special_tokens, start=2):
            self.word_to_idx[token] = idx
            self.idx_to_word[idx] = token
            self.vocab_size += 1

    def build_vocab(self, tokenized_data, max_tokens, min_freq):
        self.counter = Counter()
        for words in tokenized_data:
            self.counter.update(words)

        if max_tokens is not None:
            sorted_tokens = [word for word, _ in self.counter.most_common()]
            for word in sorted_tokens:
                if word not in self.word_to_idx:
                    self.add_word_to_vocab(word)

                if self.vocab_size == max_tokens - 1:
                    break
        else:
            for word, freq in self.counter.items():
                if freq >= min_freq and word not in self.word_to_idx:
                    self.add_word_to_vocab(word)

    def add_word_to_vocab(self, word):
        self.word_to_idx[word] = self.vocab_size
        self.idx_to_word[self.vocab_size] = word
        self.vocab_size += 1

    def __len__(self):
        return self.vocab_size

In [166]:


class IntegerVectorizer:
    def __init__(self, 
                 tokenizer: Callable[[str], List[str]] = None,
                 preprocessing_func: Callable[[str], str] = None,
                 max_tokens=None,
                 min_freq=1,
                 special_tokens: List[str] = None,
                 max_seq_length=None,
                 pad_to_max=False):
        
        self.min_freq = min_freq
        self.max_tokens = max_tokens
        self.max_seq_length = max_seq_length
        self.tokenizer = tokenizer
        self.preprocessing_func = preprocessing_func
        self.reserved_tokens = ['<UNK>', '<PAD>']
        self.special_tokens = [token for token in special_tokens if token not in self.reserved_tokens] if special_tokens else []
        self.pad_to_max = pad_to_max  # Store the argument

        self.vocab = Vocabulary(self.special_tokens)
        self.tokenized_data = []

    def adapt(self, data):
        self.tokenized_data = self.tokenize_data_generator(data)
        self.vocab.build_vocab(self.tokenized_data, self.max_tokens, self.min_freq)
        print('Vocab size:', len(self.vocab))

    def __call__(self, data, reverse=False):
        if reverse:
            return self.reverse_transform(data)
        else:
            return self.transform(data)

    def preprocess_sentence(self, sentence):
        if self.preprocessing_func:
            words = sentence.split()
            preprocessed_words = [self.preprocessing_func(word) if word not in self.special_tokens else word for word in words]
            return " ".join(preprocessed_words)
        return sentence

    def tokenize_data_generator(self, data):
        for sentence in data:
            sentence = self.preprocess_sentence(sentence)
            words = self.tokenizer(sentence) if self.tokenizer else sentence.split()
            yield words

    def transform(self, data):
        self.tokenized_data = self.tokenize_data_generator(data)
        vectorized_data = []
        for sentence in self.tokenized_data:
            vectorized_sentence = [self.vocab.word_to_idx.get(word, self.vocab.UNK) for word in sentence]
            vectorized_sentence = self.adjust_sequence_length(vectorized_sentence)
            vectorized_data.append(vectorized_sentence)
        return vectorized_data

    def adjust_sequence_length(self, sequence):
        if self.max_seq_length is not None:
            if len(sequence) < self.max_seq_length:
                if self.pad_to_max:  # Check the new argument
                    sequence += [self.vocab.PAD] * (self.max_seq_length - len(sequence))
            elif len(sequence) > self.max_seq_length:
                sequence = sequence[:self.max_seq_length]
        return sequence


    def reverse_transform(self, vectorized_data: List[List[int]]) -> List[str]:
        original_data = []
        for vector in vectorized_data:
            sentence = [self.vocab.idx_to_word[idx] for idx in vector if idx != self.vocab.PAD]
            original_data.append(" ".join(sentence))
        return original_data


In [167]:
data = [
    "This is a sample sentence. <UNK>",
    "<START> Another example sentence. <END>",
    "<START> This is another sentence. <END>",
    '3 3 3 3 3 3 '
]
min_freq = 2
max_tokens = None
special_tokens = ['<START>', '<END>', '<UNK>','<PAD>']


vectorizer = IntegerVectorizer(max_seq_length=10, pad_to_max=False)
vectorizer.adapt(data)
a = vectorizer(data)
b = vectorizer.reverse_transform(a)


print("Original data:", data)
print("Vectorized data:", a)
print('reverse vec: ', b)

Vocab size: 13
Original data: ['This is a sample sentence. <UNK>', '<START> Another example sentence. <END>', '<START> This is another sentence. <END>', '3 3 3 3 3 3 ']
Vectorized data: [[2, 3, 4, 5, 6, 1], [7, 8, 9, 6, 10], [7, 2, 3, 11, 6, 10], [12, 12, 12, 12, 12, 12]]
reverse vec:  ['This is a sample sentence. <UNK>', '<START> Another example sentence. <END>', '<START> This is another sentence. <END>', '3 3 3 3 3 3']


In [168]:
#adapt to data
vectorizer = IntegerVectorizer(max_seq_length=15, pad_to_max=True)
vectorizer.adapt(movie_df['tags'].tolist())

Vocab size: 29464


In [169]:
movie_df['tagvector'] = movie_df['tags'].apply(lambda x : vectorizer.transform([x])[0])

In [170]:
movie_df.head()

Unnamed: 0,movieId,title,genres,tags,tagvector
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story (1995) adventure animation children ...,"[2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,jumanji (1995) adventure children fantasy,"[10, 4, 5, 7, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,3,Grumpier Old Men (1995),Comedy|Romance,grumpier old men (1995) comedy romance,"[11, 12, 13, 4, 8, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,waiting to exhale (1995) comedy drama romance,"[15, 16, 17, 4, 8, 18, 14, 0, 0, 0, 0, 0, 0, 0..."
4,5,Father of the Bride Part II (1995),Comedy,father of the bride part ii (1995) comedy,"[19, 20, 21, 22, 23, 24, 4, 8, 0, 0, 0, 0, 0, ..."


In [171]:
import sys
def sizeof_fmt(num, suffix="B"):
    
    """by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified"""
    for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, "Yi", suffix)
    

for name, size in sorted(
    ((name, sys.getsizeof(value)) for name, value in locals().items()),
    key=lambda x: -x[1],
)[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                       user_df: 228.9 MiB
                      movie_df: 11.6 MiB
            title_value_counts:  2.5 MiB
                           _47:  6.0 KiB
                          _156:  5.8 KiB
                           _17:  5.7 KiB
                           _60:  3.7 KiB
                          _i28:  3.0 KiB
                          _i55:  3.0 KiB
                         _i166:  3.0 KiB


In [172]:
user_df = user_df.merge(movie_df[['movieId','tagvector']], on='movieId')

In [173]:
user_df.sample(10)

Unnamed: 0,userId,movieId,rating,tagvector
2697298,29690,908,4.0,"[1042, 174, 1831, 1362, 26, 5, 62, 14, 28, 0, ..."
333591,23485,296,4.5,"[669, 670, 137, 8, 27, 18, 28, 0, 0, 0, 0, 0, ..."
5156508,96952,11,3.0,"[36, 37, 21, 4, 8, 18, 14, 0, 0, 0, 0, 0, 0, 0..."
9118945,58170,6711,4.5,"[74, 156, 8729, 7897, 8, 18, 14, 0, 0, 0, 0, 0..."
6295194,559,1183,4.0,"[2257, 2258, 21, 159, 18, 14, 115, 0, 0, 0, 0,..."
11580390,110846,3147,4.5,"[702, 4665, 21, 2984, 27, 18, 0, 0, 0, 0, 0, 0..."
6591774,67294,2013,1.0,"[3268, 370, 21, 1734, 26, 5, 18, 0, 0, 0, 0, 0..."
9110410,55162,6659,2.5,"[8670, 1188, 8, 42, 65, 0, 0, 0, 0, 0, 0, 0, 0..."
3834561,82868,1257,4.0,"[2384, 1953, 2385, 1114, 8, 14, 0, 0, 0, 0, 0,..."
13164357,134233,23,4.0,"[63, 4, 26, 27, 28, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [174]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader


# Define a custom dataset class
class MovieRatingDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        user_id = self.dataframe.iloc[idx]['userId']
        movie_tags = torch.tensor(self.dataframe.iloc[idx]['tagvector'], dtype=torch.long)
        rating = self.dataframe.iloc[idx]['rating']        
        return user_id, movie_tags, rating


In [211]:
# Instantiate your custom dataset
dataset = MovieRatingDataset(user_df[['userId','tagvector','rating']])

# Set batch size for DataLoader
batch_size = 128

# Create a DataLoader using your custom dataset
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Iterate through the DataLoader during training
for batch in dataloader:
    user_ids, movie_tags, ratings = batch
    print("User IDs:", user_ids)
    print("Movie Tags:", movie_tags)
    print("Ratings:", ratings)
    break  # only print the first batch

User IDs: tensor([ 57218,  42600, 136332, 132179,  32677,  93323,  87079, 117121,  72910,
        133823,  81393,  77068,  56727,   4357,  92902,  11491,  55704,  98018,
        106922,  58922,  52779, 112940,  72334,  11179, 104669,  17646,  77451,
         42020, 108793,  49374,  81161, 103440,  23761, 117459, 132712,  75299,
         63237,  28677,  35244,  26923,  86894,  49422, 127719,  22103,  65461,
        132736,  67829, 120340, 107862, 121914,  20346,  55056,  92861,  95048,
         94536,  40209, 138406,  83090,   6662,  58402,  94957, 105620,  59079,
         68699,  68161, 134456,  95526, 103872,  49346,  41831, 100313, 118902,
         99555,   5104, 128690,  76929,   8320,  83382,  49514, 114266,  67181,
        131721,  15919, 137286,  96691, 131630,  81641,   7531, 119017,  55066,
         62751,   5138, 108587,  32441,  24073,  36473, 118256,  47194, 107232,
         42733, 137360,  53346, 131695,  43921, 105357,  78083, 122729,  22352,
         82202,  35662,  14267

# Model

In [212]:
import torch 
from torch import nn

In [213]:
class RecommenderModel(nn.Module):
    def __init__(self, num_users, num_tokens, embedding_dim):
        super(RecommenderModel, self).__init__()
        
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_tokens, embedding_dim)
        self.out = nn.Linear(embedding_dim,1)
        
    def forward(self, user_ids, movie_tags, debug=False):
        user_ids = user_ids.to(torch.long)  # Convert to Long data type
        movie_tags = movie_tags.to(torch.long)  # Convert to Long data type

        user_emb = self.user_embedding(user_ids)
        movie_emb = self.movie_embedding(movie_tags)
        interaction = user_emb * movie_emb
        x = interaction.mean(dim=1)
        output = self.out(x)

        if debug:
            print('user_emb.shape: ',user_emb.shape)
            print('movie_emb.shape: ',movie_emb.shape)
            print('interaction.shape: ',interaction.shape)
            print('output.shape:',output.shape)

            

        return output

In [214]:
model = RecommenderModel(10, 20, 8)
model

RecommenderModel(
  (user_embedding): Embedding(10, 8)
  (movie_embedding): Embedding(20, 8)
  (out): Linear(in_features=8, out_features=1, bias=True)
)

In [217]:
test_out = model(torch.randint(1,10,(8,1)), torch.randint(1,20,(8,5)))
test_out.shape

torch.Size([8, 1])

In [218]:
user_df.userId.nunique(), len(vectorizer.vocab.word_to_idx)

(138493, 29464)

In [236]:
import torch
import torch.optim as optim
import torch.nn.functional as F

# Initialize your model, optimizer, and loss function
num_users = 138493  # Replace with your actual number of users
num_tokens = 29464 # Replace with your actual number of tokens
dim = 16  # Replace with your desired embedding dimension
model = RecommenderModel(num_users, num_tokens, dim)
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_function = nn.MSELoss()

# Assuming you have a DataLoader named 'dataloader' from the previous example

# Training loop
num_epochs = 10  # Set the number of training epochs

def train_model(model, dataloader, num_epochs=10, learning_rate=0.001, device='cpu'):
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_function = nn.MSELoss()

    for epoch in range(num_epochs):
        total_loss = 0.0
        correct_predictions = 0
        total_samples = 0

        for batch in dataloader:
            user_ids, movie_tags, ratings = batch
            
            user_ids = user_ids.view(-1, 1)
            # print('userid shape: ',user_ids.shape)
            # print('movie_tags shape: ',movie_tags.shape)
            # print('ratings shape: ',ratings.shape)


            user_ids = user_ids.to(device)
            movie_tags = movie_tags.to(device)
            ratings = ratings.to(device)

            optimizer.zero_grad()
            
            outputs = model(user_ids, movie_tags).squeeze()
            # print('pred shape:', outputs.shape)

            loss = loss_function(outputs, ratings)
            
            loss.backward()
            optimizer.step()
            
            correct_predictions += torch.sum((outputs - ratings).abs() < 0.5).item()
            total_samples += len(ratings)
            total_loss += loss.item()

        average_loss = total_loss / len(dataloader)
        accuracy = correct_predictions / total_samples
        
        print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {average_loss:.4f} - Accuracy: {accuracy:.4f}")

def inference_model(model, user_ids, movie_tags, device='cpu'):
    model.to(device)
    user_ids = user_ids.view(-1, 1)
    user_ids = user_ids.to(device)
    movie_tags = movie_tags.to(device)
    outputs = model(user_ids, movie_tags)
    return outputs


In [237]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [238]:
train_model(model, dataloader, num_epochs=1, learning_rate=0.01, device='cpu')

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
