# GRU4REC Model: Preprocessing + Modelling



In [None]:
!pip install

ERROR: You must give at least one requirement to install (see "pip help install")

## Import Library

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import gdown
import joblib
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam

## Angkut berkas dan ubah menjadi sequence yang tepat

In [None]:
def download_drive_file(file_id, output_file):
    """Download a publicly shared Google Drive file with gdown.

    Failures are reported on stdout instead of being raised, so a notebook
    run keeps going even when the download does not succeed.

    Args:
        file_id: Google Drive file identifier, inserted into the download URL.
        output_file: Local path the file is written to.
    """
    url = f'https://drive.google.com/uc?id={file_id}'
    try:
        saved_path = gdown.download(url, output_file, quiet=False)
    except Exception as e:
        # Deliberate best-effort: report and carry on.
        print(f"An error occurred while downloading the file: {e}")
        return

    # Some gdown releases signal a failed download by returning None instead
    # of raising, so only announce success when a path actually came back.
    if saved_path is None:
        print(f"An error occurred while downloading the file: nothing was saved to {output_file}")
        return

    print(f"File downloaded successfully: {output_file}")

# download the cleaned dataset
# download_drive_file('1LN4teEXs1tQ2nXgrKoRrZqVr9DVftUJh', 'sessions.csv')

In [None]:
# sessions = pd.read_csv('sessions.csv')
# sessions.head()

In [None]:
# Fetch the serialized label encoder and load it.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts that come from a trusted source.
download_drive_file('1dMKbq9sawiAVH9Z4Nh2c2ZsZ3nle1-1Q','label_encoder.joblib')
label_encoder = joblib.load('label_encoder.joblib')

Downloading...
From: https://drive.google.com/uc?id=1dMKbq9sawiAVH9Z4Nh2c2ZsZ3nle1-1Q
To: /content/label_encoder.joblib
100%|██████████| 10.0M/10.0M [00:00<00:00, 37.9MB/s]


File downloaded successfully: label_encoder.joblib


In [None]:
# Fetch the sequences table and read it into a DataFrame.
download_drive_file('1i-0mMl7B0ey-hHUHujGoCiclesJtrEM7', 'sequences.csv')
sequences_a = pd.read_csv('sequences.csv')
# `sequences` is a second name bound to the same DataFrame object (no copy is made).
sequences = sequences_a
sequences.head()

Downloading...
From: https://drive.google.com/uc?id=1i-0mMl7B0ey-hHUHujGoCiclesJtrEM7
To: /content/sequences.csv
100%|██████████| 58.9M/58.9M [00:01<00:00, 44.3MB/s]


File downloaded successfully: sequences.csv


Unnamed: 0,index,sequence
0,0,"[489206, 489237, 142933, 188025, 188028]"
1,1,"[133985, 138876, 315348]"
2,2,"[19817, 129680, 129680, 20979]"
3,3,"[443147, 12620, 12621]"
4,4,"[55082, 55082, 57935, 359096, 139532]"


In [None]:
# Rebuild full item sequences (previous items followed by the next item) from the raw sessions table

import re

def create_full_sequence(df: pd.DataFrame) -> list:
    """Build one full item sequence per row: the previous items followed by the next item.

    Args:
        df: Frame with a 'prev_items' column holding stringified item lists
            (brackets, quotes and newlines are stripped, then the text is
            split on whitespace) and a 'next_item' column.

    Returns:
        A list with one list of item strings per row of ``df``, in row order.
    """
    # Strip list punctuation and split every row in one vectorized pass.
    prev_items = df['prev_items'].str.replace(r"[\[\]'\n\"]", '', regex=True).str.split()
    next_items = df['next_item'].astype(str)

    # Pair rows by position rather than by index label: adding two Series
    # aligns on labels, so a frame with a non-default index (e.g. after
    # filtering) would pair the wrong rows or produce missing values.
    return [prev + [nxt] for prev, nxt in zip(prev_items, next_items)]


In [None]:
# sequences = create_full_sequence(sessions)

In [None]:
from joblib import Parallel, delayed
import numpy as np
from tqdm import tqdm

# Precompute a label -> integer position dictionary from the encoder's classes,
# so each item can be translated with a plain dict lookup.
mapping = {item: idx for idx, item in enumerate(label_encoder.classes_)}

# Cut a sequence into consecutive slices for parallel processing
def chunker(seq, chunk_size):
    """Return a generator over consecutive slices of ``seq``.

    Every slice holds ``chunk_size`` elements except possibly the last one,
    which holds whatever remains.
    """
    starts = range(0, len(seq), chunk_size)
    return (seq[start:start + chunk_size] for start in starts)

# Worker executed for each chunk during parallel processing
def process_chunk(chunk, mapping):
    """Translate every item of every sub-list in ``chunk`` through ``mapping``.

    An item that is missing from ``mapping`` raises ``KeyError``.
    """
    encoded = []
    for sublist in chunk:
        encoded.append([mapping[item] for item in sublist])
    return encoded

# Chunked, parallel label -> index translation with a progress bar
def parallel_transform(sequences, mapping, n_jobs=-1, chunk_size=1000):
    """Translate all sequences through ``mapping`` using joblib workers.

    The sequences are cut into chunks of ``chunk_size``, each chunk is handed
    to ``process_chunk`` as a separate joblib task, and the per-chunk results
    are concatenated back into a single list in the original order.
    """
    pieces = list(chunker(sequences, chunk_size))
    tracked = tqdm(pieces, desc="Processing chunks", unit="chunk")
    tasks = (delayed(process_chunk)(piece, mapping) for piece in tracked)
    encoded_pieces = Parallel(n_jobs=n_jobs)(tasks)

    flattened = []
    for encoded in encoded_pieces:
        flattened.extend(encoded)
    return flattened


In [None]:
# transformed_seq = parallel_transform(sequences, mapping)
import ast
# Each 'sequence' cell is a list literal stored as text in the CSV; parse it
# back into a Python list (literal_eval only evaluates literals).
transformed_seq = [ast.literal_eval(seq) for seq in sequences['sequence']]

In [None]:
# Hold out 10% of the sequences, then split that hold-out in half into test and validation sets.
train_sequences, test_sequences = train_test_split(transformed_seq, test_size=0.1, random_state=42)
test_sequences, val_sequences = train_test_split(test_sequences, test_size=0.5, random_state=42)
# Display one randomly chosen training sequence as a sanity check.
np.random.choice(np.array(train_sequences, dtype=object))

[334229,
 187696,
 224931,
 224971,
 208412,
 224931,
 221934,
 221932,
 208412,
 101219,
 118212,
 56406]

## Dataset and Dataloader definition



In [None]:
class SessionDataset(Dataset):
    """Dataset that pairs each session's item-index tensor with its session ID."""

    def __init__(self, sessions, session_ids):
        # Convert every session to an int64 tensor once, up front.
        tensors = []
        for items in sessions:
            tensors.append(torch.tensor(items, dtype=torch.long))
        self.sessions = tensors
        # Kept next to the tensors so every sample can report its session ID.
        self.session_ids = session_ids

    def __len__(self):
        """Number of sessions held by the dataset."""
        return len(self.sessions)

    def __getitem__(self, idx):
        """Return the ``(session_tensor, session_id)`` pair stored at ``idx``."""
        session = self.sessions[idx]
        session_id = self.session_ids[idx]
        return session, session_id

def collate_fn(batch):
    """Collate ``(session_tensor, session_id)`` samples into a padded next-item batch.

    Sessions are right-padded with 0 to the longest session in the batch and
    then shifted by one step so that position ``t`` of the input is trained
    to predict position ``t`` of the target.

    Args:
        batch: Iterable of ``(1-D LongTensor of item indices, session_id)`` pairs.

    Returns:
        inputs (LongTensor): ``(batch, max_len - 1)`` — every item but the last.
        targets (LongTensor): ``(batch, max_len - 1)`` — every item but the first.
        mask (FloatTensor): ``(batch, max_len - 1)`` — 1.0 where the target is a
            real item, 0.0 where it is padding.
        session_ids (list): Session IDs in batch order.
    """
    sessions, session_ids = zip(*batch)  # Separate sequences and session IDs
    padded_sessions = pad_sequence(sessions, batch_first=True, padding_value=0)

    inputs = padded_sessions[:, :-1]   # All but last item as input
    targets = padded_sessions[:, 1:]   # All but first item as target

    # Build the mask from the true session lengths instead of `inputs != 0`:
    # the padding value 0 is also a legitimate item index, and the last real
    # input of a shorter session is paired with a padded target that must not
    # contribute to the loss. A session of length n has n - 1 valid targets.
    lengths = torch.tensor([len(session) for session in sessions], device=inputs.device)
    positions = torch.arange(inputs.size(1), device=inputs.device)
    mask = (positions.unsqueeze(0) < (lengths - 1).unsqueeze(1)).float()

    return inputs, targets, mask, list(session_ids)

## Model Definition

## 1. GRU Layer

In [None]:
import torch
import torch.nn as nn

class GRULayer(nn.Module):
    """Item embedding -> GRU -> linear scoring head, used by the GRU4REC model."""

    def __init__(self, input_size, hidden_size, output_size,
                 num_layers, batch_size, dropout_hidden_rate,
                 dropout_input_rate, optimizer=None, embedding_dim=32):
        """
        GRU Layer for the GRU4REC model.

        The module moves itself to CUDA when available, otherwise to CPU.

        Args:
            input_size (int): Number of distinct item indices the embedding accepts
            hidden_size (int): Hidden layer size
            output_size (int): Number of scores produced per position
            num_layers (int): Number of GRU layers
            batch_size (int): Default mini-batch size used by init_hidden()
            dropout_hidden_rate (float): Dropout rate for hidden layers
            dropout_input_rate (float): Dropout rate for input layer
            optimizer: Optional optimizer stored on the module; the layer
                itself never calls it
            embedding_dim (int): Width of the item embedding fed into the GRU
        """
        super().__init__()

        self.hidden_size = hidden_size
        self.input_size = input_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.dropout_hidden_rate = dropout_hidden_rate
        self.dropout_input_rate = dropout_input_rate
        self.optimizer = optimizer
        self.embedding_dim = embedding_dim

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Sub-modules are created in a fixed order so that seeded parameter
        # initialisation stays reproducible.
        # Dropout layers
        self.dropout_hidden = nn.Dropout(p=dropout_hidden_rate)
        self.dropout_input = nn.Dropout(p=dropout_input_rate)

        # Linear transformation from hidden state to output
        self.hidden_to_output = nn.Linear(hidden_size, output_size)

        # GRU layer; nn.GRU only applies dropout between stacked layers, so it
        # is disabled for a single layer.
        self.gru = nn.GRU(embedding_dim, hidden_size, num_layers,
                          dropout=dropout_hidden_rate if num_layers > 1 else 0,
                          batch_first=True)

        self.tanh = nn.Tanh()

        # Dense item embedding (replaces a one-hot input representation)
        self.embedding = nn.Embedding(input_size, embedding_dim)

        self.to(self.device)

    def forward(self, input, hidden):
        """
        Forward pass through the GRU layer.

        Args:
            input (Tensor): Input batch of item indices (batch_size, seq_len)
            hidden (Tensor): Hidden state tensor (num_layers, batch_size, hidden_size)

        Returns:
            logit (Tensor): Output predictions (batch_size, seq_len, output_size)
            hidden (Tensor): Updated hidden state
        """
        embedded = self.embedding(input)

        # Apply input dropout if in training mode
        if self.training and self.dropout_input_rate > 0:
            embedded = self.dropout_input(embedded)

        # batch_first=True: the GRU consumes (batch, seq_len, embedding_dim)
        output, hidden = self.gru(embedded, hidden)

        output = self.dropout_hidden(output)
        # NOTE(review): tanh bounds every score to [-1, 1]; confirm this is
        # intended when the scores are fed to a softmax-based loss.
        logit = self.tanh(self.hidden_to_output(output))

        return logit, hidden

    def one_hot_encode(self, input):
        """
        Convert input item indices into one-hot encoded vectors.

        Args:
            input (Tensor): Tensor of item indices (batch_size,)

        Returns:
            one_hot (Tensor): One-hot encoded representation of input (batch_size, output_size)

        Raises:
            ValueError: If an index is negative or not smaller than output_size
        """
        # Ensure input is long type for indexing
        input = input.long()

        # Allocate next to the indices so scatter_ never mixes devices
        one_hot = torch.zeros(input.shape[0], self.output_size, dtype=torch.float, device=input.device)

        # An empty batch has nothing to validate or scatter (max() would fail)
        if input.numel() == 0:
            return one_hot

        # Ensure index values are within valid range
        if input.max() >= self.output_size:
            raise ValueError(f"Index value {input.max()} exceeds output size {self.output_size}")
        if input.min() < 0:
            raise ValueError(f"Index value {input.min()} is negative")

        # One column index per row, then write a 1 at that column
        one_hot.scatter_(1, input.reshape(-1, 1), 1)

        return one_hot

    def init_hidden(self, batch_size=None):
        """
        Initialize the hidden state for the GRU.

        Args:
            batch_size (int, optional): Number of sequences in the batch;
                defaults to the batch size given at construction

        Returns:
            hidden (Tensor): Zero tensor of shape (num_layers, batch_size, hidden_size)
        """
        if batch_size is None:
            batch_size = self.batch_size
        return torch.zeros(self.num_layers, batch_size, self.hidden_size, device=self.device)


## GRU4REC Model

In [None]:

class GRU4REC:
    """Training and evaluation wrapper around a GRULayer for next-item prediction."""

    def __init__(self, input_size, hidden_size,
                 output_size, num_layers,
                 dropout_hidden_rate, dropout_input_rate, batch_size,
                 optimizer, loss):
        """
        Args:
            input_size (int): Number of distinct item indices accepted as input
            hidden_size (int): GRU hidden size
            output_size (int): Number of items scored at every position
            num_layers (int): Number of stacked GRU layers
            dropout_hidden_rate (float): Dropout rate for hidden layers
            dropout_input_rate (float): Dropout rate for input layer
            batch_size (int): Nominal mini-batch size
            optimizer: Optimizer for the GRU parameters; may be None and
                attached later through ``self.gru.optimizer``
            loss: Callable ``loss(logits, target)`` taking ``(N, C)`` scores
                and ``(N,)`` class indices
        """
        super().__init__()
        self.optimizer = optimizer
        self.loss = loss
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.gru = GRULayer(input_size=input_size,
                            hidden_size=hidden_size,
                            output_size=output_size,
                            num_layers=num_layers,
                            dropout_hidden_rate=dropout_hidden_rate,
                            dropout_input_rate=dropout_input_rate,
                            batch_size=batch_size,
                            optimizer=optimizer
                            )
        # Most recent (detached) hidden state; refreshed for every training batch
        self.hidden = torch.zeros(num_layers, batch_size, hidden_size).to(self.device)

    def _fresh_hidden(self, batch_size):
        """Return a zero hidden state sized for ``batch_size`` sequences."""
        return torch.zeros(self.gru.num_layers, batch_size, self.gru.hidden_size,
                           device=self.device)

    def _resolve_optimizer(self):
        """Return the optimizer to train with, preferring the one set on the GRU layer."""
        optimizer = self.gru.optimizer if self.gru.optimizer is not None else self.optimizer
        if optimizer is None:
            raise ValueError("No optimizer configured: set `gru.optimizer` or pass `optimizer` before training")
        return optimizer

    @staticmethod
    def _select_valid(logits, target, mask):
        """Flatten a batch and keep only the positions whose mask is non-zero.

        Returns:
            ``(valid_logits, valid_target)`` of shapes ``(N, C)`` and ``(N,)``.
        """
        keep = mask.reshape(-1) > 0
        flat_logits = logits.reshape(-1, logits.size(-1))
        flat_target = target.reshape(-1)
        return flat_logits[keep], flat_target[keep]

    def _compute_loss(self, logits, target):
        """Apply ``self.loss`` and reduce a per-element result to its mean."""
        loss = self.loss(logits, target)
        return loss.mean() if loss.dim() > 0 else loss

    def train(self, train_loader, val_loader, epochs, k=20):
        """
        Train the GRU layer and validate after every epoch.

        Args:
            train_loader: Iterable of ``(inputs, targets, mask, session_ids)`` batches
            val_loader: Iterable of validation batches in the same format
            epochs (int): Number of passes over ``train_loader``
            k (int): Cut-off used for the validation Hit@K / MRR@K

        Returns:
            train_losses (list): Mean training loss per epoch
            val_losses (list): Mean validation loss per epoch
            metric_history (dict): ``{"mrr_at_k": [...], "hit_at_k": [...]}``
                with one validation value per epoch

        Raises:
            ValueError: If no optimizer has been configured
        """
        optimizer = self._resolve_optimizer()
        train_losses = []
        val_losses = []
        metric_history = {
            "mrr_at_k": [],
            "hit_at_k": []
        }

        for epoch in range(epochs):
            epoch_loss = 0.0
            trained_batches = 0
            self.gru.train()
            progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)

            for inputs, target, mask, _session_ids in progress_bar:
                inputs, target, mask = inputs.to(self.device), target.to(self.device), mask.to(self.device)

                # Every sample is a complete session, so no state carries over
                # between batches; sizing the state from the batch also keeps
                # a smaller final batch from failing on a shape mismatch.
                self.hidden = self._fresh_hidden(inputs.size(0))

                # Forward pass
                logits, hidden = self.gru(inputs, self.hidden)
                self.hidden = hidden.detach()

                # Drop padded positions before the loss so they cannot
                # influence it, whatever reduction the loss uses.
                valid_logits, valid_target = self._select_valid(logits, target, mask)
                if valid_target.numel() == 0:
                    continue  # nothing to predict (e.g. only length-1 sessions)
                loss = self._compute_loss(valid_logits, valid_target)

                # Backpropagation
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                batch_loss = loss.item()
                epoch_loss += batch_loss
                trained_batches += 1

                # Update tqdm bar with loss
                progress_bar.set_postfix(loss=f"{batch_loss:.4f}")

            mean_train_loss = epoch_loss / trained_batches if trained_batches else float("nan")
            train_losses.append(mean_train_loss)

            # Validation
            val_loss, val_metrics = self.test(val_loader, k)
            val_losses.append(val_loss)

            # Append metrics
            for metric, value in val_metrics.items():
                metric_history.setdefault(metric, []).append(value)

            # Print metrics
            print(f"Epoch {epoch+1}/{epochs} - Train Loss: {mean_train_loss:.4f},Val Loss: {val_loss:.4f}, Metric: {val_metrics}")

        return train_losses, val_losses, metric_history

    def test(self, test_loader, k=10):
        """
        Evaluate the model on ``test_loader``.

        Args:
            test_loader: Iterable of ``(inputs, targets, mask, session_ids)`` batches
            k (int): Cut-off for Hit@K and MRR@K

        Returns:
            avg_loss (float): Mean loss over the batches that had at least one
                valid position (NaN when there were none)
            test_metrics (dict): ``{"mrr_at_k": ..., "hit_at_k": ...}`` as
                percentages averaged over all valid positions; both stay None
                when nothing could be evaluated
        """
        self.gru.eval()
        test_metrics = {
            "mrr_at_k": None,
            "hit_at_k": None
        }

        total_loss = 0.0
        scored_batches = 0
        hit_total = 0.0
        mrr_total = 0.0
        scored_positions = 0

        with torch.no_grad():
            for inputs, target, mask, _session_ids in tqdm(test_loader):
                inputs, target, mask = inputs.to(self.device), target.to(self.device), mask.to(self.device)

                # Initialize hidden state based on current batch size
                hidden = self._fresh_hidden(inputs.size(0))
                logits, _ = self.gru(inputs, hidden)

                valid_logits, valid_target = self._select_valid(logits, target, mask)
                n_valid = valid_target.numel()
                if n_valid == 0:
                    continue

                total_loss += self._compute_loss(valid_logits, valid_target).item()
                scored_batches += 1

                # evaluate() returns per-batch means; weight them by the
                # number of positions so the final figure is a global mean.
                hit, mrr = self.evaluate(valid_logits, valid_target, k)
                hit_total += hit * n_valid
                mrr_total += mrr * n_valid
                scored_positions += n_valid

        if scored_positions:
            test_metrics["hit_at_k"] = hit_total / scored_positions
            test_metrics["mrr_at_k"] = mrr_total / scored_positions

        avg_loss = total_loss / scored_batches if scored_batches else float("nan")
        return avg_loss, test_metrics

    def evaluate(self, pred, true, k):
        """
        Compute Hit@K and MRR@K for a batch of score rows.

        Args:
            pred (Tensor or array): Scores of shape ``(N, C)``; a single
                ``(C,)`` row is also accepted
            true (Tensor or array): Target class index for each row, ``(N,)``
            k (int): Number of top-scored classes to consider; values larger
                than ``C`` are clamped to ``C``

        Returns:
            (hit_at_k, mrr_at_k): Both as percentages (floats); ``(0.0, 0.0)``
            when there are no rows or ``k`` is not positive
        """
        pred = torch.as_tensor(pred)
        pred = pred.reshape(-1, pred.size(-1))
        true = torch.as_tensor(true, device=pred.device).reshape(-1)

        k = min(int(k), pred.size(-1))
        if pred.size(0) == 0 or k <= 0:
            return 0.0, 0.0

        # topk returns the k best classes per row, best first
        top_k = torch.topk(pred, k, dim=-1).indices
        matches = top_k == true.unsqueeze(-1)

        # Each class appears at most once per row, so a row has 0 or 1 match
        hit = matches.any(dim=-1).to(torch.float32)
        # Position of the match (0 when there is none; zeroed out by `hit`)
        position = matches.to(torch.int64).argmax(dim=-1)
        reciprocal_rank = hit / (position + 1).to(torch.float32)

        hit_at_k_score = hit.mean().item() * 100  # Convert to percentage
        mrr_at_k_score = reciprocal_rank.mean().item() * 100  # Convert to percentage

        return float(hit_at_k_score), float(mrr_at_k_score)


## Tes perulangan *training*

In [None]:
batch_size = 8

# Session IDs are consecutive integers; the validation IDs continue right after the training IDs.
train_session_ids = list(range(len(train_sequences)))
val_session_ids = list(range(len(train_sequences), len(train_sequences) + len(val_sequences)))


train_dataset = SessionDataset(sessions=train_sequences, session_ids=train_session_ids)
val_dataset = SessionDataset(sessions=val_sequences, session_ids=val_session_ids)

In [None]:
# Training batches are reshuffled every epoch; validation keeps its original order.
# collate_fn pads each batch and builds the shifted input/target pair.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
# The number of encoder classes is used both as the embedding input size and as the number of output scores.
output_size = label_encoder.classes_.shape[0]
print(output_size)

gru4rec_model = GRU4REC(
    input_size=label_encoder.classes_.shape[0],
    hidden_size=128,
    output_size=output_size,
    num_layers=2,
    dropout_hidden_rate=0.2,
    dropout_input_rate=0.2,
    batch_size=batch_size,
    optimizer=None,
    loss=nn.CrossEntropyLoss()
)

# The optimizer needs the model's parameters, so it is attached after construction.
gru4rec_model.gru.optimizer = Adam(gru4rec_model.gru.parameters(), lr=0.001)

# test train
train_loss, val_loss, test_metrics = gru4rec_model.train(train_loader, val_loader, epochs=20)


500180


  reciprocal_ranks = np.where(ranks > 0, 1 / ranks, 0)
 24%|██▍       | 3562/14778 [18:48<59:14,  3.16it/s]


IndexError: index 19 is out of bounds for axis 0 with size 16

In [None]:
# import gc
# del gru4rec_model
# gc.collect()
# torch.cuda.empty_cache()