# This notebook is only for training and saving the model in a Google Colab notebook.

In [1]:
# Remove the notebook checkpoint directories from the working directory and from CoNaLa/.
# Note: `rmdir` only removes a directory that is empty.
!rmdir .ipynb_checkpoints/
!rmdir CoNaLa/.ipynb_checkpoints/

In [2]:
import torch
import torch.nn as nn
import math

class InputEmbedding(nn.Module):
	"""Token-id to vector lookup whose output is multiplied by sqrt(d_model).

	Args:
		d_model: width of each embedding vector.
		vocab_size: number of rows in the embedding table.
	"""

	def __init__(self, d_model : int, vocab_size : int):
		super().__init__()
		self.d_model = d_model
		self.vocab_size = vocab_size
		# nn.Embedding(num_embeddings, embedding_dim)
		self.embedding = nn.Embedding(vocab_size, d_model)

	def forward(self, x):
		"""Look up the ids in `x` and scale the vectors by sqrt(d_model)."""
		scale = math.sqrt(self.d_model)
		vectors = self.embedding(x)
		return vectors * scale

class PositionalEncoding(nn.Module):
	"""Adds fixed sinusoidal position signals to a batch of embeddings.

	A `(max_length, d_model)` table is precomputed once: even columns hold
	`sin(pos * 10000^(-2i/d_model))` and odd columns the matching `cos`.

	Args:
		d_model: width of each embedding vector (even or odd).
		max_length: number of positions stored in the table.
	"""

	def __init__(self, d_model : int, max_length : int):
		super().__init__()
		self.d_model = d_model
		self.max_length = max_length

		pe = torch.zeros(self.max_length, self.d_model)
		position = torch.arange(0,  self.max_length, dtype=torch.float).unsqueeze(1)

		# One frequency per even column index: exp(-(2i / d_model) * ln(10000)).
		div = torch.exp(-1 * (torch.arange(0, d_model, 2) / d_model) * math.log(10000))

		pe[:, 0::2] = torch.sin(position * div)
		# For odd d_model there is one fewer odd column than even columns, so the
		# frequency vector is trimmed; for even d_model the slice is a no-op.
		pe[:, 1::2] = torch.cos(position * div[: self.d_model // 2])

		self.register_buffer("pe", pe) # `pe` will be used but not be trained

	def forward(self, x):
		"""Return `x` plus the first `x.size(1)` rows of the table, broadcast over dim 0."""
		return x + self.pe[:x.size(1)].unsqueeze(0)

class MultiHeadAttention(nn.Module):
	"""Multi-head scaled dot-product attention.

	Args:
		d_model: embedding width; must be divisible by `num_heads`.
		num_heads: number of attention heads.
		max_length: stored on the module; not used in the computation here.
		dropout: dropout probability applied to the attention weights.

	Raises:
		AssertionError: if `d_model` is not divisible by `num_heads`.
	"""

	def __init__(self, d_model : int, num_heads : int, max_length : int, dropout : float):
		super().__init__()
		self.d_model = d_model
		self.num_heads = num_heads
		self.max_length = max_length
		self.dropout = nn.Dropout(p=dropout)

		assert d_model % num_heads == 0, "Model embedding dimension (d_model) must be divisible by the number of heads(num_heads)"
		self.d_k = self.d_model // self.num_heads

		self.query = nn.Linear(in_features=self.d_model, out_features=self.d_model)
		self.key = nn.Linear(in_features=self.d_model, out_features=self.d_model)
		self.value = nn.Linear(in_features=self.d_model, out_features=self.d_model)

		self.linear_f = nn.Linear(in_features=self.d_model, out_features=self.d_model)

	def forward(self, q, k, v, mask=None):
		"""Attend from `q` over `k`/`v`.

		Args:
			q: (batch, q_len, d_model) query source.
			k: (batch, k_len, d_model) key source.
			v: (batch, k_len, d_model) value source.
			mask: optional tensor; positions where it equals 0 are excluded.
				A 2-D mask is read as (batch, k_len), a 3-D mask as
				(batch, q_len, k_len); a 4-D mask is used as given and must
				broadcast against (batch, num_heads, q_len, k_len).

		Returns:
			(batch, q_len, d_model) tensor after the output projection.
		"""
		batch_size = q.shape[0]

		Q = self.query(q)
		K = self.key(k)
		V = self.value(v)

		# (batch, len, d_model) -> (batch, len, num_heads, d_k) -> (batch, num_heads, len, d_k)
		Qr = Q.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
		Kr = K.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
		Vr = V.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

		# (batch, heads, q_len, d_k) @ (batch, heads, d_k, k_len) -> (batch, heads, q_len, k_len)
		attention_scores = torch.matmul(Qr, Kr.transpose(-2, -1)) / math.sqrt(self.d_k)

		if mask is not None:
			if mask.dim() == 2:
				# (batch, k_len) -> (batch, 1, 1, k_len): same key mask for every head and query
				mask = mask.unsqueeze(dim=1).unsqueeze(dim=2)
			elif mask.dim() == 3:
				# (batch, q_len, k_len) -> (batch, 1, q_len, k_len): shared across heads
				mask = mask.unsqueeze(dim=1)
			# Out-of-place fill; -1e9 drives the softmax weight of masked keys to zero.
			attention_scores = attention_scores.masked_fill(mask == 0, -1e9)

		attention_weights = torch.softmax(input=attention_scores, dim=-1)
		attention_weights = self.dropout(attention_weights)

		# (batch, heads, q_len, k_len) @ (batch, heads, k_len, d_k) -> (batch, heads, q_len, d_k)
		context = torch.matmul(attention_weights, Vr)

		# Swap heads and sequence back (dims 1 and 2) before flattening the heads;
		# swapping the last two dims instead would mix features across positions.
		context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)

		return self.linear_f(context)

class LayerNormalization(nn.Module):
	"""Layer normalization over the last dimension with a learnable scale and shift.

	Args:
		d_model: size of the last dimension of the inputs.
		eps: constant added to the variance for numerical stability.
	"""

	def __init__(self, d_model : int, eps = 1e-5):
		super().__init__()
		self.eps = eps
		self.d_model = d_model

		self.gamma = nn.Parameter(torch.ones(self.d_model))
		self.beta = nn.Parameter(torch.zeros(self.d_model))

	def forward(self, x):
		"""Return `gamma * (x - mean) / sqrt(var + eps) + beta`, statistics taken over dim -1."""
		mean = x.mean(dim=-1, keepdim=True)
		# Biased (population) variance, as in standard layer normalization; it also
		# stays finite when the last dimension has a single element.
		var = x.var(dim=-1, keepdim=True, unbiased=False)

		# Divide by the standard deviation: sqrt(var + eps), not sqrt(var ** 2 + eps).
		result = (x - mean) / torch.sqrt(var + self.eps)

		return self.gamma * result + self.beta

class FeedForwardNetwork(nn.Module):
	"""Two linear layers (d_model -> d_ff -> d_model) with ReLU and dropout in between.

	Args:
		d_model: input and output width.
		d_ff: width of the hidden layer.
		dropout: dropout probability applied after the ReLU.
	"""

	def __init__(self, d_model : int,  d_ff : int, dropout : float):
		super().__init__()
		self.d_model = d_model
		self.d_ff = d_ff

		self.dropout = nn.Dropout(p=dropout)
		self.linear_1 = nn.Linear(in_features=d_model, out_features=d_ff)
		self.relu = nn.ReLU()
		self.linear_2 = nn.Linear(in_features=d_ff, out_features=d_model)

	def forward(self, x):
		"""Apply linear_1 -> ReLU -> dropout -> linear_2 to `x`."""
		hidden = self.linear_1(x)
		hidden = self.relu(hidden)
		hidden = self.dropout(hidden)
		return self.linear_2(hidden)

class Encoder(nn.Module):
	"""One encoder layer: self-attention and a feed-forward network, each followed by
	dropout, a residual connection and layer normalization.

	Args:
		d_model: embedding width.
		num_heads: number of attention heads.
		d_ff: hidden width of the feed-forward network.
		dropout: dropout probability used in every sub-module.
		max_length: forwarded to the attention module.
	"""

	def __init__(self, d_model : int, num_heads : int, d_ff : int, dropout : float, max_length : int):
		super().__init__()
		self.self_attention = MultiHeadAttention(
			d_model=d_model,
			num_heads=num_heads,
			max_length=max_length,
			dropout=dropout,
		)
		self.norm1 = LayerNormalization(d_model=d_model)
		self.feed_forward_network = FeedForwardNetwork(
			d_model=d_model,
			d_ff=d_ff,
			dropout=dropout,
		)
		self.norm2 = LayerNormalization(d_model=d_model)
		self.dropout = nn.Dropout(p=dropout)

	def forward(self, x, encoder_mask=None):
		"""Run `x` through the layer; `encoder_mask` is passed to self-attention as `mask`."""
		# Sub-layer 1: self-attention, then residual + norm.
		attended = self.self_attention(q=x, k=x, v=x, mask=encoder_mask)
		x = self.norm1(x + self.dropout(attended))

		# Sub-layer 2: feed-forward, then residual + norm.
		transformed = self.feed_forward_network(x)
		return self.norm2(x + self.dropout(transformed))

class Decoder(nn.Module):
	"""One decoder layer: self-attention, cross-attention over the encoder output and a
	feed-forward network, each followed by dropout, a residual connection and
	layer normalization.

	Args:
		d_model: embedding width.
		num_heads: number of attention heads.
		d_ff: hidden width of the feed-forward network.
		dropout: dropout probability used in every sub-module.
		max_length: forwarded to the attention modules.
	"""

	def __init__(self, d_model : int, num_heads : int, d_ff : int, dropout : float, max_length : int):
		super().__init__()
		self.self_attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads, max_length=max_length, dropout=dropout)
		self.norm1 = LayerNormalization(d_model=d_model)
		self.cross_attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads, max_length=max_length, dropout=dropout)
		self.norm2 = LayerNormalization(d_model=d_model)
		self.feed_forward_network = FeedForwardNetwork(d_model=d_model, d_ff=d_ff, dropout=dropout)
		self.norm3 = LayerNormalization(d_model=d_model)
		self.dropout = nn.Dropout(p=dropout)

	def forward(self, decoder_input, encoder_output, encoder_mask=None, decoder_mask=None):
		"""Run one decoder layer.

		Args:
			decoder_input: decoder-side activations.
			encoder_output: encoder activations used as keys and values in cross-attention.
			encoder_mask: passed as `mask` to cross-attention.
			decoder_mask: passed as `mask` to self-attention.

		Returns:
			The decoder activations after the three sub-layers.
		"""
		# NOTE(review): no causal (look-ahead) mask is built here; self-attention only
		# sees whatever `decoder_mask` the caller supplies — confirm that is intended.
		masked_attention_output = self.self_attention(
			q=decoder_input,
			k=decoder_input,
			v=decoder_input,
			mask=decoder_mask
		)
		decoder_input = decoder_input + self.dropout(masked_attention_output)
		decoder_input = self.norm1(decoder_input)

		# Cross-attention: the decoder queries the encoder output.
		cross_attention_output = self.cross_attention(
			q=decoder_input,
			k=encoder_output,
			v=encoder_output,
			mask=encoder_mask
		)
		decoder_input = decoder_input + self.dropout(cross_attention_output)
		decoder_input = self.norm2(decoder_input)

		# Feed-forward sub-layer.
		feed_forward_output = self.feed_forward_network(decoder_input)
		decoder_input = decoder_input + self.dropout(feed_forward_output)
		# Apply the third normalization, which was constructed but previously never used,
		# so the layer output is normalized the same way as the encoder's.
		decoder_input = self.norm3(decoder_input)

		return decoder_input

class Transformer(nn.Module):
	"""Encoder-decoder model: two embedding tables, a shared positional encoding,
	`N` encoder layers, `N` decoder layers and a linear projection to the decoder vocabulary.

	Args:
		d_model: embedding width.
		encoder_vocab_size: rows in the encoder embedding table.
		decoder_vocab_size: rows in the decoder embedding table and width of the projection.
		max_length: number of positions in the positional-encoding table.
		num_heads: attention heads per layer.
		d_ff: hidden width of each feed-forward network.
		dropout: dropout probability passed to every layer.
		N: number of encoder layers and of decoder layers.
	"""

	def __init__(self, d_model : int, encoder_vocab_size : int, decoder_vocab_size : int, max_length : int, num_heads : int, d_ff : int, dropout : float, N : int):
		super().__init__()
		# Keyword arguments shared by every encoder and decoder layer.
		layer_kwargs = dict(
			d_model=d_model,
			num_heads=num_heads,
			d_ff=d_ff,
			dropout=dropout,
			max_length=max_length,
		)

		self.encoder_embeddings = InputEmbedding(d_model=d_model, vocab_size=encoder_vocab_size)
		self.decoder_embedding = InputEmbedding(d_model=d_model, vocab_size=decoder_vocab_size)
		self.positional_encoding = PositionalEncoding(d_model=d_model, max_length=max_length)

		encoder_stack = []
		for _ in range(N):
			encoder_stack.append(Encoder(**layer_kwargs))
		self.encoder_layers = nn.ModuleList(encoder_stack)

		decoder_stack = []
		for _ in range(N):
			decoder_stack.append(Decoder(**layer_kwargs))
		self.decoder_layers = nn.ModuleList(decoder_stack)

		self.linear_projection =  nn.Linear(in_features=d_model, out_features=decoder_vocab_size)

	def encode(self, encoder_input, encoder_mask):
		"""Embed `encoder_input`, add positions and run every encoder layer in order."""
		hidden = self.positional_encoding(self.encoder_embeddings(encoder_input))
		for block in self.encoder_layers:
			hidden = block(x=hidden, encoder_mask=encoder_mask)
		return hidden

	def decode(self, encoder_output, decoder_input, encoder_mask, decoder_mask):
		"""Embed `decoder_input`, add positions and run every decoder layer in order."""
		hidden = self.positional_encoding(self.decoder_embedding(decoder_input))
		for block in self.decoder_layers:
			hidden = block(
				decoder_input=hidden,
				encoder_output=encoder_output,
				encoder_mask=encoder_mask,
				decoder_mask=decoder_mask,
			)
		return hidden

	def projection(self, x):
		"""Project decoder activations to decoder-vocabulary logits."""
		return self.linear_projection(x)

	def forward(self, src, src_mask, tgt, tgt_mask):
		"""Encode `src`, decode `tgt` against it and return the projected logits."""
		memory = self.encode(encoder_input=src, encoder_mask=src_mask)
		decoded = self.decode(
			encoder_output=memory,
			decoder_input=tgt,
			encoder_mask=src_mask,
			decoder_mask=tgt_mask,
		)
		return self.projection(decoded)

In [3]:
from transformers import RobertaTokenizer
import torch

def build_transformer(config):
	"""Instantiate a `Transformer` from the hyper-parameters stored in `config`.

	Args:
		config: mapping that provides the keys "d_model", "encoder_vocab_size",
			"decoder_vocab_size", "max_length", "num_heads", "d_ff", "dropout" and "N".

	Returns:
		The newly constructed `Transformer`.

	Raises:
		KeyError: if one of the keys above is missing from `config`.
	"""
	required_keys = (
		"d_model",
		"encoder_vocab_size",
		"decoder_vocab_size",
		"max_length",
		"num_heads",
		"d_ff",
		"dropout",
		"N",
	)
	# Each config key is also the name of the matching constructor argument.
	constructor_kwargs = {key: config[key] for key in required_keys}
	return Transformer(**constructor_kwargs)

def build_tokenizer():
	"""Load the "FacebookAI/roberta-base" tokenizer and register three extra special tokens.

	Returns:
		The `RobertaTokenizer` with "<START>", "<END>" and "<PAD>" added as
		additional special tokens.
	"""
	# NOTE(review): adding tokens may grow the vocabulary beyond the vocab sizes
	# hard-coded in get_config() — confirm the embedding tables are large enough.
	extra_tokens = ["<START>", "<END>", "<PAD>"]

	tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
	tokenizer.add_special_tokens({"additional_special_tokens" : extra_tokens})
	return tokenizer

In [4]:
def get_config():
	"""Return a fresh dict holding the model and training hyper-parameters."""
	# Settings consumed when the model is constructed.
	model_settings = {
		"d_model" : 512,
		"encoder_vocab_size" : 50265,
		"decoder_vocab_size" : 50265,
		"max_length" : 200,
		"num_heads" : 4,
		"d_ff" : 2048,
		"dropout" : 0.1,
		"N" : 8,
	}
	training_settings = {"epochs" : 3}
	return {**model_settings, **training_settings}

In [21]:
import torch
import torch.nn as nn
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
from torch.nn.utils import clip_grad_norm_

def train_model(config, writer, m=None):
	"""Train the transformer and save a checkpoint whenever the epoch's training loss improves.

	Args:
		config: hyper-parameter mapping passed to `build_transformer`; when `None`
			the values from `get_config()` are used.
		writer: TensorBoard `SummaryWriter`; it is closed when training finishes.
		m: optional path of a checkpoint written by this function. When given, the
			model and optimizer state are restored and training resumes at the
			epoch after the stored one.

	Returns:
		The trained model.
	"""
	import math
	import os

	from dataset import load_dataset, create_dataloaders

	# Set up the device
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	print(f"Using {device} for training.")
	# Set up the tokenizer
	tokenizer = build_tokenizer()
	# Honor the caller's config instead of unconditionally replacing it.
	if config is None:
		config = get_config()
	# Load the datasets
	train_data, test_data, validation_data = load_dataset()

	# Only the training loader is used in this function.
	train_dataloader, _test_dataloader, _valid_dataloader = create_dataloaders(
		train_data=train_data,
		test_data=test_data,
		validation_data=validation_data
	)

	model = build_transformer(config)
	model.to(device)

	# Define the loss function and optimizer
	loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id, label_smoothing=0.1).to(device)
	optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5, betas=(0.9, 0.98), eps=1e-8)

	# NOTE(review): the LR schedule spans 100 epochs while the loop runs at most 50,
	# so the one-cycle schedule never completes — confirm which value is intended.
	schedule_epochs = 100
	max_epochs = 50

	start_epoch = 0
	best_train_loss = float('inf')
	if m is not None:
		# map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
		data = torch.load(f=m, map_location=device)
		model.load_state_dict(data["model_state_dict"])
		optimizer.load_state_dict(data["optimizer_state_dict"])
		# Resume after the stored epoch and keep comparing against the stored loss.
		start_epoch = data["epoch"] + 1
		saved_loss = data.get("train_loss")
		if saved_loss is not None:
			best_train_loss = float(saved_loss)

	# The schedule starts from step 0 even when resuming from a checkpoint.
	scheduler = torch.optim.lr_scheduler.OneCycleLR(
		optimizer,
		max_lr=2e-5,
		epochs=schedule_epochs,
		steps_per_epoch=len(train_dataloader),
		pct_start=0.1
	)

	early_stopping_patience = 3
	early_stopping_counter = 0

	for epoch in range(start_epoch, max_epochs):
		batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch: {epoch}")
		model.train()
		running_loss = 0.0
		step = 0
		for batch in batch_iterator:
			encoder_input = batch["input_ids"].to(device)
			encoder_mask = batch["attention_mask"].to(device)
			decoder_input = batch["decoder_input_ids"].to(device)
			labels = batch["labels"].to(device)
			decoder_mask = batch["decoder_attention_mask"].to(device)

			# Do the forward pass
			preds = model(
				src=encoder_input,
				src_mask=encoder_mask,
				tgt=decoder_input,
				tgt_mask=decoder_mask
			)

			loss = loss_fn(preds.view(-1, preds.size(-1)), labels.view(-1))
			loss.backward()

			optimizer.step()
			# OneCycleLR is stepped once per batch and takes no metric argument.
			scheduler.step()
			optimizer.zero_grad()

			# Accumulate a Python float so no autograd history is kept alive.
			loss_value = loss.item()
			step += 1
			running_loss += loss_value

			batch_iterator.set_postfix({
				"Loss": f"{loss_value:.3f}",
				"LR": f"{scheduler.get_last_lr()[0]:.2e}"
			})

		train_final_loss = running_loss / step
		# TODO: writer.add_scalar("Training/Loss", train_final_loss, epoch)

		print(f"\nEpoch: {epoch} | Train Loss: {train_final_loss}")

		if train_final_loss < best_train_loss:
			best_train_loss = train_final_loss
			early_stopping_counter = 0
			loss_for_model_name = math.floor(best_train_loss * 100)
			model_name = f"me{epoch}l{loss_for_model_name}.pth"
			# Make sure the target directory exists before saving.
			os.makedirs("models", exist_ok=True)
			torch.save(
				obj={
					"model_state_dict": model.state_dict(),
					"optimizer_state_dict": optimizer.state_dict(),
					"epoch" : epoch,
					"train_loss" : train_final_loss
				},
				f=f"models/{model_name}"
			)
		else:
			early_stopping_counter += 1
			# Stop once the loss has failed to improve for `early_stopping_patience` epochs in a row.
			if early_stopping_counter >= early_stopping_patience:
				print(f"Early stopping triggered after {epoch+1} epochs")
				break

		torch.cuda.empty_cache()
	writer.close()
	return model

In [22]:
from pathlib import Path

In [27]:
# Build the hyper-parameter dict, a TensorBoard writer and the path of a saved checkpoint.
cfg = get_config()
writer = SummaryWriter()
m = Path("/content/models/me14l177.pth")

In [28]:
cfg

{'d_model': 512,
 'encoder_vocab_size': 50265,
 'decoder_vocab_size': 50265,
 'max_length': 200,
 'num_heads': 4,
 'd_ff': 2048,
 'dropout': 0.1,
 'N': 8,
 'epochs': 3}

In [29]:
writer

<torch.utils.tensorboard.writer.SummaryWriter at 0x7f173e2753d0>

In [30]:
m

PosixPath('/content/models/me14l177.pth')

In [34]:
# Ask PyTorch to release its cached, currently unused GPU memory before training.
torch.cuda.empty_cache()

In [35]:
# Start training; m=None is passed, so the checkpoint path `m` defined earlier is not used here.
train_model(cfg, writer, m=None)

Using cuda for training.


Processing Epoch: 0:   0%|          | 0/348 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 38.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 35.06 MiB is free. Process 9531 has 14.71 GiB memory in use. Of the allocated memory 14.33 GiB is allocated by PyTorch, and 258.80 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)