### Imports

### Multi-Head Self-Attention

<center>
<img src="https://raw.githubusercontent.com/HosseinZaredar/Transformer-from-Scratch/main/SelfAttention.png" width="600" align="center"/>
</center>


In [5]:
class MHSelfAttention(nn.Module):
  def __init__(self, embed_dim, num_heads):
    super(MHSelfAttention, self).__init__()
    self.embed_dim = embed_dim
    self.num_heads = num_heads
    self.head_dim = embed_dim // num_heads

    assert (self.num_heads*self.head_dim == self.embed_dim), \
    'embed size must be divisible by number of heads'

    self.w_queries = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
    self.w_keys = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
    self.w_values = nn.Linear(self.embed_dim, self.embed_dim, bias=False)

    self.fc_out = nn.Linear(self.head_dim*self.num_heads , self.embed_dim)

  def forward(self, x):

    # shape of x = [batch_size, sentence_length, embedding_dim]
    batch_size = x.shape[0]
    sentence_len = x.shape[1]

    queries = self.w_queries(x).reshape(
        batch_size, sentence_len, self.num_heads, self.head_dim).permute(
            0, 2, 1, 3)

    keys = self.w_keys(x).reshape(
        batch_size, sentence_len, self.num_heads, self.head_dim).permute(
            0, 2, 3, 1)


    values = self.w_values(x).reshape(
        batch_size, sentence_len, self.num_heads, self.head_dim).permute(
            0, 2, 1, 3)

    attention_scores = T.einsum('bijk,bikl->bijl', queries, keys)
    attention_dist = T.softmax(attention_scores /
                               (self.embed_dim ** (1/2)), dim=-1)
    attention_out = T.einsum('bijk,bikl->bijl', attention_dist, values)
    concatenated_out = attention_out.permute(0, 2, 1, 3).reshape(
        batch_size, sentence_len, self.embed_dim)

    return concatenated_out

### Transformer Encoder

<center>
<img src="https://raw.githubusercontent.com/HosseinZaredar/Transformer-from-Scratch/main/Encoder.png" width="200" align="center"/>
</center>

In [6]:
class TransformerEncoder(nn.Module):
  def __init__(self, embed_dim, num_heads, forward_expansion, dropout=0.1):
    super(TransformerEncoder, self).__init__()

    self.attention = MHSelfAttention(embed_dim, num_heads)
    self.norm1 = nn.LayerNorm(embed_dim)
    self.norm2 = nn.LayerNorm(embed_dim)

    self.feed_forward = nn.Sequential(
        nn.Linear(embed_dim, forward_expansion*embed_dim),
        nn.ReLU(),
        nn.Linear(forward_expansion*embed_dim, embed_dim)
    )

    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    attention_out = self.dropout(self.attention(x))
    x = self.norm1(x + attention_out)
    forward_out = self.dropout(self.feed_forward(x))
    out = self.norm2(x + forward_out)

    return out

### End-to-End Classifier

1. An embedding layer
2. A single transformer encoder layer
3. A fully-connected network as a linear classifier

In [7]:
class Classifier(nn.Module):
  def __init__(self, vocab_size, max_length, embed_dim,
               num_heads, forward_expansion):
      super(Classifier, self).__init__()

      self.embedder = Embedding(vocab_size, max_length, embed_dim)
      self.encoder = TransformerEncoder(embed_dim, num_heads, forward_expansion)
      self.fc = nn.Linear(embed_dim, 1)

  def forward(self, x):
    embedding = self.embedder(x)
    encoding = self.encoder(embedding)
    compact_encoding = encoding.max(dim=1)[0]
    out = self.fc(compact_encoding)
    return out

### Load and Preprocess IMDb Dataset

In [10]:
import tensorflow_datasets as tfds

# Load the IMDb dataset
(train_data, test_data), info = tfds.load('imdb_reviews/subwords8k', 
                                          split=(tfds.Split.TRAIN, tfds.Split.TEST),
                                          with_info=True, as_supervised=True)

tokenizer = info.features['text'].encoder

BUFFER_SIZE = 10000
BATCH_SIZE = 64

train_batches = train_data.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
test_batches = test_data.padded_batch(BATCH_SIZE)


2024-06-13 15:54:47.286683: W external/local_tsl/tsl/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with "NOT_FOUND: Could not locate the credentials file.". Retrieving token from GCE failed with "FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata.google.internal".


[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /Users/aaronnguyen/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0...[0m


  from .autonotebook import tqdm as notebook_tqdm
Dl Size...: 100%|██████████| 80/80 [00:37<00:00,  2.11 MiB/s]rl]
Dl Completed...: 100%|██████████| 1/1 [00:37<00:00, 37.88s/ url]
2024-06-13 15:57:12.647395: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-06-13 15:57:12.647440: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-06-13 15:57:12.647450: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-06-13 15:57:12.647483: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-13 15:57:12.647495: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


[1mDataset imdb_reviews downloaded and prepared to /Users/aaronnguyen/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0. Subsequent calls will reuse this data.[0m


### Initializing The Model

In [11]:
EMBED_DIM = 200
NUM_HEADS = 8
FORWARD_EXPANSION = 3
MAX_LENGTH = 512
VOCAB_SIZE = len(TEXT.vocab)

classifier = Classifier(
    VOCAB_SIZE, MAX_LENGTH, EMBED_DIM, NUM_HEADS, FORWARD_EXPANSION)
device = T.device('cuda' if T.cuda.is_available() else 'cpu')
classifier.to(device)

NameError: name 'TEXT' is not defined

### Training

In [None]:
optimizer = optim.SGD(classifier.parameters(), lr=1e-3)

In [None]:
criterion = nn.BCEWithLogitsLoss()
device = T.device('cuda' if T.cuda.is_available() else 'cpu')
criterion.to(device);

In [None]:
def binary_accuracy(preds, y):
    rounded_preds = T.round(T.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [None]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
def train(model, iterator, optimizer, criterion):

    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:

        optimizer.zero_grad()

        input = batch.text
        if input.shape[1] > MAX_LENGTH:
          input = input[:, :MAX_LENGTH]

        predictions = model(input).squeeze(1)

        loss = criterion(predictions, batch.label)

        acc = binary_accuracy(predictions, batch.label)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()

    with T.no_grad():
        for batch in iterator:

            input = batch.text
            if input.shape[1] > MAX_LENGTH:
              input = input[:, :MAX_LENGTH]

            predictions = model(input).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(
        classifier, train_iterator,optimizer, criterion)
    valid_loss, valid_acc = evaluate(classifier, valid_iterator, criterion)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        T.save(classifier.state_dict(), 'sent-classifier.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 2s
	Train Loss: 0.730 | Train Acc: 49.83%
	 Val. Loss: 0.702 |  Val. Acc: 50.51%
Epoch: 02 | Epoch Time: 1m 1s
	Train Loss: 0.719 | Train Acc: 51.80%
	 Val. Loss: 0.688 |  Val. Acc: 53.43%
Epoch: 03 | Epoch Time: 1m 1s
	Train Loss: 0.704 | Train Acc: 53.22%
	 Val. Loss: 0.689 |  Val. Acc: 53.61%
Epoch: 04 | Epoch Time: 1m 1s
	Train Loss: 0.695 | Train Acc: 55.47%
	 Val. Loss: 0.683 |  Val. Acc: 55.73%
Epoch: 05 | Epoch Time: 1m 1s
	Train Loss: 0.684 | Train Acc: 57.75%
	 Val. Loss: 0.684 |  Val. Acc: 56.09%
Epoch: 06 | Epoch Time: 1m 1s
	Train Loss: 0.670 | Train Acc: 59.65%
	 Val. Loss: 0.641 |  Val. Acc: 62.57%
Epoch: 07 | Epoch Time: 1m 1s
	Train Loss: 0.648 | Train Acc: 62.01%
	 Val. Loss: 0.618 |  Val. Acc: 65.43%
Epoch: 08 | Epoch Time: 1m 1s
	Train Loss: 0.627 | Train Acc: 64.58%
	 Val. Loss: 0.595 |  Val. Acc: 67.84%
Epoch: 09 | Epoch Time: 1m 1s
	Train Loss: 0.602 | Train Acc: 67.27%
	 Val. Loss: 0.586 |  Val. Acc: 68.76%
Epoch: 10 | Epoch Time: 1m 1

### Evaluation

In [None]:
classifier.load_state_dict(T.load('sent-classifier.pt'))

<All keys matched successfully>

In [None]:
test_loss, test_acc = evaluate(classifier, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.578 |  Test Acc: 69.58%
