<a href="https://colab.research.google.com/github/txin-y/23springNeuralNetworkProject/blob/main/IMDBtextclassification_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install 'portalocker>=2.0.0'
!pip install portalocker


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting portalocker>=2.0.0
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.7.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
from torchtext.datasets import IMDB
import portalocker
from torch import nn
import torch.nn.functional as F
# NOTE(review): this iterator is not consumed in this cell; `train_iter` is
# rebound to a fresh IMDB(split='train') before the vocabulary is built.
train_iter = iter(IMDB(split='train'))

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator


tokenizer = get_tokenizer('basic_english')
train_iter = IMDB(split='train')


def yield_tokens(data_iter):
    """Lazily yield the tokenized text of every ``(label, text)`` pair."""
    yield from (tokenizer(review) for _, review in data_iter)


# The vocabulary is built from the training split only. "<unk>" is the single
# special token and is also what unknown tokens are mapped to on lookup.
vocab = build_vocab_from_iterator(
    yield_tokens(train_iter),
    specials=["<unk>"],
)
vocab.set_default_index(vocab["<unk>"])

In [None]:
def text_pipeline(x):
    """Tokenize the raw text ``x`` and map each token to its vocabulary index."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a dataset label to a zero-based class id (``int(x) - 1``)."""
    return int(x) - 1

In [None]:
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    """Collate ``(label, text)`` samples into flat tensors plus start offsets.

    Returns a ``(labels, tokens, offsets)`` tuple, each moved to ``device``:
    ``labels`` is a float tensor of ``label_pipeline`` outputs, ``tokens`` is
    the int64 concatenation of every sample's ``text_pipeline`` output, and
    ``offsets[i]`` is the index in ``tokens`` at which sample ``i`` starts.
    """
    processed = [
        (label_pipeline(raw_label),
         torch.tensor(text_pipeline(raw_text), dtype=torch.int64))
        for raw_label, raw_text in batch
    ]
    class_ids = [class_id for class_id, _ in processed]
    encoded = [tokens for _, tokens in processed]

    label_tensor = torch.tensor(class_ids, dtype=torch.float)

    # Sample i starts where samples 0..i-1 end: cumulative sum over
    # [0, len_0, len_1, ...] with the last length dropped.
    sizes = [0] + [tokens.size(0) for tokens in encoded]
    start_positions = torch.tensor(sizes[:-1]).cumsum(dim=0)

    flat_tokens = torch.cat(encoded)
    return label_tensor.to(device), flat_tokens.to(device), start_positions.to(device)

train_iter = IMDB(split='train')

# Number of distinct labels found in one pass over the training split.
num_class = len({label for label, _ in train_iter})

In [None]:
from torch import nn, Tensor
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence, then apply dropout.

    Args:
        d_model: size of the embedding dimension (even or odd).
        dropout: dropout probability applied to the summed output.
        max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd-indexed slot than there are
        # frequencies, so the surplus frequency is dropped for the cosine half.
        pe[:, 0, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Registered as a buffer: follows the module across devices but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``

        Returns:
            ``x`` plus the encodings of its first ``seq_len`` positions
            (broadcast over the batch dimension), passed through dropout.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [None]:
class Net(nn.Module):
    """
    Text classifier based on a pytorch TransformerEncoder.

    Every review is pooled into a single ``d_model`` vector by
    ``nn.EmbeddingBag``, passed through the encoder stack as a length-1
    sequence, and classified by a linear head followed by a softmax.

    Args:
        vocab_size: number of rows in the embedding table.
        d_model: embedding / encoder width; must be divisible by ``nhead``.
        nhead: number of attention heads in each encoder layer.
        dim_feedforward: hidden width of each encoder layer's feed-forward net.
        num_layers: number of stacked encoder layers.
        dropout: dropout used by the positional encoding and encoder layers.
        activation: feed-forward activation of the encoder layers.
        classifier_dropout: dropout applied to the pooled vector before the head.

    The width of the output is read from the module-level ``num_class``.
    """

    def __init__(
        self,
        vocab_size,
        d_model,
        nhead=4,
        dim_feedforward=2048,
        num_layers=2,
        dropout=0,
        activation="relu",
        classifier_dropout=0,
    ):

        super().__init__()

        assert d_model % nhead == 0, "nheads must divide evenly into d_model"

        self.emb = nn.EmbeddingBag(vocab_size, d_model)

        self.pos_encoder = PositionalEncoding(
            d_model=d_model,
            dropout=dropout,
        )

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=activation,
        )
        self.encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers,
        )
        self.classifier_dropout = nn.Dropout(p=classifier_dropout)
        self.decoder = nn.Linear(d_model, num_class)
        self.d_model = d_model
        self.softmax = nn.Softmax(1)

    def init_weights(self) -> None:
        """Re-initialise the embedding table and the classification head."""
        initrange = 0.1
        # The embedding table lives on `self.emb`; `self.encoder` is the
        # TransformerEncoder stack and has no `weight` attribute of its own.
        self.emb.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, x, offsets):
        """Return per-class probabilities, one row per review.

        Args:
            x: 1-D tensor holding the token ids of all reviews, concatenated.
            offsets: 1-D tensor with the start index of each review in ``x``.

        Returns:
            Tensor of shape ``(batch, num_class)`` whose rows sum to 1.
        """
        # (batch, d_model): one pooled embedding per review.
        x = self.emb(x, offsets) * math.sqrt(self.d_model)
        # The encoder and positional encoding are sequence-first, so present
        # each review as a length-1 sequence: (1, batch, d_model). Adding the
        # positional buffer to the 2-D tensor would broadcast to
        # (batch, batch, d_model) and the pooling below would then average
        # across different reviews.
        x = x.unsqueeze(0)
        x = self.pos_encoder(x)
        x = self.encoder(x)
        # Pool over the sequence axis only, keeping one row per review.
        x = x.mean(dim=0)
        x = self.classifier_dropout(x)
        x = self.decoder(x)
        x = self.softmax(x)
        return x

    # NOTE(review): because EmbeddingBag pools each review before the encoder,
    # token order is not modelled. Attending over tokens would require
    # nn.Embedding on padded per-review sequences plus a padding mask.

In [None]:
import math
# Build the classifier first, then move its parameters to the selected device.
model = Net(
    vocab_size=len(vocab),
    d_model=200,
    nhead=2,               # attention heads per encoder layer
    dim_feedforward=2048,  # width of the feed-forward net inside each encoder layer
    num_layers=2,
    dropout=0.1,
    classifier_dropout=0.1,
)
model = model.to(device)

In [None]:
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention over a ``(b, t, k)`` input.

    ``k`` must be divisible by ``heads``; each head works on a ``k // heads``
    wide slice. The ``mask`` argument is accepted but not used.
    """

    def __init__(self, k, heads=4, mask=False):
        super().__init__()

        assert k % heads == 0

        self.k = k
        self.heads = heads

        # Projections that produce keys, queries and values for all heads at once.
        self.tokeys = nn.Linear(k, k, bias=False)
        self.toqueries = nn.Linear(k, k, bias=False)
        self.tovalues = nn.Linear(k, k, bias=False)

        # Mixes the concatenated head outputs back into a k-wide vector.
        self.unifyheads = nn.Linear(k, k)

    def forward(self, x):
        """Return the attended sequence, same ``(b, t, k)`` shape as ``x``."""
        batch, seq_len, width = x.size()
        n_heads = self.heads
        head_dim = width // n_heads

        def split_heads(projected):
            # (b, t, k) -> (b, t, h, s) -> (b, h, t, s) -> (b*h, t, s)
            return (
                projected.view(batch, seq_len, n_heads, head_dim)
                .transpose(1, 2)
                .contiguous()
                .view(batch * n_heads, seq_len, head_dim)
            )

        queries = split_heads(self.toqueries(x))
        keys = split_heads(self.tokeys(x))
        values = split_heads(self.tovalues(x))

        # Raw (b*h, t, t) attention scores, scaled by sqrt(k).
        scores = torch.bmm(queries, keys.transpose(1, 2)) / (width ** (1 / 2))
        # Normalise each row so the weights over the attended positions sum to 1.
        weights = F.softmax(scores, dim=2)

        # Weighted sum of values, then undo the head folding: (b*h, t, s) -> (b, t, k).
        merged = (
            torch.bmm(weights, values)
            .view(batch, n_heads, seq_len, head_dim)
            .transpose(1, 2)
            .contiguous()
            .view(batch, seq_len, head_dim * n_heads)
        )

        return self.unifyheads(merged)

In [None]:
class TransformerBlock(nn.Module):
    """Self-attention followed by a feed-forward net.

    Each of the two sub-layers is wrapped in a residual connection, with a
    LayerNorm applied after the addition.
    """

    def __init__(self, k, heads):
        super().__init__()

        self.attention = SelfAttention(k, heads=heads)

        self.norm1 = nn.LayerNorm(k)
        self.norm2 = nn.LayerNorm(k)

        # Feed-forward net with a hidden layer four times as wide as the input.
        hidden = 4 * k
        self.ff = nn.Sequential(
            nn.Linear(k, hidden),
            nn.ReLU(),
            nn.Linear(hidden, k),
        )

    def forward(self, x):
        """Apply attention and feed-forward sub-layers; output has the shape of ``x``."""
        after_attention = self.norm1(self.attention(x) + x)
        return self.norm2(self.ff(after_attention) + after_attention)

In [None]:
import torch.nn.functional as F
class Transformer(nn.Module):
    """Sequence classifier: token + learned position embeddings, a stack of
    ``TransformerBlock`` layers, mean pooling over time and a linear head.

    Args:
        k: embedding width.
        heads: attention heads per block.
        depth: number of transformer blocks.
        seq_length: size of the position-embedding table; inputs longer than
            this cannot be looked up.
        num_tokens: size of the token-embedding table.
        num_classes: number of output classes.
    """

    def __init__(self, k, heads, depth, seq_length, num_tokens, num_classes):
        super().__init__()

        self.num_tokens = num_tokens
        self.token_emb = nn.Embedding(num_tokens, k)
        self.pos_emb = nn.Embedding(seq_length, k)

        # The sequence of transformer blocks that does all the
        # heavy lifting
        tblocks = []
        for i in range(depth):
            tblocks.append(TransformerBlock(k=k, heads=heads))
        self.tblocks = nn.Sequential(*tblocks)

        # Maps the final output sequence to class logits
        self.toprobs = nn.Linear(k, num_classes)

    def forward(self, x):
        """
        :param x: A (b, t) tensor of integer values representing
                  words (in some predetermined vocabulary).
        :return: A (b, c) tensor of log-probabilities over the
                 classes (where c is the nr. of classes).
        """
        # generate token embeddings
        tokens = self.token_emb(x)
        b, t, k = tokens.size()

        # generate position embeddings; the index tensor must live on the same
        # device as the input, otherwise the lookup fails once the model is on GPU
        positions = torch.arange(t, device=x.device)
        positions = self.pos_emb(positions)[None, :, :].expand(b, t, k)

        x = tokens + positions
        x = self.tblocks(x)

        # Average-pool over the t dimension and project to class
        # probabilities
        x = self.toprobs(x.mean(dim=1))
        return F.log_softmax(x, dim=1)

In [None]:
# model = Transformer(
#     k=200,
#     heads=4,
#     depth=2,  
#     seq_length=len(vocab),
#     num_tokens=len(vocab),
#     num_classes = 1,
# ).to(device)

In [None]:
import time

def train(dataloader):
    """Run one training epoch of the global ``model`` over ``dataloader``.

    Relies on the module-level ``model``, ``optimizer``, ``criterion`` and
    ``epoch``. Every batch is a ``(label, text, offsets)`` triple. The running
    accuracy is printed every ``log_interval`` batches and then reset.
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        # (batch, num_class) class probabilities (the model ends in a softmax).
        probs = model(text, offsets)

        # The loss has to be taken on the differentiable model output; argmax
        # is not differentiable and would cut the graph, leaving the parameters
        # without gradients. `criterion` is a BCELoss, so it receives the
        # probability of class 1 against the float 0/1 labels (binary task).
        loss = criterion(probs[:, 1], label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

        # Hard predictions are only needed for the accuracy metric.
        predicted_label = probs.argmax(1)
        total_acc += (predicted_label == label.long()).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0

def evaluate(dataloader):
    """Return the accuracy of the global ``model`` over ``dataloader``.

    Every batch is a ``(label, text, offsets)`` triple. The model is put in
    eval mode and no gradients are tracked.
    """
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            # argmax already returns a tensor; re-wrapping it in torch.tensor()
            # only copies it and triggers a UserWarning.
            predicted_label = model(text, offsets).argmax(1)
            total_acc += (predicted_label == label.long()).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [None]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
# Hyperparameters
EPOCHS = 30 # epoch
# Step size for Adam. A value of 1 is roughly 1000x Adam's usual step size
# and drives the softmax outputs into saturation almost immediately.
LR = 1e-3  # learning rate
BATCH_SIZE = 50 # batch size for training
dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
# BCELoss expects probabilities in [0, 1] and float targets.
criterion = nn.BCELoss()
# criterion = torch.nn.CrossEntropyLoss()
# criterion = torch.nn.BCEWithLogitsLoss()
#
# optimizer = torch.optim.SGD(model.parameters(), lr=LR)
optimizer = torch.optim.Adam(model.parameters(),lr=LR)
# Multiplies the learning rate by 0.95 on every scheduler.step() call.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None
train_iter, test_iter = IMDB()
# Map-style copies are needed so the data can be split and shuffled by index.
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# Hold out 5% of the training data for validation.
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

In [None]:
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)

    # `total_accu` tracks the best validation accuracy so far. The learning
    # rate is decayed only in epochs whose accuracy falls below it.
    if total_accu is None or not total_accu > accu_val:
        total_accu = accu_val
    else:
        scheduler.step()

    print('-' * 59)
    print(
        '| end of epoch {:3d} | time: {:5.2f}s | '
        'valid accuracy {:8.3f} '.format(
            epoch,
            time.time() - epoch_start_time,
            accu_val,
        )
    )
    print('-' * 59)

  predicted_label = torch.tensor(predicted_label.argmax(1),dtype=torch.float32)
  predicted_label = torch.tensor(predicted_label.argmax(1),dtype=torch.float32)


-----------------------------------------------------------
| end of epoch   1 | time: 154.02s | valid accuracy    0.500 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time: 150.50s | valid accuracy    0.500 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time: 149.17s | valid accuracy    0.500 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time: 149.61s | valid accuracy    0.500 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time: 147.11s | valid accuracy    0.500 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   6 | time: 147