In [1]:
# import pytorch libraries
%matplotlib inline
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

You need to install

`pip install transformers`

# Text Classification
In this part of the tutorial we develop a continuous bag of words (CBOW) model for a text classification task described [here]( https://people.cs.umass.edu/~miyyer/pubs/2015_acl_dan.pdf). The CBOW model was first described [here](https://arxiv.org/pdf/1301.3781.pdf)

## Subjectivity Dataset
The subjectivity dataset has 5000 subjective and 5000 objective processed sentences. To get the data:
```
wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
```

In [2]:
def unpack_dataset():
    ! wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
    ! mkdir data
    ! tar -xvf rotten_imdb.tar.gz -C data

In [3]:
unpack_dataset()

--2024-02-29 16:19:15--  http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.36
Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.36|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 519599 (507K) [application/x-gzip]
Saving to: ‘rotten_imdb.tar.gz.1’


2024-02-29 16:19:16 (1.37 MB/s) - ‘rotten_imdb.tar.gz.1’ saved [519599/519599]

mkdir: cannot create directory ‘data’: File exists
quote.tok.gt9.5000
plot.tok.gt9.5000
subjdata.README.1.0


In [4]:
!ls data

plot.tok.gt9.5000  quote.tok.gt9.5000  subjdata.README.1.0


In [5]:
! head -2 data/plot.tok.gt9.5000

the movie begins in the past where a young boy named sam attempts to save celebi from a hunter . 
emerging from the human psyche and showing characteristics of abstract expressionism , minimalism and russian constructivism , graffiti removal has secured its place in the history of modern art while being created by artists who are unconscious of their artistic achievements . 


In [6]:
from pathlib import Path
PATH = Path("data")
list(PATH.iterdir())

[PosixPath('data/plot.tok.gt9.5000'),
 PosixPath('data/subjdata.README.1.0'),
 PosixPath('data/quote.tok.gt9.5000')]

## Large Laguage Model

We will be using google model "flan-t5-small."
https://huggingface.co/google/flan-t5-small#usage

In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5EncoderModel

In [8]:
# use T5ForConditionalGeneration you use this model with a prompt

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
input_text = "translate English to Spanish: How old are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
input_ids

tensor([[13959,  1566,    12,  5093,    10,   571,   625,    33,    25,    58,
             1]])

In [10]:
outputs = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))

<pad> <unk> Cuántos a<unk> os están?</s>


## Tokenization
Each language model has it's own tokenization function. The tokenizer returns a list of ids correspoding to the tokenized text.

In [11]:
input_text = "translate English to Spanish: How old are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
input_ids

tensor([[13959,  1566,    12,  5093,    10,   571,   625,    33,    25,    58,
             1]])

In [12]:
tokenizer(input_text, return_tensors="pt")

{'input_ids': tensor([[13959,  1566,    12,  5093,    10,   571,   625,    33,    25,    58,
             1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [13]:
# language model output
outputs = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))

<pad> <unk> Cuántos a<unk> os están?</s>


## Getting the model's (encoder) representation of a piece of text

In [14]:
## T5EncoderModel will return just the encoder, which is useful for classification tasks

model = T5EncoderModel.from_pretrained("google/flan-t5-small")

In [15]:
# Get model output (encoder hidden states)
with torch.no_grad():
    outputs = model(input_ids, return_dict=True)

In [16]:
# Extract the embeddings from the last hidden layer of the encoder
last_hidden_states = outputs.last_hidden_state

In [17]:
outputs.keys()

odict_keys(['last_hidden_state'])

In [18]:
last_hidden_states.shape

torch.Size([1, 11, 512])

In [19]:
sentence_embedding = torch.mean(last_hidden_states, dim=1)
sentence_embedding.shape

torch.Size([1, 512])

## Split dataset in train and validation

In [20]:
from pathlib import Path
PATH = Path("data")
list(PATH.iterdir())

[PosixPath('data/plot.tok.gt9.5000'),
 PosixPath('data/subjdata.README.1.0'),
 PosixPath('data/quote.tok.gt9.5000')]

In [21]:
# We need each line in the file 
def read_file(path):
    """ Read file returns a list of lines.
    """
    with open(path, encoding = "ISO-8859-1") as f:
        content = f.readlines()
    return content

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
sub_content = read_file(PATH/"quote.tok.gt9.5000")
obj_content = read_file(PATH/"plot.tok.gt9.5000")
sub_y = np.zeros(len(sub_content))
obj_y = np.ones(len(obj_content))
X = np.append(sub_content, obj_content)
y = np.append(sub_y, obj_y)

In [24]:
X[0], y[0]

('smart and alert , thirteen conversations about one thing is a small gem . \n',
 0.0)

In [25]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
X_train[:5], y_train[:5]

(array(['will god let her fall or give her a new path ? \n',
        "the director's twitchy sketchbook style and adroit perspective shifts grow wearisome amid leaden pacing and indifferent craftsmanship ( most notably wretched sound design ) . \n",
        "welles groupie/scholar peter bogdanovich took a long time to do it , but he's finally provided his own broadside at publishing giant william randolph hearst . \n",
        'based on the 1997 john king novel of the same name with a rather odd synopsis : " a first novel about a seasoned chelsea football club hooligan who represents a disaffected society operating by brutal rules . \n',
        'yet , beneath an upbeat appearance , she is struggling desperately with the emotional and physical scars left by the attack . \n'],
       dtype='<U693'),
 array([1., 0., 0., 1., 1.]))

## Sentence encoding

In [27]:
def get_sentence_encoding(text, model=model):
    input_ids = tokenizer(text, return_tensors="pt").input_ids
    with torch.no_grad():
        outputs = model(input_ids, return_dict=True)
    last_hidden_states = outputs.last_hidden_state
    sentence_embedding = torch.mean(last_hidden_states, dim=1)
    return sentence_embedding[0].numpy()

In [28]:
v = get_sentence_encoding(X_train[0].strip())
v.shape

(512,)

In [29]:
x_train = np.vstack([get_sentence_encoding(x.strip()) for x in X_train])
x_train.shape

(8000, 512)

In [30]:
x_val = np.vstack([get_sentence_encoding(x.strip()) for x in X_val])
x_val.shape

(2000, 512)

## Training a Logistic regression model

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
y_train.shape

(8000,)

In [33]:
clf = LogisticRegression(random_state=0).fit(x_train, y_train)

In [34]:
clf.score(x_val, y_val)

0.938

## Traning with gradient boosting 

In [35]:
from sklearn.ensemble import GradientBoostingClassifier

In [36]:
clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1,
                                 max_depth=7, random_state=0).fit(x_train, y_train)

In [37]:
clf.score(x_val, y_val)

0.9175

# Finetuning a deep learning model

In [38]:
from transformers import T5Model, T5Config

In [39]:
# Load pre-trained T5 model configuration
config = T5Config.from_pretrained('google/flan-t5-small')

In [40]:
#config

In [41]:
config.dropout_rate, config.d_model

(0.1, 512)

In [42]:
class T5ForTextClassification(nn.Module):
    def __init__(self, config, num_labels=1):
        super(T5ForTextClassification, self).__init__()
        self.t5 = T5EncoderModel.from_pretrained("google/flan-t5-small")
        self.dropout = nn.Dropout(config.dropout_rate)
        self.classifier = nn.Linear(config.d_model, num_labels)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.t5(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        # Mask out the padding tokens before calculating the mean
        attention_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size())
        sum_hidden_state = torch.sum(last_hidden_state * attention_mask_expanded, 1)
        sum_attention_mask = torch.clamp(attention_mask_expanded.sum(1), min=1e-9)
        pooled_output = sum_hidden_state / sum_attention_mask
        
        #pooled_output = last_hidden_state.mean(dim=1)  # You may customize this pooling strategy
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [43]:
model = T5ForTextClassification(config)

In [44]:
X_train[0]

'will god let her fall or give her a new path ? \n'

In [45]:
batch = tokenizer(list(X_train[:2]), padding=True, truncation=True, return_tensors="pt")
batch

{'input_ids': tensor([[   56,  8581,   752,   160,  1590,    42,   428,   160,     3,     9,
           126,  2071,     3,    58,     1,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [    8,  2090,    31,     7,     3,    17,  7820, 11971, 13278,  2567,
           869,    11,     3,     9,  9869,  3503,  4108,     7,  1604,  2112,
            23,  5529, 18905,   991,    35,     3,  5379,    53,    11,    16,
         25880, 25112,    41,   167,     3, 20283,     3,   210,    52, 25872,
          1345,   408,     3,    61,     3,     5,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [46]:
y_hat = model(batch.input_ids, batch.attention_mask)
y_hat

tensor([[-0.0220],
        [ 0.0028]], grad_fn=<AddmmBackward0>)

In [47]:
y = torch.Tensor(np.array(y_train[:2, None]))
y

tensor([[1.],
        [0.]])

In [48]:
F.binary_cross_entropy_with_logits(y_hat, y)

tensor(0.6994, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

## Getting ready to train in batches
1. The first step is to use the tokenize and pad to our batches of data.
3. Create a dataset

We have two choices: we could pad all the sequences at the same time to the maximum sequence length, or we could pad a batch of data at a time. The first option is easier, so we will start there.

In [49]:
list(X_train[:5])

['will god let her fall or give her a new path ? \n',
 "the director's twitchy sketchbook style and adroit perspective shifts grow wearisome amid leaden pacing and indifferent craftsmanship ( most notably wretched sound design ) . \n",
 "welles groupie/scholar peter bogdanovich took a long time to do it , but he's finally provided his own broadside at publishing giant william randolph hearst . \n",
 'based on the 1997 john king novel of the same name with a rather odd synopsis : " a first novel about a seasoned chelsea football club hooligan who represents a disaffected society operating by brutal rules . \n',
 'yet , beneath an upbeat appearance , she is struggling desperately with the emotional and physical scars left by the attack . \n']

In [50]:
batch = tokenizer(list(X_train[:3]), padding=True, truncation=True, return_tensors="pt")

In [51]:
batch

{'input_ids': tensor([[   56,  8581,   752,   160,  1590,    42,   428,   160,     3,     9,
           126,  2071,     3,    58,     1,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [    8,  2090,    31,     7,     3,    17,  7820, 11971, 13278,  2567,
           869,    11,     3,     9,  9869,  3503,  4108,     7,  1604,  2112,
            23,  5529, 18905,   991,    35,     3,  5379,    53,    11,    16,
         25880, 25112,    41,   167,     3, 20283,     3,   210,    52, 25872,
          1345,   408,     3,    61,     3,     5,     1,     0,     0,     0,
             0,     0,     0],
        [  168,    15,     7,   563,    23,    15,    87,   860, 17401,   158,
           449,     3, 12247,    26,     9,  5326,   36

In [52]:
x_train = tokenizer(list(X_train), padding=True, truncation=True, return_tensors="pt")

In [53]:
x_train["input_ids"].shape, x_train["attention_mask"].shape

(torch.Size([8000, 230]), torch.Size([8000, 230]))

In [54]:
x_val = tokenizer(list(X_val), padding=True, truncation=True, return_tensors="pt")
x_val["input_ids"].shape, x_val["attention_mask"].shape

(torch.Size([2000, 124]), torch.Size([2000, 124]))

In [55]:
from torch.utils.data import Dataset, DataLoader

In [56]:
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X["input_ids"]
        self.attention = X["attention_mask"]
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.attention[idx], self.y[idx, None]

In [57]:
train_ds = TextDataset(x_train, y_train)
val_ds = TextDataset(x_val, y_val)

In [58]:
#val_ds[0]

In [59]:
train_dl = DataLoader(train_ds, batch_size=8, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=8, shuffle=False)

In [60]:
x, att, y = next(iter(train_dl))

In [61]:
x.shape, att.shape, y.shape

(torch.Size([8, 230]), torch.Size([8, 230]), torch.Size([8, 1]))

In [62]:
y_hat = model(x, att)

In [63]:
y_hat

tensor([[ 0.0078],
        [-0.0325],
        [ 0.0044],
        [-0.0146],
        [-0.0128],
        [ 0.0114],
        [-0.0031],
        [-0.0059]], grad_fn=<AddmmBackward0>)

In [64]:
F.binary_cross_entropy_with_logits(y_hat, y)

tensor(0.6968, dtype=torch.float64,
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

In [65]:
pred = (y_hat > 0).int()
pred

tensor([[1],
        [0],
        [1],
        [0],
        [0],
        [1],
        [0],
        [0]], dtype=torch.int32)

In [66]:
pred.eq(y).sum().item()

2

## Training Loop

In [67]:
from torch.optim.lr_scheduler import OneCycleLR

In [68]:
def val_metrics(model, val_dl):
    model.eval()
    losses = []
    correct = 0
    total = 0
    for x, att, y in val_dl:
        y_hat = model(x.cuda(), att.cuda())
        loss = F.binary_cross_entropy_with_logits(y_hat, y.cuda())
        pred = (y_hat > 0).int().cpu()
        correct += pred.eq(y).sum().item()
        losses.append(loss.item())
        total += y.shape[0]
    return np.mean(losses), correct/total

In [69]:
model = model.cuda()
val_metrics(model, val_dl)

(0.6959807935734279, 0.3835)

In [70]:
def train_loop(model, optimizer, scheduler, train_dl, val_dl, epochs=10):
    for i in range(epochs):
        model.train()
        losses = []
        for x, att, y in train_dl:
            y_hat = model(x.cuda(), att.cuda())
            loss = F.binary_cross_entropy_with_logits(y_hat, y.cuda())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
            scheduler.step()
        train_loss = np.mean(losses)
        val_loss, val_acc = val_metrics(model, val_dl)
        print("train_loss %.3f val_loss %.3f val_acc %.3f" % (train_loss, val_loss, val_acc))

In [71]:
model = T5ForTextClassification(config).cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
epochs = 10
scheduler = OneCycleLR(optimizer, max_lr=1e-5, 
                       total_steps=len(train_dl) * epochs, 
                       div_factor=25.0, final_div_factor=100.0)

In [72]:
train_loop(model, optimizer, scheduler, train_dl, val_dl, epochs=epochs)

train_loss 0.689 val_loss 0.684 val_acc 0.581
train_loss 0.641 val_loss 0.498 val_acc 0.906
train_loss 0.401 val_loss 0.279 val_acc 0.925
train_loss 0.267 val_loss 0.210 val_acc 0.936
train_loss 0.207 val_loss 0.188 val_acc 0.944
train_loss 0.185 val_loss 0.165 val_acc 0.945
train_loss 0.169 val_loss 0.163 val_acc 0.947
train_loss 0.162 val_loss 0.154 val_acc 0.948
train_loss 0.151 val_loss 0.153 val_acc 0.947
train_loss 0.155 val_loss 0.153 val_acc 0.947


In [75]:
model = T5ForTextClassification(config).cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
epochs = 20
scheduler = OneCycleLR(optimizer, max_lr=1e-5, 
                       total_steps=len(train_dl) * epochs, 
                       div_factor=25.0, final_div_factor=100.0)

In [76]:
train_loop(model, optimizer, scheduler, train_dl, val_dl, epochs=epochs)

train_loss 0.692 val_loss 0.690 val_acc 0.510
train_loss 0.687 val_loss 0.678 val_acc 0.720
train_loss 0.636 val_loss 0.523 val_acc 0.908
train_loss 0.448 val_loss 0.326 val_acc 0.932
train_loss 0.305 val_loss 0.229 val_acc 0.938
train_loss 0.226 val_loss 0.187 val_acc 0.943
train_loss 0.191 val_loss 0.163 val_acc 0.949
train_loss 0.170 val_loss 0.152 val_acc 0.951
train_loss 0.149 val_loss 0.145 val_acc 0.953
train_loss 0.135 val_loss 0.145 val_acc 0.955
train_loss 0.124 val_loss 0.137 val_acc 0.956
train_loss 0.115 val_loss 0.135 val_acc 0.956
train_loss 0.112 val_loss 0.131 val_acc 0.957
train_loss 0.108 val_loss 0.130 val_acc 0.957
train_loss 0.102 val_loss 0.130 val_acc 0.958
train_loss 0.101 val_loss 0.128 val_acc 0.960
train_loss 0.096 val_loss 0.128 val_acc 0.959
train_loss 0.096 val_loss 0.128 val_acc 0.959
train_loss 0.096 val_loss 0.128 val_acc 0.959
train_loss 0.094 val_loss 0.127 val_acc 0.959
