In [54]:
# import pytorch libraries
%matplotlib inline
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

You need to install the Hugging Face `transformers` library:

`pip install transformers`

# Text Classification
In this part of the tutorial we develop a continuous bag of words (CBOW) model for a text classification task described [here](https://people.cs.umass.edu/~miyyer/pubs/2015_acl_dan.pdf). The CBOW model was first described [here](https://arxiv.org/pdf/1301.3781.pdf).

## Subjectivity Dataset
The subjectivity dataset has 5000 subjective and 5000 objective processed sentences. To get the data:
```
wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
```

In [2]:
def unpack_dataset():
    ! wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
    ! mkdir data
    ! tar -xvf rotten_imdb.tar.gz -C data

In [3]:
unpack_dataset()

--2023-12-24 00:39:17--  http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.36
Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.36|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 519599 (507K) [application/x-gzip]
Saving to: ‘rotten_imdb.tar.gz’


2023-12-24 00:39:18 (1.31 MB/s) - ‘rotten_imdb.tar.gz’ saved [519599/519599]

quote.tok.gt9.5000
plot.tok.gt9.5000
subjdata.README.1.0


In [4]:
!ls data

plot.tok.gt9.5000  quote.tok.gt9.5000  subjdata.README.1.0


In [5]:
! head -2 data/plot.tok.gt9.5000

the movie begins in the past where a young boy named sam attempts to save celebi from a hunter . 
emerging from the human psyche and showing characteristics of abstract expressionism , minimalism and russian constructivism , graffiti removal has secured its place in the history of modern art while being created by artists who are unconscious of their artistic achievements . 


In [6]:
from pathlib import Path
PATH = Path("data")
list(PATH.iterdir())

[PosixPath('data/plot.tok.gt9.5000'),
 PosixPath('data/subjdata.README.1.0'),
 PosixPath('data/quote.tok.gt9.5000')]

## Large Language Model

We will be using Google's "flan-t5-small" model:
https://huggingface.co/google/flan-t5-small#usage

In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5EncoderModel

In [34]:
# T5ForConditionalGeneration is the text-generation variant of the model:
# it is driven with a text prompt (see the translation example below).

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
input_text = "translate English to Spanish: How old are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
input_ids

tensor([[13959,  1566,    12,  5093,    10,   571,   625,    33,    25,    58,
             1]])

In [8]:
outputs = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))

<pad> <unk> Cuántos a<unk> os están?</s>


## Tokenization
Each language model has its own tokenization function. The tokenizer returns a list of ids corresponding to the tokenized text.

In [9]:
input_text = "translate English to Spanish: How old are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
input_ids

tensor([[13959,  1566,    12,  5093,    10,   571,   625,    33,    25,    58,
             1]])

In [10]:
# language model output
outputs = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))

<pad> <unk> Cuántos a<unk> os están?</s>


## Getting the model's (encoder) representation of a piece of text

In [35]:
# T5EncoderModel returns just the encoder, which is useful for classification tasks.
# NOTE: this rebinds `model`, replacing the T5ForConditionalGeneration instance loaded earlier.

model = T5EncoderModel.from_pretrained("google/flan-t5-small")

In [39]:
# Get model output (encoder hidden states)
with torch.no_grad():
    outputs = model(input_ids, return_dict=True)

In [40]:
# Extract the embeddings from the last hidden layer of the encoder
last_hidden_states = outputs.last_hidden_state

In [42]:
outputs.keys()

odict_keys(['last_hidden_state'])

In [14]:
last_hidden_states.shape

torch.Size([1, 11, 512])

In [15]:
sentence_embedding = torch.mean(last_hidden_states, dim=1)
sentence_embedding.shape

torch.Size([1, 512])

## Split dataset into train and validation

In [16]:
from pathlib import Path
PATH = Path("data")
list(PATH.iterdir())

[PosixPath('data/plot.tok.gt9.5000'),
 PosixPath('data/subjdata.README.1.0'),
 PosixPath('data/quote.tok.gt9.5000')]

In [17]:
# We need each line in the file 
def read_file(path):
    """Return every line of the file at `path` as a list of strings.

    The file is decoded as ISO-8859-1; each returned line keeps its
    trailing newline character.
    """
    with open(path, encoding="ISO-8859-1") as handle:
        lines = list(handle)
    return lines

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Load both corpora: the "quote" file feeds the subjective sentences,
# the "plot" file the objective ones.
sub_content = read_file(PATH/"quote.tok.gt9.5000")
obj_content = read_file(PATH/"plot.tok.gt9.5000")

# Labels: 0 for every subjective sentence, 1 for every objective sentence.
sub_y, obj_y = np.zeros(len(sub_content)), np.ones(len(obj_content))

# Stack sentences and labels in the same order so X[i] pairs with y[i].
X = np.concatenate((sub_content, obj_content))
y = np.concatenate((sub_y, obj_y))

In [20]:
X[0], y[0]

('smart and alert , thirteen conversations about one thing is a small gem . \n',
 0.0)

In [21]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
X_train[:5], y_train[:5]

(array(['will god let her fall or give her a new path ? \n',
        "the director's twitchy sketchbook style and adroit perspective shifts grow wearisome amid leaden pacing and indifferent craftsmanship ( most notably wretched sound design ) . \n",
        "welles groupie/scholar peter bogdanovich took a long time to do it , but he's finally provided his own broadside at publishing giant william randolph hearst . \n",
        'based on the 1997 john king novel of the same name with a rather odd synopsis : " a first novel about a seasoned chelsea football club hooligan who represents a disaffected society operating by brutal rules . \n',
        'yet , beneath an upbeat appearance , she is struggling desperately with the emotional and physical scars left by the attack . \n'],
       dtype='<U693'),
 array([1., 0., 0., 1., 1.]))

## Sentence encoding

In [37]:
def get_sentence_encoding(text, model=model):
    """Encode one piece of text as a single fixed-size vector.

    The text is tokenized with the notebook-level `tokenizer`, passed through
    `model`, and the last hidden states are averaged over the token axis.

    Args:
        text: the string to encode.
        model: the encoder to run; defaults to the `model` object that was
            bound when this function was defined.

    Returns:
        A 1-D NumPy array holding the mean of the token representations.
    """
    token_ids = tokenizer(text, return_tensors="pt").input_ids
    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        hidden = model(token_ids, return_dict=True).last_hidden_state
    # Average over tokens (dim=1), then take the only row of the batch.
    pooled = hidden.mean(dim=1)
    return pooled[0].numpy()

In [38]:
v = get_sentence_encoding(X_train[0].strip())
v.shape

(512,)

In [25]:
x_train = np.vstack([get_sentence_encoding(x.strip()) for x in X_train])
x_train.shape

(8000, 512)

In [26]:
x_val = np.vstack([get_sentence_encoding(x.strip()) for x in X_val])
x_val.shape

(2000, 512)

## Training a Logistic regression model

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
y_train.shape

(8000,)

In [29]:
clf = LogisticRegression(random_state=0).fit(x_train, y_train)

In [30]:
clf.score(x_val, y_val)

0.938

## Training with gradient boosting

In [31]:
from sklearn.ensemble import GradientBoostingClassifier

In [32]:
clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1,
                                 max_depth=7, random_state=0).fit(x_train, y_train)

In [33]:
clf.score(x_val, y_val)

0.9175

# Finetuning a deep learning model

In [43]:
from transformers import T5Model, T5Config

In [47]:
# Load pre-trained T5 model configuration
config = T5Config.from_pretrained('google/flan-t5-small')

In [50]:
#config

In [55]:
config.dropout_rate, config.d_model

(0.1, 512)

In [56]:
class T5ForTextClassification(nn.Module):
    """Flan-T5 encoder followed by dropout and a linear classification head.

    The sentence representation is the mean of the encoder's last hidden
    states over the token axis. Padding positions are excluded from that
    mean whenever an attention mask is supplied.

    Args:
        config: model configuration; only `dropout_rate` and `d_model` are read.
        num_labels: number of outputs of the linear head (default 1).
    """

    def __init__(self, config, num_labels=1):
        super().__init__()
        self.t5 = T5EncoderModel.from_pretrained("google/flan-t5-small")
        self.dropout = nn.Dropout(config.dropout_rate)
        self.classifier = nn.Linear(config.d_model, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Compute classification logits for a batch of token ids.

        Args:
            input_ids: token ids, passed straight to the encoder.
            attention_mask: optional mask passed to the encoder and used for
                pooling; expected to be (batch, seq) with 1 for real tokens
                and 0 for padding.
            labels: accepted for interface compatibility; not used here.

        Returns:
            Logits produced by the linear head, one row per input sequence.
        """
        outputs = self.t5(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        if attention_mask is None:
            # No padding information: plain mean over all token positions.
            pooled_output = last_hidden_state.mean(dim=1)
        else:
            # Masked mean: padded positions must not dilute the sentence
            # vector, otherwise the result depends on the batch's padding.
            mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
            token_sum = (last_hidden_state * mask).sum(dim=1)
            # clamp guards against division by zero for an all-padding row.
            token_count = mask.sum(dim=1).clamp(min=1.0)
            pooled_output = token_sum / token_count

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [57]:
model = T5ForTextClassification(config)

In [59]:
X_train[0]

'will god let her fall or give her a new path ? \n'

In [60]:
input_ids = tokenizer(X_train[0], return_tensors="pt").input_ids
input_ids

tensor([[  56, 8581,  752,  160, 1590,   42,  428,  160,    3,    9,  126, 2071,
            3,   58,    1]])

In [61]:
input_ids.shape

torch.Size([1, 15])

In [62]:
model(input_ids)

tensor([[0.0479]], grad_fn=<AddmmBackward0>)

## Getting ready to train in batches
1. The first step is to use the tokenizer to tokenize the data
2. Add paddin