In [1]:
# kind of a simplidied version of glue.py and run_glue.py from huggingface

In [2]:
import torchtext
import pandas as pd
from torchtext import datasets
import time
import torch as torch
import numpy as np
import torch.nn as nn
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [3]:
# bert imports
# To import BertTokenizerFast i must install from src
from transformers.data.processors.utils import InputFeatures
from transformers import BertForSequenceClassification,BertModel,BertConfig,BertTokenizerFast,BertTokenizer,AdamW,get_linear_schedule_with_warmup

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
# Download yelp  polarity
torchtext.utils.download_from_url(datasets.text_classification.URLS['YelpReviewPolarity'])
!tar -C .data -xvf .data/yelp_review_polarity_csv.tar.gz

yelp_review_polarity_csv/
yelp_review_polarity_csv/readme.txt
yelp_review_polarity_csv/test.csv
yelp_review_polarity_csv/train.csv


In [6]:
train_df = pd.read_csv('.data/yelp_review_polarity_csv/train.csv',names=['label','text'])
test_df = pd.read_csv('.data/yelp_review_polarity_csv/test.csv',names=['label','text'])

In [7]:
bert_model_name='bert-base-uncased'

In [8]:
#TODO YONIGO - where is this cached?
#config = BertConfig.from_pretrained(bert_model_name)

In [9]:
def convert_examples_to_features(examples,
                                 labels,
                                 tokenizer,          
                                 max_length=512,
                                 task=None,
                                 label_list=None,
                                 output_mode=None,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 mask_padding_with_zero=True,):
    features = []
    label_map = {label: i for i, label in enumerate(np.unique(labels))}
    for (ex_index, (example,label)) in enumerate(zip(tqdm(examples),labels)):
        inputs = tokenizer.encode_plus(example,add_special_tokens = True,max_length=max_length)
        input_ids, token_type_ids, attention_mask = inputs["input_ids"], inputs["token_type_ids"], inputs['attention_mask']
        padding_length = max_length - len(input_ids)
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        features.append(InputFeatures(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label_map[label]))
    return features

In [10]:
tokenizer = BertTokenizerFast.from_pretrained(bert_model_name, do_lower_case=True,cache_dir='.cache',max_length=128)
train_features = convert_examples_to_features(train_df['text'][0:100000].values,train_df['label'][0:100000].values,tokenizer)
test_features = convert_examples_to_features(test_df['text'].values,test_df['label'].values,tokenizer)

100%|██████████| 100000/100000 [00:38<00:00, 2618.01it/s]
100%|██████████| 38000/38000 [00:15<00:00, 2483.56it/s]


In [11]:
def create_dataset(features):
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset
train_dataset = create_dataset(train_features)
test_dataset = create_dataset(test_features)

In [12]:
# instead of using BertForSequenceClassification I wanted to implement a network myself
class BertWithLClassifier(nn.Module):
    def __init__(self, output_size):
        super().__init__()

        self.bert = BertModel.from_pretrained(bert_model_name,cache_dir='.cache')
        config = self.bert.config
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, output_size)

        nn.init.xavier_normal_(self.classifier.weight)
        nn.init.zeros_(self.classifier.bias)

    def freeze_bert(self):
        for param in self.bert.parameters():
            param.requires_grad = False

    def unfreeze_bert(self):
        for param in self.bert.parameters():
            param.requires_grad = True
    
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        return logits

In [13]:
# fit/test functions
def fit(iterator, model, optimizer, criterion,scheduler,gradient_accumulation_steps):
    train_loss = 0
    train_acc = 0
    model.train()
    all_y = []
    all_y_hat = []
    global_step = 0
    for step, batch in enumerate(tqdm(train_dataloader,'training epoch')):
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0].to(device), "attention_mask": batch[1].to(device),'token_type_ids':batch[2].to(device), "labels": batch[3].to(device)}
        y = inputs['labels'].to(device)
        y_hat = model(**inputs)
        loss = criterion(y_hat, y)
        
        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps
        loss.backward()
        
        train_loss += loss.item()
        if (step + 1) % gradient_accumulation_steps == 0:
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule. TODO YONIGO - why not every epoch?
            model.zero_grad()
            global_step+=1
        all_y.append(y)
        all_y_hat.append(y_hat)
        
    y = torch.cat(all_y,dim=0)
    y_hat = torch.cat(all_y_hat,dim=0)
    acc = accuracy_score(y.cpu(),y_hat.argmax(1).detach().cpu())
    return train_loss / global_step, acc

def test(iterator, model, criterion):
    train_loss = 0
    train_acc = 0
    model.eval()
    all_y = []
    all_y_hat = []
    for step, batch in enumerate(tqdm(train_dataloader,'test epoch')):
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0].to(device), "attention_mask": batch[1].to(device),'token_type_ids':batch[2].to(device), "labels": batch[3].to(device)}
        y = inputs['labels'].to(device)
        with torch.no_grad():
            y_hat = model(**inputs)
        loss = criterion(y_hat, y)
        train_loss += loss.item()
        
        all_y.append(y)
        all_y_hat.append(y_hat)
    y = torch.cat(all_y,dim=0)
    y_hat = torch.cat(all_y_hat,dim=0)
    acc = accuracy_score(y.cpu(),y_hat.argmax(1).detach().cpu())
    return train_loss / len(iterator.dataset), acc

In [14]:
def train_n_epochs(model, n, optimizer, scheduler, train_iterator, valid_iterator,gradient_accumulation_steps):

    criterion = nn.CrossEntropyLoss().to(device)
    for epoch in range(n):
        start_time = time.time()
        train_loss, train_acc = fit(train_iterator, model, optimizer, criterion,scheduler,gradient_accumulation_steps)
        valid_loss, valid_acc = test(valid_iterator, model, criterion)

        secs = int(time.time() - start_time)
        mins = secs / 60
        secs = secs % 60

        print('Epoch: %d' % (epoch), " | time in %d minutes, %d seconds" % (mins, secs))
        print(f'\tTrain Loss: {train_loss:.4f}\t|\tAccuracy: {train_acc :.6f}')
        print(f'\tValidation Loss: {valid_loss:.4f}\t|\tAccuracy: {valid_acc:.6f}') 

In [15]:
#TODO YONIGO - maybe clasfifier layer should get bigger learning rate?
def prepare_optimizer(model, t_total, lr=5e-5, adam_epsilon=1e-8, wd=0, warmup_steps=0):
     # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": wd,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
    )
    return optimizer,scheduler

In [16]:
# build dataladers
bs=8
gradient_accumulation_steps=4
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=bs)

test_sampler = RandomSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=bs)

In [17]:
num_lables = len(np.unique(train_df['label'].values))
model = BertWithLClassifier(num_lables).to(device)

In [18]:
num_train_epochs=3
t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs
optimizer,scheduler = prepare_optimizer(model,t_total)
train_n_epochs(model,num_train_epochs,optimizer,scheduler,train_dataloader,test_dataloader,gradient_accumulation_steps)

training epoch: 100%|██████████| 12500/12500 [56:19<00:00,  3.70it/s]
test epoch: 100%|██████████| 12500/12500 [18:20<00:00, 11.35it/s]
training epoch:   0%|          | 0/12500 [00:00<?, ?it/s]

Epoch: 0  | time in 74 minutes, 45 seconds
	Train Loss: 0.1865	|	Accuracy: 0.927950
	Validation Loss: 0.0265	|	Accuracy: 0.971950


training epoch: 100%|██████████| 12500/12500 [56:24<00:00,  3.69it/s]
test epoch: 100%|██████████| 12500/12500 [18:23<00:00, 11.33it/s]
training epoch:   0%|          | 0/12500 [00:00<?, ?it/s]

Epoch: 1  | time in 74 minutes, 54 seconds
	Train Loss: 0.0926	|	Accuracy: 0.968890
	Validation Loss: 0.0092	|	Accuracy: 0.991660


training epoch: 100%|██████████| 12500/12500 [56:25<00:00,  3.69it/s]
test epoch: 100%|██████████| 12500/12500 [18:23<00:00, 11.33it/s]

Epoch: 2  | time in 74 minutes, 54 seconds
	Train Loss: 0.0366	|	Accuracy: 0.990130
	Validation Loss: 0.0033	|	Accuracy: 0.997680





In [20]:
torch.save(model.state_dict(), '.saved_models/bert.pt')

In [24]:
74+75+75

224