In [7]:
# kind of a simplidied version of glue.py and run_glue.py from huggingface

In [2]:
import fasttext
import torchtext
import pandas as pd
from torchtext import datasets
import time
import torch as torch
import numpy as np
import torch.nn as nn
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [4]:
# bert imports
# To import BertTokenizerFast i must install from src
from transformers.data.processors.utils import InputFeatures
from transformers import BertForSequenceClassification,BertModel,BertConfig,BertTokenizerFast,BertTokenizer,AdamW,get_linear_schedule_with_warmup

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [6]:
# Download yelp  polarity
torchtext.utils.download_from_url(datasets.text_classification.URLS['YelpReviewPolarity'])
!tar -C .data -xvf .data/yelp_review_polarity_csv.tar.gz

x yelp_review_polarity_csv/
x yelp_review_polarity_csv/readme.txt
x yelp_review_polarity_csv/test.csv
x yelp_review_polarity_csv/train.csv


In [7]:
train_df = pd.read_csv('.data/yelp_review_polarity_csv/train.csv',names=['label','text'])
test_df = pd.read_csv('.data/yelp_review_polarity_csv/test.csv',names=['label','text'])

In [8]:
bert_model_name='bert-base-uncased'
bs=32

In [9]:
#TODO YONIGO - where is this cached?
config = BertConfig.from_pretrained(bert_model_name)

In [10]:
def convert_examples_to_features(examples,
                                 labels,
                                 tokenizer,          
                                 max_length=512,
                                 task=None,
                                 label_list=None,
                                 output_mode=None,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 mask_padding_with_zero=True,):
    features = []
    label_map = {label: i for i, label in enumerate(np.unique(labels))}
    for (ex_index, (example,label)) in enumerate(zip(tqdm(examples),labels)):
        inputs = tokenizer.encode_plus(example,add_special_tokens = True,max_length=max_length)
        input_ids, token_type_ids, attention_mask = inputs["input_ids"], inputs["token_type_ids"], inputs['attention_mask']
        padding_length = max_length - len(input_ids)
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        features.append(InputFeatures(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label_map[label]))
    return features

In [13]:
tokenizer = BertTokenizerFast.from_pretrained(bert_model_name, do_lower_case=True,cache_dir='.cache')
features = convert_examples_to_features(train_df['text'].values,train_df['label'].values,tokenizer)

 73%|███████▎  | 411091/560000 [03:20<01:07, 2196.47it/s]

KeyboardInterrupt: 

In [186]:
from tokenizers import (ByteLevelBPETokenizer,
                        BPETokenizer,
                        SentencePieceBPETokenizer,
                        BertWordPieceTokenizer)
tokenizer = BertWordPieceTokenizer(lowercase=True)

In [195]:
tokenizer.vocab_files_names??

In [142]:
# Convert to Tensors and build dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
train_dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)

In [143]:
# build dataladers
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=bs)

In [147]:
# instead of using BertForSequenceClassification I wanted to implement a network myself
class BertWithLClassifier(nn.Module):
    def __init__(self):
        super().__init__()

        self.bert = BertModel.from_pretrained(bert_model_name,)
        self.config = bert.config
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        nn.init.xavier_normal_(self.classifier.weight)
        nn.init.zeros_(self.classifier.bias)
        
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        return logits

In [161]:
# fit/test functions
def fit(iterator, model, optimizer, criterion,scheduler):
    train_loss = 0
    train_acc = 0
    model.train()
    all_y = []
    all_y_hat = []
    for step, batch in enumerate(tqdm(train_dataloader,'training epoch')):
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "attention_mask": batch[1],'token_type_ids':batch[2], "labels": batch[3]}
        optimizer.zero_grad()
        y = inputs['labels']
        y_hat = model(**inputs)
        loss = criterion(y_hat, y)
        train_loss += loss.item()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        #TODO YONIGO - whys is this not in epoch granularity?
        scheduler.stop()
        
        all_y.append(y)
        all_y_hat.append(y_hat)
        
    y = torch.cat(all_y,dim=0)
    y_hat = torch.cat(all_y_hat,dim=0)
    acc = accuracy_score(y.cpu(),y_hat.argmax(1).detach().cpu())
    return train_loss / len(iterator.dataset), acc

def test(iterator, model, criterion):
    train_loss = 0
    train_acc = 0
    model.eval()
    all_y = []
    all_y_hat = []
    for step, batch in enumerate(tqdm(train_dataloader,'test epoch')):
        batch = tuple(t.to(device) for t in batch)
        inputs = {"input_ids": batch[0], "attention_mask": batch[1],'token_type_ids':batch[2], "labels": batch[3]}
        y = inputs['labels']
        with torch.no_grad():
            y_hat = model(**inputs)
        loss = criterion(y_hat, y)
        train_loss += loss.item()
        
        all_y.append(y)
        all_y_hat.append(y_hat)
    y = torch.cat(all_y,dim=0)
    y_hat = torch.cat(all_y_hat,dim=0)
    acc = accuracy_score(y.cpu(),y_hat.argmax(1).detach().cpu())
    return train_loss / len(iterator.dataset), acc

In [162]:
def train_n_epochs(model, n, optimizer, scheduler, train_iterator, valid_iterator):

    criterion = nn.CrossEntropyLoss().to(device)
    for epoch in range(n):
        start_time = time.time()
        train_loss, train_acc = fit(train_iterator, model, optimizer, criterion,scheduler)
        valid_loss, valid_acc = test(valid_iterator, model, criterion)

        secs = int(time.time() - start_time)
        mins = secs / 60
        secs = secs % 60

        print('Epoch: %d' % (epoch), " | time in %d minutes, %d seconds" % (mins, secs))
        print(f'\tTrain Loss: {train_loss:.4f}\t|\tAccuracy: {train_acc :.6f}')
        print(f'\tValidation Loss: {valid_loss:.4f}\t|\tAccuracy: {valid_acc:.6f}') 

In [163]:
def prepare_optimizer(model, t_total, lr=5e-5, adam_epsilon=1e-8, wd=0, warmup_steps=0):
     # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": wd,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
    )
    return optimizer,scheduler

In [164]:
model = BertWithLClassifier()
num_train_epochs=2
t_total = len(train_dataloader) // 1 * num_train_epochs
optimizer,scheduler = prepare_optimizer(model,t_total)
train_n_epochs(model,num_train_epochs,optimizer,scheduler,train_dataloader,train_dataloader)

training epoch: 100%|██████████| 1/1 [00:11<00:00, 11.00s/it]
test epoch: 100%|██████████| 1/1 [00:02<00:00,  2.82s/it]
training epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch: 0  | time in 0 minutes, 13 seconds
	Train Loss: 0.2515	|	Accuracy: 0.600000
	Validation Loss: 0.0809	|	Accuracy: 0.800000


training epoch: 100%|██████████| 1/1 [00:10<00:00, 10.65s/it]
test epoch: 100%|██████████| 1/1 [00:02<00:00,  2.97s/it]

Epoch: 1  | time in 0 minutes, 13 seconds
	Train Loss: 0.1075	|	Accuracy: 0.600000
	Validation Loss: 0.0781	|	Accuracy: 0.800000





In [128]:
batch = next(iter(train_dataloader))


In [150]:
for step, batch in enumerate(tqdm(train_dataloader)):
    batch = tuple(t.to(device) for t in batch)
    inputs = {"input_ids": batch[0], "attention_mask": batch[1],'token_type_ids':batch[2], "labels": batch[3]}
    y_hat = model(**inputs)
    print(y_hat)

100%|██████████| 1/1 [00:05<00:00,  5.32s/it]

tensor([[-0.1611,  1.0176],
        [-0.0727,  1.0819],
        [-0.2314,  1.4416],
        [-0.3342,  1.3276],
        [-0.4491,  0.4912]], grad_fn=<AddmmBackward>)





In [136]:
#outputs = model(**inputs)

In [1]:
from transformers import BertTokenizerFast