In [12]:
import torch
import csv 
from torch.utils.data import TensorDataset, DataLoader, random_split
import torch.nn as nn
import torch.nn.functional as F
import transformers
# from transformers import BertModel, BertTokenizer
# from transformers import AdamW
# from transformers import get_linear_schedule_with_warmup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [69]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)

In [70]:
def prepare_data(filename):
    text_list = []
    label_list = []
    f = open(filename, 'r')
    reader = csv.reader(f, delimiter='\t')
    for line in reader:
        text_list.append(line[0])
        label_list.append(int(line[1]))
    return text_list, label_list

In [71]:
# Function to get token ids for a list of texts 
def encode_fn(text_list):
    all_input_ids = []    
    for text in text_list:
        input_ids = tokenizer.encode(
                        text,                      
                        add_special_tokens = True,  
                        max_length = 512,           
                        truncation=True,
                        pad_to_max_length = True,     
                        return_tensors = 'pt'       
                   )
        all_input_ids.append(input_ids)    
    all_input_ids = torch.cat(all_input_ids, dim=0)
    return all_input_ids

In [72]:
def split_data(ratio, batch_size, filename):
    
    text_list, label_list = prepare_data(filename)
    all_input_ids = encode_fn(text_list)
    labels = torch.tensor(label_list, dtype=torch.float)
    # Split data into train and validation
    dataset = TensorDataset(all_input_ids, labels)
    train_size = int(ratio * len(dataset))
    valid_size = len(dataset) - train_size
    train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])

    # Create train and validation dataloaders
    train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
    valid_dataloader = DataLoader(valid_dataset, batch_size = batch_size, shuffle = False)
    
    return train_dataloader, valid_dataloader

In [73]:
ratio = 0.8
batch_size = 2
filename = "head.tsv"
train_dataloader, valid_dataloader = split_data(ratio, batch_size, filename)

In [74]:
print(len(train_dataloader))
print(len(valid_dataloader))

8
2


In [75]:
class Bert(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.output_size = config.output_size
        self.bert = config.bert_model
        self.dropout = nn.Dropout(config.dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.output_size)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        print(input_ids.shape)
        print(labels.shape)
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,)
        # pooled_output: [batch_size, dim=768]
        pooled_output = self.dropout(pooled_output)
        print(pooled_output.shape)
        logits = self.classifier(pooled_output)
        print(logits.shape)
        logits = logits.squeeze(-1)
        print(logits.shape)
        # logits: [batch_size, output_dim]
        if labels is not None:
            loss = F.binary_cross_entropy_with_logits(logits, labels)
            return loss, logits
        else:
            return logits

In [76]:
class BertConfig:
    """ base GPT config, params common to all GPT versions """
    dropout_prob = 0.1

    def __init__(self, hidden_size, output_size, bert_model, **kwargs):
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.bert_model = bert_model
        for k, v in kwargs.items():
            setattr(self, k, v)

In [77]:
HIDDEN_SIZE = 768
OUTPUT_SIZE = 1

In [78]:
mconf = BertConfig(HIDDEN_SIZE, OUTPUT_SIZE, bert_model)

In [79]:
model = Bert(mconf)

In [80]:
# model

In [81]:
for par in model.parameters():
    par.requires_grad = False
model.classifier.weight.requires_grad = True

In [82]:
print('{} : all params: {:4f}M'.format(model._get_name(), sum(p.numel() for p in model.parameters()) / 1000 / 1000))
print('{} : need grad params: {:4f}M'.format(model._get_name(), sum(p.numel() for p in model.parameters() if p.requires_grad) / 1000 / 1000))

Bert : all params: 109.483009M
Bert : need grad params: 0.000768M


In [83]:
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [13]:
for epoch in range(epochs):
    model.train()
    total_loss, total_val_loss = 0, 0
    total_eval_accuracy = 0
    for step, batch in enumerate(train_dataloader):
        model.zero_grad()
        loss, logits = model(batch[0].to(device), token_type_ids=None, attention_mask=None, labels=batch[1].to(device))
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step() 
        scheduler.step()
        
    model.eval()
    for i, batch in enumerate(valid_dataloader):
        with torch.no_grad():
            loss, logits = model(batch[0].to(device), token_type_ids=None, attention_mask=None, labels=batch[1].to(device))
                
            total_val_loss += loss.item()
            
            logits = logits.detach().cpu().numpy()
            label_ids = batch[1].to('cpu').numpy()
            total_eval_accuracy += flat_accuracy(logits, label_ids)
    avg_train_loss = total_loss / len(train_dataloader)
    avg_val_loss = total_val_loss / len(val_dataloader)
    avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
    
    print(f'Train loss     : {avg_train_loss}')
    print(f'Validation loss: {avg_val_loss}')
    print(f'Accuracy: {avg_val_accuracy:.2f}')
    print('\n')