In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, random_split
import torch.nn as nn
import torch.nn.functional as F
import csv 
import numpy as np
from tqdm import tqdm
import math
import transformers
from transformers import BertModel, BertTokenizer
from transformers import get_linear_schedule_with_warmup
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare expression: the notebook shows the selected device as the cell output.
device

device(type='cuda')

In [2]:
# Set up logging: INFO level, each record prefixed with a
# "month/day/year hour:minute:second" timestamp.
import logging
logging.basicConfig(
        format="%(asctime)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
)
# Module-level logger; Trainer uses it to report checkpoint saves and test metrics.
logger = logging.getLogger(__name__)

In [5]:
# Name of the pretrained checkpoint. It selects the tokenizer here and, through
# BertConfig.model_name, the BertModel weights loaded by the Bert class.
# NOTE(review): the parameter count recorded later in this notebook (~335M total,
# ~1.05M trainable) looks like bert-large rather than bert-base — confirm which
# checkpoint the saved outputs were actually produced with.
model_name = 'bert-base-uncased'
# large_model_name = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

In [6]:
class DataProcess():
    """Turn a tab-separated ``text<TAB>label`` file into train/validation DataLoaders.

    Workflow: ``encode_data`` tokenizes the file once and caches the resulting
    tensors on disk; ``load_data`` reads the cached tensors back and splits them.

    Args:
        root: path of the TSV file (column 0 = text, column 1 = integer label).
        ratio: fraction of the examples assigned to the training split.
        batch_size: batch size used by both DataLoaders.
    """

    def __init__(self, root, ratio, batch_size):
        self.root = root
        self.ratio = ratio
        self.batch_size = batch_size

    def prepare_data(self):
        """Read ``self.root`` and return ``(text_list, label_list)``.

        Completely empty rows (e.g. a trailing blank line) are skipped.

        Raises:
            IndexError: a non-empty row has fewer than two columns.
            ValueError: the label column is not an integer.
        """
        text_list = []
        label_list = []
        # Read the path stored on the instance (this used to read a notebook
        # global named `root`, which only worked by accident).
        # newline='' lets the csv module do its own line-ending handling.
        with open(self.root, 'r', newline='') as f:
            reader = csv.reader(f, delimiter='\t')
            for line in reader:
                if not line:
                    continue
                text_list.append(line[0])
                label_list.append(int(line[1]))
        return text_list, label_list

    def encode_data(self, text_id_root, labels_root):
        """Tokenize every text and cache the id/label tensors on disk.

        Uses the module-level ``tokenizer``. Each text is truncated and padded
        to the tokenizer's maximum length, so the per-text tensors can be
        concatenated along dim 0.

        Args:
            text_id_root: destination file for the token-id tensor.
            labels_root: destination file for the label tensor.

        Returns:
            ``(all_input_ids, labels)`` — the same tensors that were saved.
        """
        text_list, label_list = self.prepare_data()
        all_input_ids = []
        for text in text_list:
            input_ids = tokenizer.encode(
                            text,
                            add_special_tokens = True,
                            truncation=True,
                            padding = 'max_length',
                            return_tensors = 'pt'
                       )
            all_input_ids.append(input_ids)
        all_input_ids = torch.cat(all_input_ids, dim=0)
        # Float labels: the model's loss is binary cross-entropy with logits,
        # which expects floating-point targets.
        labels = torch.tensor(label_list, dtype=torch.float)
        # Cache the tensors so later runs can skip tokenization.
        torch.save(all_input_ids, text_id_root)
        torch.save(labels, labels_root)
        return all_input_ids, labels

    def load_data(self, text_id_root, labels_root, seed=None):
        """Load the cached tensors and build train/validation DataLoaders.

        Args:
            text_id_root: file written by ``encode_data`` holding the token ids.
            labels_root: file written by ``encode_data`` holding the labels.
            seed: optional integer; when given, the train/validation split is
                reproducible. With ``None`` (the default) the split follows
                the global torch RNG, as before. The shuffling order of the
                training loader is not affected by ``seed``.

        Returns:
            ``(train_dataloader, valid_dataloader)``.
        """
        # torch.load unpickles its input: only point it at files this class wrote.
        all_input_ids = torch.load(text_id_root)
        labels = torch.load(labels_root)
        # Split data into train and validation
        dataset = TensorDataset(all_input_ids, labels)
        train_size = int(self.ratio * len(dataset))
        valid_size = len(dataset) - train_size
        if seed is None:
            train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])
        else:
            split_rng = torch.Generator().manual_seed(seed)
            train_dataset, valid_dataset = random_split(
                dataset, [train_size, valid_size], generator=split_rng)

        # Create train and validation dataloaders
        train_dataloader = DataLoader(train_dataset, batch_size = self.batch_size, shuffle = True)
        valid_dataloader = DataLoader(valid_dataset, batch_size = self.batch_size, shuffle = False)

        return train_dataloader, valid_dataloader

In [7]:
# Data configuration for DataProcess.
ratio = 0.8  # fraction of examples used for training; the rest is validation
batch_size = 32  # batch size of both DataLoaders
root = "data/all_data_1014.tsv"  # raw TSV: text in column 0, integer label in column 1
text_id_root = "data/text_id_tensor.pt"  # cache file for the token-id tensor
labels_root = "data/labels_tensor.pt"  # cache file for the label tensor

In [8]:
# Data helper bound to the raw TSV path, the split ratio and the batch size.
processor = DataProcess(root, ratio, batch_size)

In [9]:
# all_input_ids, labels = processor.encode_data(text_id_root, labels_root)

In [10]:
# Load the cached tensors from disk and split them into train/validation loaders.
# The cache files must already exist (they are written by processor.encode_data),
# and the split is random, so it differs between runs unless the RNG is seeded.
train_dataloader, valid_dataloader = processor.load_data(text_id_root ,labels_root)

In [11]:
# len() of a DataLoader is its number of batches, not its number of examples.
print("Num of train_dataloader: ", len(train_dataloader))
print("Num of valid_dataloader: ", len(valid_dataloader))

Num of train_dataloader:  4751
Num of valid_dataloader:  528


In [12]:
class Bert(nn.Module):
    """Pretrained BERT encoder with dropout and a linear head on the pooled output.

    Args:
        config: object exposing ``model_name`` (checkpoint name passed to
            ``BertModel.from_pretrained``), ``dropout_prob`` and ``output_size``.
    """

    def __init__(self, config):
        super().__init__()
        self.bert = BertModel.from_pretrained(config.model_name)
        # Input width of the pooler's dense layer, i.e. the size of the
        # features fed into the classification head.
        self.hidden_size = self.bert.pooler.dense.weight.shape[1]
        self.dropout = nn.Dropout(config.dropout_prob)
        self.fc = nn.Linear(self.hidden_size, config.output_size)

    def configure_optimizers(self, train_config):
        """Return an AdamW optimizer over all parameters of this module.

        Only ``learning_rate`` and ``betas`` are read from ``train_config``.
        NOTE(review): ``train_config.weight_decay`` is not forwarded, so
        ``torch.optim.AdamW`` falls back to its own default weight decay for
        every parameter — confirm whether that is intended.
        """
        optimizer = torch.optim.AdamW(self.parameters(), lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

    def forward(self, input_ids, labels=None, token_type_ids=None, attention_mask=None):
        """Compute logits and, when ``labels`` is given, the BCE-with-logits loss.

        Args:
            input_ids: token ids, one row per example.
            labels: optional float targets matching the shape of the logits.
            token_type_ids: optional segment ids forwarded to BERT.
            attention_mask: optional attention mask forwarded to BERT.

        Returns:
            ``y_pred`` when ``labels`` is None, otherwise ``(y_pred, loss)``.
            ``y_pred`` holds raw logits; its last dimension is squeezed away
            when ``output_size`` is 1, giving shape ``[batch_size]``.
        """
        # Pass by keyword: BertModel.forward takes attention_mask BEFORE
        # token_type_ids, so the previous positional call swapped the two.
        # NOTE(review): the callers in this notebook pass no attention_mask even
        # though inputs are padded to a fixed length, so padding positions are
        # attended to. Supplying a mask would change the outputs of existing
        # checkpoints, so that is left as a deliberate follow-up.
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Index 1 is the pooled output both for the tuple return and for the
        # ModelOutput return (unpacking a ModelOutput would yield its keys).
        pooled_output = outputs[1]
        x = self.dropout(pooled_output)
        y_pred = self.fc(x).squeeze(-1)
        if labels is not None:
            loss = F.binary_cross_entropy_with_logits(y_pred, labels)
            return y_pred, loss
        else:
            return y_pred

In [13]:
class BertConfig:
    """Hyper-parameters read by the Bert classifier.

    ``dropout_prob`` is a class-level default. Every extra keyword argument is
    attached to the instance as an attribute, so it can also override a default.
    """
    dropout_prob = 0.1

    def __init__(self, output_size, model_name, **kwargs):
        self.output_size, self.model_name = output_size, model_name
        # Attach each additional option under its own name.
        for option in kwargs:
            setattr(self, option, kwargs[option])

In [14]:
class Trainer:
    """Minimal training loop for a binary classifier that returns ``(logits, loss)``.

    Args:
        model: module whose ``forward(x, y)`` returns ``(y_pred, loss)`` and
            which provides ``configure_optimizers(config)``.
        train_loader: sized iterable of ``(x, y)`` training batches.
        test_loader: iterable of ``(x, y)`` evaluation batches, or ``None``.
        config: object with ``max_epochs``, ``learning_rate``,
            ``grad_norm_clip``, ``lr_decay`` and ``ckpt_path`` (plus
            ``warmup_tokens`` / ``final_tokens`` when ``lr_decay`` is true).
    """

    def __init__(self, model, train_loader, test_loader, config):
        self.model = model
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.config = config

        # take over whatever gpus are on the system
        self.device = 'cpu'
        if torch.cuda.is_available():
            self.device = torch.cuda.current_device()
            self.model = torch.nn.DataParallel(self.model).to(self.device)

    def save_checkpoint(self):
        """Save the state dict of the unwrapped model to ``config.ckpt_path``."""
        # DataParallel wrappers keep raw model object in .module attribute
        raw_model = self.model.module if hasattr(self.model, "module") else self.model
        logger.info("saving %s", self.config.ckpt_path)
        torch.save(raw_model.state_dict(), self.config.ckpt_path)

    def binary_accuracy(self, preds, y):
        """Return the fraction of logits ``preds`` whose rounded sigmoid equals ``y``."""
        rounded_preds = torch.round(torch.sigmoid(preds))
        correct = (rounded_preds == y).float()
        acc = correct.sum() / len(correct)
        return acc

    def train(self):
        """Train for ``config.max_epochs`` epochs, evaluating after each one.

        With a test loader, a checkpoint is written whenever the test loss
        improves; without one, a checkpoint is written after every epoch.
        Nothing is saved when ``config.ckpt_path`` is ``None``.
        """
        model, config = self.model, self.config
        raw_model = model.module if hasattr(model, "module") else model
        optimizer = raw_model.configure_optimizers(config)

        def run_epoch(split):
            """Run one pass over the chosen split; return the mean loss for 'test'."""
            is_train = split == 'train'
            model.train(is_train)
            loader = self.train_loader if is_train else self.test_loader

            losses = []
            # Collected during evaluation only. Gathering logits while training
            # would keep every batch's autograd graph alive for the whole epoch.
            eval_targets = []
            eval_logits = []
            pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader)
            for it, (x, y) in pbar:
                # place data on the correct device
                x = x.to(self.device)
                y = y.to(self.device)
                # forward the model
                with torch.set_grad_enabled(is_train):
                    y_pred, loss = model(x, y)
                    loss = loss.mean() # collapse all losses if they are scattered on multiple gpus
                    losses.append(loss.item())

                if not is_train:
                    eval_targets.append(y.detach().cpu())
                    eval_logits.append(y_pred.detach().cpu())
                    continue

                step_score = self.binary_accuracy(y_pred.detach(), y).item()

                # backprop and update the parameters
                model.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
                optimizer.step()

                # decay the learning rate based on our progress
                if config.lr_decay:
                    # Progress counter: number of entries of y that are >= 0 in
                    # this batch. Kept as a plain Python int rather than a
                    # device tensor.
                    self.tokens += int((y >= 0).sum().item())
                    if self.tokens < config.warmup_tokens:
                        # linear warmup
                        lr_mult = float(self.tokens) / float(max(1, config.warmup_tokens))
                    else:
                        # cosine learning rate decay, floored at 10% of the base rate
                        progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens))
                        lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))
                    lr = config.learning_rate * lr_mult
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr
                else:
                    lr = config.learning_rate

                # report progress (`epoch` comes from the enclosing loop below)
                pbar.set_description(f"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}. score {step_score:.5f}. lr {lr:e}")

            if not is_train:
                test_loss = float(np.mean(losses))
                test_score = self.binary_accuracy(
                    torch.cat(eval_logits, dim=0), torch.cat(eval_targets, dim=0))
                logger.info("test loss: %f", test_loss)
                logger.info("test score: %f", test_score)
                return test_loss

        self.tokens = 0 # counter used for learning rate decay
        best_loss = float('inf')
        for epoch in range(config.max_epochs):

            run_epoch('train')

            # Save on improvement of the test loss, or always when there is no
            # test set. `test_loss` is only touched when it exists; previously
            # the no-test-set path raised NameError on `best_loss = test_loss`.
            if self.test_loader is not None:
                test_loss = run_epoch('test')
                good_model = test_loss < best_loss
                if good_model:
                    best_loss = test_loss
            else:
                good_model = True

            if config.ckpt_path is not None and good_model:
                self.save_checkpoint()

In [36]:
class TrainerConfig:
    """Training hyper-parameters.

    The class attributes are the defaults; keyword arguments passed to the
    constructor override them per instance and are echoed to stdout.
    """
    # --- optimization ---
    max_epochs = 10
    learning_rate = 3e-4
    betas = (0.9, 0.95)
    grad_norm_clip = 1.0  # max gradient norm used for clipping
    weight_decay = 0.1  # NOTE(review): not read by the optimizer setup visible in this notebook
    # --- learning-rate schedule: linear warmup, then cosine decay down to 10% ---
    lr_decay = False
    warmup_tokens = 375e6  # schedule-counter value at which warmup ends
    final_tokens = 260e9  # schedule-counter value at which the 10% floor is reached
    # --- checkpointing / data loading ---
    ckpt_path = 'bert-model.pt'
    num_workers = 0  # intended for DataLoader

    def __init__(self, **kwargs):
        for name in kwargs:
            value = kwargs[name]
            # Echo each override so the cell output records the run's settings.
            print(name, value)
            setattr(self, name, value)

In [37]:
# Width of the classification head: one logit per example (binary classification).
output_size = 1

In [38]:
# Model hyper-parameters: head width plus the name of the pretrained checkpoint.
mconf = BertConfig(output_size, model_name)

In [39]:
# Build the classifier (loads the pretrained BERT weights) and move it to `device`.
model = Bert(mconf).to(device)

In [40]:
# model

In [41]:
# Freeze BERT's embeddings and encoder so that only the remaining parameters
# (everything outside these two sub-modules) receive gradient updates.
for frozen_module in (model.bert.embeddings, model.bert.encoder):
    for par in frozen_module.parameters():
        par.requires_grad = False

In [42]:
# Report the total and the trainable parameter counts, in millions.
# NOTE(review): '{:4f}' sets a minimum field WIDTH of 4 with the default six
# decimals; '{:.4f}' may have been intended — confirm before changing the output.
print('{} : all params: {:4f}M'.format(model._get_name(), sum(p.numel() for p in model.parameters()) / 1000 / 1000))
print('{} : need grad params: {:4f}M'.format(model._get_name(), sum(p.numel() for p in model.parameters() if p.requires_grad) / 1000 / 1000))

Bert : all params: 335.142913M
Bert : need grad params: 1.050625M


In [43]:
# model.load_state_dict(torch.load("bert-model.pt"))

In [44]:
# Training run settings. The Trainer's schedule counter grows by the number of
# labels >= 0 in each batch, so warmup_tokens / final_tokens are expressed in
# that unit: warmup lasts 32*200 counts and the cosine decay ends after
# batch_size * len(train_dataloader) counts.
# NOTE(review): num_workers is stored on the config, but the Trainer shown in this
# notebook never reads it — confirm whether it is still needed.
tconf = TrainerConfig(max_epochs=1, learning_rate=6e-4, lr_decay=True, 
                      warmup_tokens=32*200, final_tokens=1*batch_size*len(train_dataloader),
                      num_workers=1)

max_epochs 1
learning_rate 0.0006
lr_decay True
warmup_tokens 6400
final_tokens 152032
num_workers 1


In [45]:
# Bind model, data loaders and settings. When CUDA is available the Trainer wraps
# the model in DataParallel and moves it to the current CUDA device.
trainer = Trainer(model, train_dataloader, valid_dataloader, tconf)

In [47]:
# trainer.train()

In [48]:
class Predict:
    """Inference helper that scores raw text with a trained classifier.

    Relies on the module-level ``tokenizer`` and ``device``.

    Args:
        model: callable returning logits for a batch of token ids.
    """

    def __init__(self, model):
        self.model = model

    def predict(self, text):
        """Return the sigmoid probability (a Python float) for one text."""
        input_ids = tokenizer.encode(
                        text,
                        add_special_tokens = True,
                        truncation=True,
                        padding = 'max_length',
                        return_tensors = 'pt'
                   ).to(device)
        self.model.eval()
        # Inference only: without no_grad every call would build an autograd graph.
        with torch.no_grad():
            logits = self.model(input_ids)
        # The batch holds a single example; take its logit.
        return torch.sigmoid(logits[0]).item()

    def count_acc(self, text_list, local):
        """Score every text and return ``(probabilities, accuracy)``.

        Args:
            text_list: iterable of texts that all share the same expected class.
            local: when truthy, a probability above 0.5 counts as correct;
                otherwise a probability below 0.5 counts as correct. A value
                of exactly 0.5 is never counted as correct.

        Returns:
            A float tensor of probabilities and the fraction counted as
            correct (``0.0`` for an empty ``text_list``).
        """
        result = torch.tensor([self.predict(text) for text in text_list], dtype = torch.float)
        if len(result) == 0:
            # Nothing to score; avoids the crash the old sum()-based code had here.
            return result, 0.0
        hits = (result > 0.5) if local else (result < 0.5)
        acc = hits.sum().item() / len(result)
        return result, acc
        

In [49]:
# model.load_state_dict(torch.load("bert-model.pt"))

In [50]:
# Inference helper around the in-memory model. The checkpoint-loading cell above
# is commented out, so this uses whatever weights `model` currently holds.
predict = Predict(model)

In [51]:
# Collect the text column (index 3) of every row in the tab-separated test file.
test_text = []
# newline='' lets the csv module do its own line-ending handling, as its
# documentation recommends for files passed to csv.reader.
with open('data/test_data.tsv', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    for line in reader:
        test_text.append(line[3])

In [52]:
# Write one predicted probability per line. The with-block guarantees the file is
# flushed and closed; previously the handle was left open, so the tail of the
# output could stay in the buffer.
with open('bert-predict.tsv', 'w') as fout:
    for text in test_text:
        prob = predict.predict(text)
        fout.write('{}\n'.format(prob))

In [55]:
# Number of test texts that were loaded.
len(test_text)

10000

In [62]:
# Spot-check a single example. Index 9999 is the last one given the 10000 texts
# reported above; it raises IndexError if the test file has fewer rows.
text = test_text[9999]
text

"French Critics Slam Netflix 's Emily In Paris . The latest Netflix show Emily In Paris has been talked a lot since its release . It tells a story about Emily , played by Lily Collins , an American woman who gets a once in a lifetime job opportunity to move to Paris and work at a prestigious French marketing firm . Initially , it takes time for her to settle down due to cultural differences and later finds herself in a love triangle . The show has been criticised for different reasons as well including the outfits . The fashion side of the Twitter does not seem to agree with clothing choices worn by the main character . Most of the outfits look like they were borrowed from CW 's series from 2012 . Coming to another criticism , it turns out that the execution portrayed French stereotypes and French viewers are not impressed with it . Throughout the series , Emily has been portrayed as a typical American girl who falls victim to French people 's behaviour . If you go to see , she does no

In [63]:
# Sigmoid probability that the model assigns to the sample text.
predict.predict(text)

0.0001725618349155411

In [64]:
# Persist the current weights of the whole classifier (BERT + head).
# NOTE(review): the file name says "bert-large" while model_name is set to
# 'bert-base-uncased' in this notebook — confirm which checkpoint this really is.
torch.save(model.state_dict(), "bert-large-model")