In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt

pip install pytorch-transformers

In [2]:
import torch
import torch.nn as nn
import pickle
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from tqdm import tqdm_notebook, trange
import os
from pytorch_transformers import BertConfig, BertTokenizer, BertModel
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule

from torch.utils.data import Dataset, DataLoader

In [3]:
import numpy as np
import torch.optim as optim
from torch.optim import lr_scheduler
import time
import copy
import torch.nn.functional as F

In [4]:
class BertForSequenceClassification(nn.Module):
    """BERT model for classification.
    This module is composed of the BERT model with a linear layer on top of
    the pooled output.
    """
    def __init__(self, num_labels=1):
        super(BertForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        nn.init.xavier_normal_(self.classifier.weight)
        
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        outputs = self.bert(input_ids, token_type_ids, attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits
    
    def freeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = False
    
    def unfreeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = True

In [5]:
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

In [6]:
num_labels = 1
model = BertForSequenceClassification(num_labels)

In [7]:
from pathlib import Path
PATH = Path("/home/yinterian/data/aclImdb/")

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [10]:
path = PATH/"train/pos/0_9.txt"
z = tokenizer.tokenize(path.read_text())
z[:10]

['bro', '##m', '##well', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it']

In [11]:
ids = tokenizer.convert_tokens_to_ids(z)
ids[:10]

[22953, 2213, 4381, 2152, 2003, 1037, 9476, 4038, 1012, 2009]

In [12]:
tokens_tensor = torch.tensor([ids])

In [13]:
logits = model(tokens_tensor)

In [14]:
logits 

tensor([[-0.5909]], grad_fn=<AddmmBackward0>)

Based on these tutorials
* https://pytorch.org/hub/huggingface_pytorch-pretrained-bert_bert/
* https://github.com/huggingface/pytorch-transformers/blob/master/README.md
* https://medium.com/huggingface/multi-label-text-classification-using-bert-the-mighty-transformer-69714fa3fb3d
* https://towardsdatascience.com/bert-classifier-just-another-pytorch-model-881b3cf05784

In [15]:
def text2ids(text, max_seq_length=300):
    tok_text = tokenizer.tokenize(text)
    if len(tok_text) > max_seq_length:
            tok_text = tok_text[:max_seq_length]
    ids_text  = tokenizer.convert_tokens_to_ids(tok_text)
    padding = [0] * (max_seq_length - len(ids_text))
    ids_text += padding
    return np.array(ids_text)

In [16]:
text2ids(path.read_text())

array([22953,  2213,  4381,  2152,  2003,  1037,  9476,  4038,  1012,
        2009,  2743,  2012,  1996,  2168,  2051,  2004,  2070,  2060,
        3454,  2055,  2082,  2166,  1010,  2107,  2004,  1000,  5089,
        1000,  1012,  2026,  3486,  2086,  1999,  1996,  4252,  9518,
        2599,  2033,  2000,  2903,  2008, 22953,  2213,  4381,  2152,
        1005,  1055, 18312,  2003,  2172,  3553,  2000,  4507,  2084,
        2003,  1000,  5089,  1000,  1012,  1996, 25740,  2000,  5788,
       13732,  1010,  1996, 12369,  3993,  2493,  2040,  2064,  2156,
        2157,  2083,  2037, 17203,  5089,  1005, 13433,  8737,  1010,
        1996,  9004, 10196,  4757,  1997,  1996,  2878,  3663,  1010,
        2035, 10825,  2033,  1997,  1996,  2816,  1045,  2354,  1998,
        2037,  2493,  1012,  2043,  1045,  2387,  1996,  2792,  1999,
        2029,  1037,  3076,  8385,  2699,  2000,  6402,  2091,  1996,
        2082,  1010,  1045,  3202,  7383,  1012,  1012,  1012,  1012,
        1012,  1012,

In [17]:
class ImdbDataset(Dataset):
    def __init__(self, PATH, train="train"):
        self.path_to_images = PATH/train
        self.pos_files = list((self.path_to_images/"pos").iterdir())
        self.neg_files = list((self.path_to_images/"neg").iterdir()) 
        self.files = self.pos_files + self.neg_files
        self.y = np.concatenate((np.ones(len(self.pos_files), dtype=int),
                                np.zeros(len(self.neg_files), dtype=int)), axis=0)
        
    def __getitem__(self, index):
        path = self.files[index]
        x = text2ids(path.read_text())
        return x, self.y[index]
    
    def __len__(self):
        return len(self.y)

In [18]:
train_ds = ImdbDataset(PATH)
valid_ds = ImdbDataset(PATH, "test")

In [19]:
batch_size = 10
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [20]:
x, y = train_ds[0]

In [21]:
x, y = next(iter(train_dl))

In [22]:
x[3]

tensor([ 2004, 15444,  2890,  1998,  7369,  2012,  1996,  4578,  1997,  2037,
         6217,  1012,  1999,  4266,  4841,  2245,  1997,  1996,  3212,  2004,
         1037,  2173,  2005,  2299,  1998,  3153,  1012, 25755,  2001,  2145,
         1037,  2261,  2086,  2185,  1012,  5965,  1998, 14580,  3153,  2039,
         1996,  2237,  1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,
         1028,  1996,  5436,  2003, 11519,  1010,  2021,  2040, 14977,  1012,
         1012,  1012,  2011,  1996,  2126,  1010,  5060,  1996, 12081,  4395,
         2005,  9306,  6723,  2571,  1998,  1037,  1043, 10278, 25373, 12776,
         9463,  3608,  1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,
         1028,  1037,  7170,  1997, 12415,  4068,  2774,  1010,  2164,  1996,
         3297,  1000,  2292,  1005,  1055,  2227,  1996,  2189,  1998,  3153,
         1000,  1012,  1999,  2008,  3496,  1010, 14580,  1005,  1055,  3082,
        25430, 29046,  4377, 21526,  2015,  5965,  1999,  1996, 

In [23]:
def train_model(model, optimizer, num_epochs=25):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for x, y in train_dl:
            x = x.cuda()
            y = y.unsqueeze(1).float().cuda()
            optimizer.zero_grad()
            logits = model(x)
            loss = F.binary_cross_entropy_with_logits(logits, y)            
            loss.backward()
            optimizer.step()
                
            running_loss += loss.item() * x.size(0)
        epoch_loss = running_loss / len(train_ds)
        val_loss, accuracy = eval_model(model)
        print('train loss: {:.3f}, valid loss {:.3f} accuracy {:.3f}'.format(
            epoch_loss, val_loss, accuracy))

In [24]:
def eval_model(model):
    model.eval()
    running_loss = 0.0
    correct = 0
    for x, y in valid_dl:
        x = x.cuda()
        y = y.unsqueeze(1).float().cuda()
        logits = model(x)
        loss = F.binary_cross_entropy_with_logits(logits, y) 
        y_pred = logits > 0
        correct += (y_pred.float() == y).float().sum()
        running_loss += loss.item() * x.size(0)
    accuracy = correct / len(valid_ds)
    epoch_loss = running_loss / len(valid_ds)
    return epoch_loss, accuracy.item() 

In [25]:
model = model.cuda()

In [26]:
lrlast = .0001
lrmain = .00001
optimizer = optim.Adam(
    [
        {"params":model.bert.parameters(),"lr": lrmain},
        {"params":model.classifier.parameters(), "lr": lrlast},
       
   ])

In [27]:
train_model(model, optimizer, num_epochs=2)

train loss: 0.286, valid loss 0.201 accuracy 0.922
train loss: 0.166, valid loss 0.210 accuracy 0.922
