In [154]:
import pickle
import numpy as np
import pandas as pd
from torch.utils.data import Dataset,DataLoader,RandomSampler,SequentialSampler,TensorDataset
from pytorch_pretrained_bert.modeling import BertForSequenceClassification,BertForMaskedLM
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_util import *
from sklearn.metrics import roc_auc_score

In [139]:
batch_size = 16
num_train_epochs = 4
max_len = 64

In [140]:
with open('../Data/precessed_reviews.pickle', 'rb') as fp:
    precessed_reviews = pickle.load(fp)
with open('../Data/targets.pickle', 'rb') as fp:
    targets = pickle.load(fp) 
with open('../Data/precessed_reviews_test.pickle', 'rb') as fp:
    precessed_reviews_test = pickle.load(fp)  
with open('../Data/precessed_reviews_unlabeled.pickle', 'rb') as fp:
    precessed_reviews_unlabeled = pickle.load(fp)     

In [146]:
LM_text = precessed_reviews + precessed_reviews_test + precessed_reviews_unlabeled

In [166]:
class TextGenerator(Dataset):
    def __init__(self, reviews,targets,max_len=64):
        self.reviews = reviews
        self.targets = targets
        self.max_len = max_len
        self.length = [len(r)+2 for r in reviews]#+2 for CLS and SEP
        self.segment_ids = np.zeros(max_len,dtype=np.long)
        self.CLS = [101]
        self.SEP = [102]

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, index):
        review = self.reviews[index]
        len_ = self.length[index]
        if len_ < self.max_len:
            padding = [0] * (self.max_len - len_)
            input_mask = [1] * len_
            input_ids = self.CLS + review + self.SEP + padding
            input_mask += padding
        elif len_ == self.max_len:
            input_ids = self.CLS + review + self.SEP
            input_mask = [1] * len_
        else:
            start = np.random.randint(0,len_-self.max_len+1)
            input_ids = self.CLS + review[start:start+self.max_len-2] + self.SEP
            input_mask = [1] * self.max_len
        return np.array(input_ids,dtype=np.long),np.array(input_mask,dtype=np.long),\
                self.segment_ids,self.targets[index]

In [91]:
def random_word(tokens):
    MASK = 103
    output_label = []

    for i, token in enumerate(tokens):
        prob = np.random.rand()
        # mask token with 15% probability
        if prob < 0.15:
            prob = prob/0.15
            
            # append current token to output (we will predict these later)
            output_label.append(token)
            
            # 80% randomly change token to mask token
            if prob < 0.8:
                tokens[i] = MASK

            # 10% randomly change token to random token, token goes from 0 to 30521
            elif prob < 0.9:
                tokens[i] = np.random.randint(0,30522)

            # -> rest 10% randomly keep current token
        else:
            # no masking token (will be ignored by loss function later)
            output_label.append(-1)

    return tokens, output_label

In [138]:
class TextGenerator_LM(Dataset):
    def __init__(self, reviews,max_len=64):
        self.reviews = reviews
        self.max_len = max_len
        self.length = [len(r)+2 for r in reviews]#+2 for CLS and SEP
        self.segment_ids = np.zeros(max_len,dtype=np.long)
        self.CLS = [101]
        self.SEP = [102]

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, index):
        review = self.reviews[index]
        len_ = self.length[index]
        if len_ < self.max_len:
            review, target = random_word(review)
            padding = [0] * (self.max_len - len_)
            input_mask = [1] * len_
            input_ids = self.CLS + review + self.SEP + padding
            target = [-1] + target + [-1]*(self.max_len - len_ + 1)
            input_mask += padding
        elif len_ == self.max_len:
            review, target = random_word(review)
            input_ids = self.CLS + review + self.SEP
            target = [-1] + target + [-1]
            input_mask = [1] * len_
        else:
            start = np.random.randint(0,len_-self.max_len+1)
            review, target = random_word(review[start:start+self.max_len-2])
            input_ids = self.CLS + review + self.SEP
            target = [-1] + target + [-1]
            input_mask = [1] * self.max_len
        return np.array(input_ids,dtype=np.long),np.array(input_mask,dtype=np.long),\
                self.segment_ids,np.array(target,dtype=np.long)

Dataset

In [5]:
# train_size = 20000
# train_gen = TextGenerator(precessed_reviews[:train_size],targets[:train_size],max_len)
# train_gen = DataLoader(train_gen,batch_size,sampler=RandomSampler(train_gen),num_workers=2)

# val_gen = TextGenerator(precessed_reviews[train_size:],targets[train_size:],max_len)
# val_gen = DataLoader(val_gen,batch_size,sampler=SequentialSampler(train_gen),num_workers=2)

In [147]:
# train with all dataset for better test performance
train_gen = TextGenerator_LM(LM_text,max_len)
train_gen = DataLoader(train_gen,batch_size,sampler=RandomSampler(train_gen),num_workers=2)

In [153]:
#inputs,inputs_mask,segment_ids,ys = next(iter(train_gen))

Model

In [155]:
model = BertForMaskedLM.from_pretrained('bert-base-uncased').to('cuda:0')

Optimizer

In [156]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

In [157]:
num_train_optimization_steps = int(len(train_gen) * num_train_epochs)

In [158]:
opt = BertAdam(optimizer_grouped_parameters,
                     lr=5e-5,
                     warmup=0.1,
                     t_total=num_train_optimization_steps)

Loss

In [159]:
def loss_func(model,data):
    inputs,inputs_mask,segment_ids,ys = data
    loss = model(inputs,segment_ids,inputs_mask,ys)
    return loss

Training

In [160]:
# model = fit(num_train_epochs, model, loss_func, opt, train_gen, val_gen)
model = fit(num_train_epochs, model, loss_func, opt, train_gen, lossBest=None)

epoch:0, train_loss:2.990750481300354
epoch:1, train_loss:2.831921937408447
epoch:2, train_loss:2.7265457973861693
epoch:3, train_loss:2.6425181249427796
Training completed in 4677.281453609467s


In [164]:
torch.save(model, '../Model/BertForMaskedLM.pt')

In [165]:
torch.save(model.bert, '../Model/BertBase.pt')

Fine-Tune

In [169]:
train_gen = TextGenerator(precessed_reviews,targets,max_len)
train_gen = DataLoader(train_gen,batch_size,sampler=RandomSampler(train_gen),num_workers=2)

In [170]:
model_ft = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2).to('cuda:0')

In [171]:
model_ft.bert = model.bert

In [172]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

In [173]:
num_train_optimization_steps = int(len(train_gen) * num_train_epochs)

In [174]:
opt = BertAdam(optimizer_grouped_parameters,
                     lr=5e-5,
                     warmup=0.1,
                     t_total=num_train_optimization_steps)

In [176]:
model_ft = fit(num_train_epochs, model_ft, loss_func, opt, train_gen, lossBest=None)

epoch:0, train_loss:0.4091405396906614
epoch:1, train_loss:0.318999502600365
epoch:2, train_loss:0.26345209118632285
epoch:3, train_loss:0.21854930106555226
Training completed in 1039.646898984909s


In [179]:
def batchedSeq(reviews,max_len,window_len=32):
    # used for TTA
    CLS = [101]
    SEP = [102]
    length = [len(i)+2 for i in reviews]
    input_ids = []
    input_masks = []
    seg_len = []
    for len_,review in zip(length,reviews):
        if len_ < max_len:
            padding = [0] * (max_len - len_)
            input_mask = [1] * len_
            input_id = CLS + review + SEP + padding
            input_mask += padding
            input_ids.append(input_id)
            input_masks.append(input_mask)
            seg_len.append(1)
        elif len_ == max_len:
            input_id = CLS + review + SEP
            input_mask = [1] * len_
            input_ids.append(input_id)
            input_masks.append(input_mask)
            seg_len.append(1)
        else:
            _len_seg = (len_ - max_len)//window_len + 1
            for j in range(_len_seg):
                input_id = CLS + review[j*window_len:j*window_len+(max_len-2)] + SEP
                input_mask = [1] * max_len
                input_ids.append(input_id)
                input_masks.append(input_mask)
            seg_len.append(_len_seg)
            
    return np.array(input_ids,dtype=np.long),np.array(input_masks,dtype=np.long),\
                np.zeros((len(input_ids),max_len),dtype=np.long),seg_len

In [14]:
inputs,inputs_mask,segment_ids,seg_len = batchedSeq(precessed_reviews[train_size:],max_len)
TTA_val_gen = TensorDataset(*[torch.tensor(i) for i in [inputs,segment_ids,inputs_mask]])
TTA_val_gen = DataLoader(TTA_val_gen,batch_size,num_workers=2)

In [183]:
def predict(model,dataloader,to_numpy=True):
    # dataloader return Xs only
    model.eval()
    with torch.no_grad():
        out = torch.cat([model(*data2cuda(data)) for data in dataloader])
        return out.cpu().detach().numpy() if to_numpy else out

In [17]:
yhat = predict(model,TTA_val_gen)

In [18]:
seg_len = np.cumsum(np.array(seg_len))
seg_len = np.insert(seg_len, 0, 0)

In [20]:
yhat_agg = []
for i in range(seg_len.shape[0]-1):
    yhat_agg.append(yhat[seg_len[i]:seg_len[i+1]].mean(0))
yhat_agg = np.array(yhat_agg)

In [21]:
# validation accuracy
np.sum(yhat_agg.argmax(1) == np.array(targets[train_size:]))/yhat_agg.shape[0]

0.9132

In [22]:
yhat_prob = np.exp(yhat)
yhat_prob = yhat_prob/yhat_prob.sum(1,keepdims=True)

In [23]:
yhat_agg2 = []
for i in range(seg_len.shape[0]-1):
    yhat_agg2.append(yhat_prob[seg_len[i]:seg_len[i+1]].mean(0))
yhat_agg2 = np.array(yhat_agg2)

In [24]:
# validation accuracy
np.sum(yhat_agg2.argmax(1) == np.array(targets[train_size:]))/yhat_agg.shape[0]

0.9096

In [25]:
# AUC
roc_auc_score(np.array(targets[train_size:]),yhat_agg2[:,1])

0.9638380238417107

Submission

In [177]:
submission = pd.read_csv('../Data/sampleSubmission.csv')

In [180]:
inputs,inputs_mask,segment_ids,seg_len = batchedSeq(precessed_reviews_test,max_len)
TTA_val_gen = TensorDataset(*[torch.tensor(i) for i in [inputs,segment_ids,inputs_mask]])
TTA_val_gen = DataLoader(TTA_val_gen,batch_size,num_workers=2)

In [184]:
ytest = predict(model_ft,TTA_val_gen)

Exception ignored in: <function _DataLoaderIter.__del__ at 0x7f5e8372fb70>
Traceback (most recent call last):
  File "/home/will/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 717, in __del__
    self._shutdown_workers()
  File "/home/will/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 713, in _shutdown_workers
    w.join()
  File "/home/will/anaconda3/envs/pytorch/lib/python3.7/multiprocessing/process.py", line 138, in join
    assert self._parent_pid == os.getpid(), 'can only join a child process'
AssertionError: can only join a child process
Exception ignored in: <function _DataLoaderIter.__del__ at 0x7f5e8372fb70>
Traceback (most recent call last):
  File "/home/will/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 717, in __del__
    self._shutdown_workers()
  File "/home/will/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/utils/data/dataloader.py",

In [185]:
seg_len = np.cumsum(np.array(seg_len))
seg_len = np.insert(seg_len, 0, 0)

In [186]:
yhat_agg = []
for i in range(seg_len.shape[0]-1):
    yhat_agg.append(ytest[seg_len[i]:seg_len[i+1]].mean(0))
yhat_agg = np.array(yhat_agg)

In [187]:
yhat_prob = np.exp(yhat_agg)
yhat_prob = yhat_prob/yhat_prob.sum(1,keepdims=True)

In [188]:
submission.iloc[:,1] = yhat_prob[:,1]

In [189]:
submission.to_csv('../Submission/BERT_ft.csv',index=False)