In [1]:
import logging
import random, time, os

import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold

logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')

# set seed
seed = 666
random.seed(seed)
np.random.seed(seed)
torch.cuda.manual_seed(seed)
torch.manual_seed(seed)
root = '/home/admin/jupyter/download/'

# set cuda
gpu = 0
use_cuda = gpu >= 0 and torch.cuda.is_available()
if use_cuda:
    torch.cuda.set_device(gpu)
    device = torch.device("cuda", gpu)
else:
    device = torch.device("cpu")
logging.info("Use cuda: %s, gpu id: %d.", use_cuda, gpu)

2021-01-13 15:42:32,544 INFO: Use cuda: True, gpu id: 0.


In [2]:
class Vocab():
    """docstring for Vocab"""
    def __init__(self, text, train=False):
        super(Vocab, self).__init__()
        self.words = text
        self.itos = ['[PAD]','[UNK]','[CLS]','[SEP]','[MASK]']
        self.unk_idx = 1
        if train:
            self.build_vocab(5)


    def build_vocab(self, min_freq=1):
        from collections import Counter, OrderedDict
        from tqdm import tqdm 
        counter = Counter()
        for label,line in tqdm(self.words,total=len(self.words),desc='text loading'):
            line = line.split(' ')
            counter.update(line)
        word_freq = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        word_freq_dict = OrderedDict(word_freq)
        for word, freq in word_freq_dict.items():
            if freq < min_freq:
                break
            self.itos.append(word)
        print('vocab size is %d'%len(self.itos))
        self.stoi = {v:i for i,v in enumerate(self.itos)}

    def load_vectors(self, file):
        with open(file,'r') as f:
            lines = f.readlines()
        vecmat = np.genformtxt(lines[1:])
        words, vec = vecmat[:,0],vecmat[:,1:]
        vec_size = vec.shape[1]
        vec = np.vstack((np.zeros(2,vec_size),vec))
        idx = [words.index(w) for w in self.itos]
        self.vector = vec[idx]
        
    @classmethod
    def from_prevocab(cls, vocab_file):
        f = open(vocab_file, 'r')
        lines = f.readlines()
        lines = list(map(lambda x: x.strip(), lines))
        cls.stoi = {v:i for i,v in enumerate(lines)}
        return cls

In [3]:
from torch.utils.data import Dataset
class NLPDataSet(Dataset):
    def __init__(self, vocab, data, max_words, max_sent):
        super(NLPDataSet,self).__init__()

        self.max_sent = max_sent
        self.max_words = max_words
        self._data = data
        self._vocab = vocab
        self.content = self.data_deal()

    def sentc_clip(self, doc):
        doc = doc.split(' ')
        index = list(range(0, len(doc), self.max_words-2))
        index.append(len(doc))
        sentce = []
        for i in range(len(index)-1):
            line = doc[index[i]:index[i+1]]
            line = ['<cls>']+line+['<sep>']
            if len(line) < self.max_words:
                line = line+['<pad>']*(self.max_words-len(line))
            sentce.append(line)
        if len(sentce) > self.max_sent:
            segment_i = int(self.max_sent/2)
            segment_sec = int(segment_i/2)
            sentce = sentce[:segment_i]+random.choices(sentce[segment_i:],k=segment_sec)+sentce[-segment_sec:]
        if len(sentce) < self.max_sent:
            sentce.extend([[0]*self.max_words]*(self.max_sent - len(sentce)))
        return sentce
        
    def data_deal(self):
        doc = []
        for label,text in tqdm(self._data, total=len(self._data),desc='segment doc'):
            res = self.sentc_clip(text)
            res = [[self._vocab.stoi.get(s,0) for s in l] for l in res]
            doc.append((label, res))
        return doc
    def __getitem__(self, idx):
        label,text = self.content[idx]
        return torch.tensor(label), torch.LongTensor(text)
    def __len__(self):
        return len(self.content)

In [4]:
def collate_wrapper(batch):
    transposed_data = list(zip(*batch))
    doclen=max([len(s) for s in transposed_data[1]])
    inp = torch.stack(transposed_data[1], 0)
    inp2 = torch.zeros_like(inp)
    mask_id = torch.ones_like(inp)
    tgt = torch.stack(transposed_data[0], 0)
    return tgt,(inp,inp2,mask_id)

In [5]:
# build module
import torch.nn as nn
import torch.nn.functional as F


class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.weight = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.weight.data.normal_(mean=0.0, std=0.05)

        self.bias = nn.Parameter(torch.Tensor(hidden_size))
        b = np.zeros(hidden_size, dtype=np.float32)
        self.bias.data.copy_(torch.from_numpy(b))

        self.query = nn.Parameter(torch.Tensor(hidden_size))
        self.query.data.normal_(mean=0.0, std=0.05)

    def forward(self, batch_hidden, batch_masks):
        # batch_hidden: b x len x hidden_size (2 * hidden_size of lstm)
        # batch_masks:  b x len

        # linear
        key = torch.matmul(batch_hidden, self.weight) + self.bias  # b x len x hidden

        # compute attention
        outputs = torch.matmul(key, self.query)  # b x len

        masked_outputs = outputs.masked_fill((1 - batch_masks).bool(), float(-1e32))

        attn_scores = F.softmax(masked_outputs, dim=1)  # b x len

        # 对于全零向量，-1e32的结果为 1/len, -inf为nan, 额外补0
        masked_attn_scores = attn_scores.masked_fill((1 - batch_masks).bool(), 0.0)

        # sum weighted sources
        batch_outputs = torch.bmm(masked_attn_scores.unsqueeze(1), key).squeeze(1)  # b x hidden

        return batch_outputs, attn_scores


# build word encoder
bert_path = '/home/admin/jupyter/download/emb/'
dropout = 0.15

from transformers import BertModel


class WordBertEncoder(nn.Module):
    def __init__(self):
        super(WordBertEncoder, self).__init__()
        self.dropout = nn.Dropout(dropout)

#         self.tokenizer = WhitespaceTokenizer()
        self.bert = BertModel.from_pretrained(bert_path)

        self.pooled = False
        logging.info('Build Bert encoder with pooled {}.'.format(self.pooled))

    def encode(self, tokens):
        pass
#         tokens = self.tokenizer.tokenize(tokens)
#         return tokens

    def get_bert_parameters(self):
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in self.bert.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in self.bert.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        return optimizer_parameters

    def forward(self, input_ids, token_type_ids):
        # input_ids: sen_num x bert_len
        # token_type_ids: sen_num  x bert_len

        # sen_num x bert_len x 256, sen_num x 256
        sequence_output, pooled_output = self.bert(input_ids=input_ids, token_type_ids=token_type_ids)

        if self.pooled:
            reps = pooled_output
        else:
            reps = sequence_output[:, 0, :]  # sen_num x 256

        if self.training:
            reps = self.dropout(reps)

        return reps


class WhitespaceTokenizer():
    """WhitespaceTokenizer with vocab."""

    def __init__(self):
        vocab_file = bert_path + 'vocab.txt'
        self._token2id = self.load_vocab(vocab_file)
        self._id2token = {v: k for k, v in self._token2id.items()}
        self.max_len = 256
        self.unk = 1

        logging.info("Build Bert vocab with size %d." % (self.vocab_size))

    def load_vocab(self, vocab_file):
        f = open(vocab_file, 'r')
        lines = f.readlines()
        lines = list(map(lambda x: x.strip(), lines))
        vocab = dict(zip(lines, range(len(lines))))
        return vocab

    def tokenize(self, tokens):
        assert len(tokens) <= self.max_len - 2
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        output_tokens = self.token2id(tokens)
        return output_tokens

    def token2id(self, xs):
        if isinstance(xs, list):
            return [self._token2id.get(x, self.unk) for x in xs]
        return self._token2id.get(xs, self.unk)

    @property
    def vocab_size(self):
        return len(self._id2token)


# build sent encoder
sent_hidden_size = 256
sent_num_layers = 2


class SentEncoder(nn.Module):
    def __init__(self, sent_rep_size):
        super(SentEncoder, self).__init__()
        self.dropout = nn.Dropout(dropout)

        self.sent_lstm = nn.LSTM(
            input_size=sent_rep_size,
            hidden_size=sent_hidden_size,
            num_layers=sent_num_layers,
            batch_first=True,
            bidirectional=True
        )

    def forward(self, sent_reps, sent_masks):
        # sent_reps:  b x doc_len x sent_rep_size
        # sent_masks: b x doc_len

        sent_hiddens, _ = self.sent_lstm(sent_reps)  # b x doc_len x hidden*2
        sent_hiddens = sent_hiddens * sent_masks.unsqueeze(2)

        if self.training:
            sent_hiddens = self.dropout(sent_hiddens)

        return sent_hiddens

In [6]:
# build model
class Model(nn.Module):
    def __init__(self, cls_size):
        super(Model, self).__init__()
        self.sent_rep_size = 256
        self.doc_rep_size = sent_hidden_size * 2
        self.all_parameters = {}
        parameters = []
        self.word_encoder = WordBertEncoder()
        bert_parameters = self.word_encoder.get_bert_parameters()

        self.sent_encoder = SentEncoder(self.sent_rep_size)
        self.sent_attention = Attention(self.doc_rep_size)
        parameters.extend(list(filter(lambda p: p.requires_grad, self.sent_encoder.parameters())))
        parameters.extend(list(filter(lambda p: p.requires_grad, self.sent_attention.parameters())))

        self.out = nn.Linear(self.doc_rep_size, cls_size, bias=True)
        parameters.extend(list(filter(lambda p: p.requires_grad, self.out.parameters())))

        if use_cuda:
            self.to(device)

        if len(parameters) > 0:
            self.all_parameters["basic_parameters"] = parameters
        self.all_parameters["bert_parameters"] = bert_parameters

        logging.info('Build model with bert word encoder, lstm sent encoder.')

        para_num = sum([np.prod(list(p.size())) for p in self.parameters()])
        logging.info('Model param num: %.2f M.' % (para_num / 1e6))

    def forward(self, batch_inputs):
        # batch_inputs(batch_inputs1, batch_inputs2): b x doc_len x sent_len
        # batch_masks : b x doc_len x sent_len
        batch_inputs1, batch_inputs2, batch_masks = batch_inputs
        batch_size, max_doc_len, max_sent_len = batch_inputs1.shape[0], batch_inputs1.shape[1], batch_inputs1.shape[2]
        batch_inputs1 = batch_inputs1.view(batch_size * max_doc_len, max_sent_len)  # sen_num x sent_len
        batch_inputs2 = batch_inputs2.view(batch_size * max_doc_len, max_sent_len)  # sen_num x sent_len
        batch_masks = batch_masks.view(batch_size * max_doc_len, max_sent_len)  # sen_num x sent_len

        sent_reps = self.word_encoder(batch_inputs1, batch_inputs2)  # sen_num x sent_rep_size

        sent_reps = sent_reps.view(batch_size, max_doc_len, self.sent_rep_size)  # b x doc_len x sent_rep_size
        batch_masks = batch_masks.view(batch_size, max_doc_len, max_sent_len)  # b x doc_len x max_sent_len
        sent_masks = batch_masks.bool().any(2).float()  # b x doc_len

        sent_hiddens = self.sent_encoder(sent_reps, sent_masks)  # b x doc_len x doc_rep_size
        doc_reps, atten_scores = self.sent_attention(sent_hiddens, sent_masks)  # b x doc_rep_size

        batch_outputs = self.out(doc_reps)  # b x num_labels

        return batch_outputs
    


In [7]:
# build optimizer
learning_rate = 2e-4
bert_lr = 5e-5
decay = .75
decay_step = 1000
from transformers import AdamW, get_linear_schedule_with_warmup


class Optimizer:
    def __init__(self, model_parameters, steps):
        self.all_params = []
        self.optims = []
        self.schedulers = []

        for name, parameters in model_parameters.items():
            if name.startswith("basic"):
                optim = torch.optim.Adam(parameters, lr=learning_rate)
                self.optims.append(optim)

                l = lambda step: decay ** (step // decay_step)
                scheduler = torch.optim.lr_scheduler.LambdaLR(optim, lr_lambda=l)
                self.schedulers.append(scheduler)
                self.all_params.extend(parameters)
            elif name.startswith("bert"):
                optim_bert = AdamW(parameters, bert_lr, eps=1e-8)
                self.optims.append(optim_bert)

                scheduler_bert = get_linear_schedule_with_warmup(optim_bert, 0, steps)
                self.schedulers.append(scheduler_bert)

                for group in parameters:
                    for p in group['params']:
                        self.all_params.append(p)
            else:
                Exception("no nameed parameters.")

        self.num = len(self.optims)

    def step(self):
        for optim, scheduler in zip(self.optims, self.schedulers):
            optim.step()
            scheduler.step()
            optim.zero_grad()

    def zero_grad(self):
        for optim in self.optims:
            optim.zero_grad()

    def get_lr(self):
        lrs = tuple(map(lambda x: x.get_lr()[-1], self.schedulers))
        lr = ' %.5f' * self.num
        res = lr % lrs
        return res

In [8]:
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

def get_score(y_ture, y_pred):
    y_ture = np.array(y_ture)
    y_pred = np.array(y_pred)
    f1 = f1_score(y_ture, y_pred, average='macro') * 100
    p = precision_score(y_ture, y_pred, average='macro') * 100
    r = recall_score(y_ture, y_pred, average='macro') * 100

    return str((reformat(p, 2), reformat(r, 2), reformat(f1, 2))), reformat(f1, 2)


def reformat(num, n):
    return float(format(num, '0.' + str(n) + 'f'))

In [9]:
def train(model, epoch_num, data_loader, criterion, optimizer, device):
    start_time = time.time()
    step = 0
    y_pred = []
    y_true = []
    total_num = len(data_loader)
    losses = 0
    overall_losses = 0
    for epoch in range(1,epoch_num+1):
        for idx, (label, inp) in tqdm(enumerate(data_loader),total=total_num, desc='training'):
            torch.cuda.empty_cache()
            inp = (inp[0].to(device),inp[1].to(device),inp[2].to(device))
            label = label.to(device)
            
            output = model(inp)
            loss = criterion(output,label)
            loss_value = loss.detach().cpu().item()
            loss.backward()
            losses += loss_value
            overall_losses += loss_value
            y_pred.extend(torch.max(output, dim=1)[1].cpu().numpy().tolist())
            y_true.extend(label.cpu().numpy().tolist())
            nn.utils.clip_grad_norm_(optimizer.all_params,5.0)
            for optim, scheduler in zip(optimizer.optims, optimizer.schedulers):
                optim.step()
                scheduler.step()
            optimizer.zero_grad()

            step += 1
            if (idx+1) %500 ==0:
                elapsed = time.time() - start_time
                lrs = optimizer.get_lr()
                print(
                    '| epoch {:3d} | step {:3d} | batch {:3d}/{:3d} | lr{} | loss {:.4f} | s/batch {:.2f}'.format(
                        epoch, step, idx, total_num, lrs,
                        losses / 50,
                        elapsed / 50))
                logging.info(
                    '| epoch {:3d} | step {:3d} | batch {:3d}/{:3d} | lr{} | loss {:.4f} | s/batch {:.2f}'.format(
                        epoch, step, idx, total_num, lrs,
                        losses / 50,
                        elapsed / 50))

                losses = 0

def test(model, data_loader, device):
    total_num = len(data_loader)
    y_pred = []
    y_true = []
    for idx, (label, inp) in tqdm(enumerate(data_loader), total=total_num ):
        torch.cuda.empty_cache()
        inp = (inp[0].to(device),inp[1].to(device),inp[2].to(device))
        with torch.no_grad():
            output = model(inp)
            pred = output.argmax(1).cpu().numpy().tolist()
        y_pred.extend(pred)
        y_true.extend(label.tolist())
    report = classification_report(y_true, y_pred)
    torch.save(model.state_dict(),'./bert.pth')
    print(report)
  

def pred(model, data_loader, device):
    save_test = 'bert.csv'
#     model = Model(14)
#     if use_cuda:
#         model.load_state_dict(torch.load(model_state))
#     else:
#         model.load_state_dict(torch.load(model_state,map_location='cpu'))
    total_num = len(data_loader)
    y_pred = []
    model.eval()
    for label,inp in tqdm(data_loader, total=total_num, desc='predicting'):
        inp = (inp[0].to(device),inp[1].to(device),inp[2].to(device))
        with torch.no_grad():
            output = model(inp)
            pred = output.argmax(1).cpu().numpy().tolist()
        y_pred.extend(pred)
    df = pd.DataFrame({'label': y_pred})
    df.to_csv(save_test, index=False, sep=',')
    

In [10]:
epochs = 1
batch_size = 32

save_model = os.path.join(root,'bert.pth')
model = Model(14)
# model.load_state_dict(torch.load(save_model))
criterion = nn.CrossEntropyLoss()
optimizer = Optimizer(model.all_parameters, steps=batch_size * epochs)


2021-01-13 15:43:00,294 INFO: Build Bert encoder with pooled False.
2021-01-13 15:43:17,119 INFO: Build model with bert word encoder, lstm sent encoder.
2021-01-13 15:43:17,125 INFO: Model param num: 7.72 M.


In [11]:
vocab_file = './emb/vocab.txt'
vocab = Vocab.from_prevocab(vocab_file)

In [12]:

fold_num = 5
data_file = os.path.join(root,'data/train_set.csv')
import pandas as pd
from sklearn.model_selection import StratifiedKFold

In [13]:
def predict():
    test_file = './data/test_a.csv'
    test_df = pd.read_csv(test_file)
    test_df['label'] = 0
    test_df = test_df.loc[:,['label','text']]
    test_set = NLPDataSet(vocab, test_df.values, 256, 8)
    test_loader = DataLoader(test_set, 1, collate_fn=collate_wrapper)
    pred(model, test_loader, device)

In [14]:
def training():
    df = pd.read_csv(data_file, sep='\t',encoding='utf-8')
    kf = StratifiedKFold(fold_num, shuffle=False)
    kf_index = kf.split(df['text'],df['label'])
    for train_idx, val_idx in kf_index:
        train_data = df.iloc[train_idx].values
        train_set = NLPDataSet(vocab, train_data, 256, 8)
        train_loader = DataLoader(train_set, batch_size, collate_fn=collate_wrapper)
        train(model, epochs, train_loader, criterion, optimizer,device)

        val_data = df.iloc[val_idx].values
        val_set = NLPDataSet(vocab, val_data, 256, 8)
        val_loader = DataLoader(val_set, batch_size, collate_fn=collate_wrapper)
        test(model, val_loader, device)

In [None]:
training()
# predict()

segment doc: 100%|██████████| 160000/160000 [01:35<00:00, 1667.42it/s]
training:  10%|▉         | 499/5000 [06:00<54:09,  1.39it/s]2021-01-13 15:51:02,300 INFO: | epoch   1 | step 500 | batch 499/5000 | lr 0.00020 0.00000 | loss 11.1667 | s/batch 7.23
training:  10%|█         | 500/5000 [06:01<54:09,  1.38it/s]

| epoch   1 | step 500 | batch 499/5000 | lr 0.00020 0.00000 | loss 11.1667 | s/batch 7.23


training:  20%|█▉        | 999/5000 [12:02<48:11,  1.38it/s]2021-01-13 15:57:03,855 INFO: | epoch   1 | step 1000 | batch 999/5000 | lr 0.00015 0.00000 | loss 7.2187 | s/batch 14.46
training:  20%|██        | 1000/5000 [12:03<48:13,  1.38it/s]

| epoch   1 | step 1000 | batch 999/5000 | lr 0.00015 0.00000 | loss 7.2187 | s/batch 14.46


training:  30%|██▉       | 1499/5000 [18:06<42:22,  1.38it/s]2021-01-13 16:03:08,024 INFO: | epoch   1 | step 1500 | batch 1499/5000 | lr 0.00015 0.00000 | loss 6.4490 | s/batch 21.75
training:  30%|███       | 1500/5000 [18:07<42:19,  1.38it/s]

| epoch   1 | step 1500 | batch 1499/5000 | lr 0.00015 0.00000 | loss 6.4490 | s/batch 21.75


training:  40%|███▉      | 1999/5000 [24:07<36:12,  1.38it/s]2021-01-13 16:09:09,327 INFO: | epoch   1 | step 2000 | batch 1999/5000 | lr 0.00011 0.00000 | loss 6.3312 | s/batch 28.97
training:  40%|████      | 2000/5000 [24:08<36:13,  1.38it/s]

| epoch   1 | step 2000 | batch 1999/5000 | lr 0.00011 0.00000 | loss 6.3312 | s/batch 28.97


training:  50%|████▉     | 2499/5000 [30:09<30:25,  1.37it/s]2021-01-13 16:15:10,776 INFO: | epoch   1 | step 2500 | batch 2499/5000 | lr 0.00011 0.00000 | loss 5.9428 | s/batch 36.20
training:  50%|█████     | 2500/5000 [30:10<30:27,  1.37it/s]

| epoch   1 | step 2500 | batch 2499/5000 | lr 0.00011 0.00000 | loss 5.9428 | s/batch 36.20


training:  60%|█████▉    | 2999/5000 [36:10<24:05,  1.38it/s]2021-01-13 16:21:12,411 INFO: | epoch   1 | step 3000 | batch 2999/5000 | lr 0.00008 0.00000 | loss 5.9615 | s/batch 43.43
training:  60%|██████    | 3000/5000 [36:11<24:04,  1.38it/s]

| epoch   1 | step 3000 | batch 2999/5000 | lr 0.00008 0.00000 | loss 5.9615 | s/batch 43.43


training:  70%|██████▉   | 3499/5000 [42:12<18:04,  1.38it/s]2021-01-13 16:27:13,668 INFO: | epoch   1 | step 3500 | batch 3499/5000 | lr 0.00008 0.00000 | loss 5.7850 | s/batch 50.66
training:  70%|███████   | 3500/5000 [42:12<18:04,  1.38it/s]

| epoch   1 | step 3500 | batch 3499/5000 | lr 0.00008 0.00000 | loss 5.7850 | s/batch 50.66


training:  80%|███████▉  | 3999/5000 [48:13<12:04,  1.38it/s]2021-01-13 16:33:14,869 INFO: | epoch   1 | step 4000 | batch 3999/5000 | lr 0.00006 0.00000 | loss 5.6777 | s/batch 57.88
training:  80%|████████  | 4000/5000 [48:14<12:03,  1.38it/s]

| epoch   1 | step 4000 | batch 3999/5000 | lr 0.00006 0.00000 | loss 5.6777 | s/batch 57.88


training:  90%|████████▉ | 4499/5000 [54:14<06:00,  1.39it/s]2021-01-13 16:39:15,976 INFO: | epoch   1 | step 4500 | batch 4499/5000 | lr 0.00006 0.00000 | loss 5.4930 | s/batch 65.10
training:  90%|█████████ | 4500/5000 [54:15<06:00,  1.39it/s]

| epoch   1 | step 4500 | batch 4499/5000 | lr 0.00006 0.00000 | loss 5.4930 | s/batch 65.10


training: 100%|█████████▉| 4999/5000 [1:00:15<00:00,  1.39it/s]2021-01-13 16:45:17,198 INFO: | epoch   1 | step 5000 | batch 4999/5000 | lr 0.00005 0.00000 | loss 5.4191 | s/batch 72.33
training: 100%|██████████| 5000/5000 [1:00:16<00:00,  1.38it/s]
segment doc:   0%|          | 187/40000 [00:00<00:21, 1868.52it/s]

| epoch   1 | step 5000 | batch 4999/5000 | lr 0.00005 0.00000 | loss 5.4191 | s/batch 72.33


segment doc: 100%|██████████| 40000/40000 [00:22<00:00, 1809.19it/s]
100%|██████████| 1250/1250 [04:20<00:00,  4.80it/s]
segment doc:   0%|          | 189/160000 [00:00<01:24, 1887.69it/s]

              precision    recall  f1-score   support

           0       0.88      0.81      0.84      7784
           1       0.78      0.91      0.84      7389
           2       0.95      0.93      0.94      6285
           3       0.79      0.89      0.84      4426
           4       0.82      0.72      0.77      3003
           5       0.80      0.81      0.80      2446
           6       0.90      0.80      0.84      1997
           7       0.83      0.61      0.70      1768
           8       0.55      0.72      0.63      1570
           9       0.84      0.70      0.76      1176
          10       0.79      0.83      0.81       984
          11       0.78      0.61      0.69       627
          12       0.92      0.69      0.79       364
          13       0.59      0.42      0.49       181

    accuracy                           0.83     40000
   macro avg       0.80      0.75      0.77     40000
weighted avg       0.83      0.83      0.83     40000



segment doc: 100%|██████████| 160000/160000 [01:40<00:00, 1587.50it/s]
training:  10%|▉         | 499/5000 [06:00<54:07,  1.39it/s]2021-01-13 16:57:43,053 INFO: | epoch   1 | step 500 | batch 499/5000 | lr 0.00005 0.00000 | loss 5.4862 | s/batch 7.22
training:  10%|█         | 500/5000 [06:01<54:08,  1.39it/s]

| epoch   1 | step 500 | batch 499/5000 | lr 0.00005 0.00000 | loss 5.4862 | s/batch 7.22


training:  20%|█▉        | 999/5000 [12:01<48:10,  1.38it/s]2021-01-13 17:03:44,424 INFO: | epoch   1 | step 1000 | batch 999/5000 | lr 0.00004 0.00000 | loss 5.3011 | s/batch 14.45
training:  20%|██        | 1000/5000 [12:02<48:12,  1.38it/s]

| epoch   1 | step 1000 | batch 999/5000 | lr 0.00004 0.00000 | loss 5.3011 | s/batch 14.45


training:  30%|██▉       | 1499/5000 [18:03<42:11,  1.38it/s]2021-01-13 17:09:45,810 INFO: | epoch   1 | step 1500 | batch 1499/5000 | lr 0.00004 0.00000 | loss 5.3251 | s/batch 21.68
training:  30%|███       | 1500/5000 [18:03<42:10,  1.38it/s]

| epoch   1 | step 1500 | batch 1499/5000 | lr 0.00004 0.00000 | loss 5.3251 | s/batch 21.68


training:  40%|███▉      | 1999/5000 [24:04<36:09,  1.38it/s]2021-01-13 17:15:47,179 INFO: | epoch   1 | step 2000 | batch 1999/5000 | lr 0.00003 0.00000 | loss 5.2256 | s/batch 28.91
training:  40%|████      | 2000/5000 [24:05<36:10,  1.38it/s]

| epoch   1 | step 2000 | batch 1999/5000 | lr 0.00003 0.00000 | loss 5.2256 | s/batch 28.91


training:  50%|████▉     | 2499/5000 [30:06<30:08,  1.38it/s]2021-01-13 17:21:48,770 INFO: | epoch   1 | step 2500 | batch 2499/5000 | lr 0.00003 0.00000 | loss 5.0434 | s/batch 36.14
training:  50%|█████     | 2500/5000 [30:06<30:08,  1.38it/s]

| epoch   1 | step 2500 | batch 2499/5000 | lr 0.00003 0.00000 | loss 5.0434 | s/batch 36.14


training:  60%|█████▉    | 2999/5000 [36:07<24:03,  1.39it/s]2021-01-13 17:27:50,099 INFO: | epoch   1 | step 3000 | batch 2999/5000 | lr 0.00002 0.00000 | loss 5.1398 | s/batch 43.36
training:  60%|██████    | 3000/5000 [36:08<24:04,  1.38it/s]

| epoch   1 | step 3000 | batch 2999/5000 | lr 0.00002 0.00000 | loss 5.1398 | s/batch 43.36


training:  70%|██████▉   | 3499/5000 [42:08<18:03,  1.38it/s]2021-01-13 17:33:51,287 INFO: | epoch   1 | step 3500 | batch 3499/5000 | lr 0.00002 0.00000 | loss 5.1040 | s/batch 50.59
training:  70%|███████   | 3500/5000 [42:09<18:03,  1.38it/s]

| epoch   1 | step 3500 | batch 3499/5000 | lr 0.00002 0.00000 | loss 5.1040 | s/batch 50.59


training:  80%|███████▉  | 3999/5000 [48:10<12:03,  1.38it/s]2021-01-13 17:39:53,008 INFO: | epoch   1 | step 4000 | batch 3999/5000 | lr 0.00002 0.00000 | loss 5.1010 | s/batch 57.82
training:  80%|████████  | 4000/5000 [48:11<12:02,  1.38it/s]

| epoch   1 | step 4000 | batch 3999/5000 | lr 0.00002 0.00000 | loss 5.1010 | s/batch 57.82


training:  90%|████████▉ | 4499/5000 [54:12<06:02,  1.38it/s]2021-01-13 17:45:54,813 INFO: | epoch   1 | step 4500 | batch 4499/5000 | lr 0.00002 0.00000 | loss 5.0051 | s/batch 65.06
training:  90%|█████████ | 4500/5000 [54:12<06:02,  1.38it/s]

| epoch   1 | step 4500 | batch 4499/5000 | lr 0.00002 0.00000 | loss 5.0051 | s/batch 65.06


training: 100%|█████████▉| 4999/5000 [1:00:14<00:00,  1.38it/s]2021-01-13 17:51:56,714 INFO: | epoch   1 | step 5000 | batch 4999/5000 | lr 0.00001 0.00000 | loss 4.9691 | s/batch 72.30
training: 100%|██████████| 5000/5000 [1:00:14<00:00,  1.38it/s]
segment doc:   0%|          | 188/40000 [00:00<00:21, 1872.27it/s]

| epoch   1 | step 5000 | batch 4999/5000 | lr 0.00001 0.00000 | loss 4.9691 | s/batch 72.30


segment doc: 100%|██████████| 40000/40000 [00:22<00:00, 1802.93it/s]
100%|██████████| 1250/1250 [04:21<00:00,  4.78it/s]
segment doc:   0%|          | 186/160000 [00:00<01:26, 1854.34it/s]

              precision    recall  f1-score   support

           0       0.87      0.83      0.85      7783
           1       0.81      0.90      0.85      7389
           2       0.95      0.94      0.95      6285
           3       0.86      0.87      0.86      4427
           4       0.79      0.76      0.78      3003
           5       0.79      0.83      0.81      2446
           6       0.88      0.83      0.86      1997
           7       0.83      0.66      0.73      1769
           8       0.62      0.73      0.67      1570
           9       0.83      0.73      0.77      1176
          10       0.81      0.82      0.81       984
          11       0.78      0.69      0.73       626
          12       0.88      0.72      0.79       364
          13       0.73      0.52      0.61       181

    accuracy                           0.84     40000
   macro avg       0.82      0.77      0.79     40000
weighted avg       0.84      0.84      0.84     40000



segment doc: 100%|██████████| 160000/160000 [01:37<00:00, 1644.79it/s]
training:  10%|▉         | 499/5000 [06:00<54:00,  1.39it/s]2021-01-13 18:04:20,173 INFO: | epoch   1 | step 500 | batch 499/5000 | lr 0.00001 0.00000 | loss 5.1452 | s/batch 7.22
training:  10%|█         | 500/5000 [06:01<54:05,  1.39it/s]

| epoch   1 | step 500 | batch 499/5000 | lr 0.00001 0.00000 | loss 5.1452 | s/batch 7.22


training:  20%|█▉        | 999/5000 [12:01<48:04,  1.39it/s]2021-01-13 18:10:21,219 INFO: | epoch   1 | step 1000 | batch 999/5000 | lr 0.00001 0.00000 | loss 4.9712 | s/batch 14.44
training:  20%|██        | 1000/5000 [12:02<48:06,  1.39it/s]

| epoch   1 | step 1000 | batch 999/5000 | lr 0.00001 0.00000 | loss 4.9712 | s/batch 14.44


training:  30%|██▉       | 1499/5000 [18:02<42:08,  1.38it/s]2021-01-13 18:16:22,312 INFO: | epoch   1 | step 1500 | batch 1499/5000 | lr 0.00001 0.00000 | loss 4.9959 | s/batch 21.66
training:  30%|███       | 1500/5000 [18:03<42:09,  1.38it/s]

| epoch   1 | step 1500 | batch 1499/5000 | lr 0.00001 0.00000 | loss 4.9959 | s/batch 21.66


training:  40%|███▉      | 1999/5000 [24:03<36:07,  1.38it/s]2021-01-13 18:22:23,720 INFO: | epoch   1 | step 2000 | batch 1999/5000 | lr 0.00001 0.00000 | loss 5.0172 | s/batch 28.89
training:  40%|████      | 2000/5000 [24:04<36:08,  1.38it/s]

| epoch   1 | step 2000 | batch 1999/5000 | lr 0.00001 0.00000 | loss 5.0172 | s/batch 28.89


training:  50%|████▉     | 2499/5000 [30:05<30:07,  1.38it/s]2021-01-13 18:28:24,875 INFO: | epoch   1 | step 2500 | batch 2499/5000 | lr 0.00001 0.00000 | loss 4.9621 | s/batch 36.11
training:  50%|█████     | 2500/5000 [30:05<30:07,  1.38it/s]

| epoch   1 | step 2500 | batch 2499/5000 | lr 0.00001 0.00000 | loss 4.9621 | s/batch 36.11


training:  50%|█████     | 2518/5000 [30:18<29:53,  1.38it/s]