In [81]:
import torch
print(torch.__version__)
import torch.nn as nn
import torch.nn.functional as F
from pytorch_transformers import BertModel,BertTokenizer


import numpy as np
from sklearn import metrics
import time
from datetime import timedelta
from tqdm import tqdm
import os
import pandas as pd

import random

1.4.0


In [12]:
def get_time_dif(start_time):
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

In [39]:
class Config(object):
    def __init__(self,dataset):
        self.model_name = 'bert'
        self.train_path = os.path.join(dataset,'train.txt')
        self.dev_path = os.path.join(dataset,'dev.txt')
        self.test_path = os.path.join(dataset,'test.txt')
        
        self.class_list = [x.strip() 
                           for x in open(os.path.join(dataset,'class.txt')).readlines()]
        
#         self.save_path = dataset + '/save_dict/' + self.model_name + '.ckpt'
        
        self.device = torch.device('cuda:1' if torch.cuda.is_available()
                                  else 'cpu')
        
        self.require_improvement = 1000;
        self.num_classes = len(self.class_list)
        self.num_epochs = 3
        self.batch_size = 128
        self.pad_size = 32
        self.learning_size = 5e-5
        self.bert_path = './bert_chinese'
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.hidden_size = 768

In [5]:
class Model(nn.Module):
    def __init__(self,config):
        super(Model,self).__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)
        for param in self.bert.parameters():
            param.requires_grad = True
        self.fc = nn.Linear(config.hidden_size,config.num_classes)
        
    def forward(self,x):
        context = x[0]
        mask = x[2]
        _,pooled = self.bert(context,attention_mask=mask,
                            output_all_encoded_layers=False)
        out = self.fc(pooled)
        return out

In [7]:
def init_network(model,method='xavier',exclude='embedding',seed=123):
    for name, w in model.named_parameters():
        if exclude not in name:
            if len(w.size()) < 2:
                continue
            if 'weight' in name:
                if method == 'xavier':
                    nn.init.xavier_normal_(w)
                elif method == 'kaiming':
                    nn.init.kaiming_normal_(w)
                else:
                    nn.init.normal_(w)
            elif 'bias' in name:
                nn.init.constant_(w,0)
            else:
                pass

In [9]:
def evaluate(config,model,data_iter,test=False):
    model.eval()
    loss_total = 0
    pred_all = np.array([],dtype=int)
    labels_all = np.array([],dtype=int)
    
    with torch.no_grad():
        for texts,labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs,labels)
            loss_total += loss
            labels = labels.data.cpu().numpy()
            pred = torch.max(output.data,1)[1].cpu().numpy()
            labels_all = np.append(labels_all,labels)
            pred_all = np.append(pred_all,pred)
            
    acc = metrics.accuracy_score(labels_all,pred_all)
    if test:
        report = metrics.classification_report(labels_all,pred_all,
                                              target_names=config.class_list,
                                              digits=4)
        
        confusion = metrics.confusion_matrix(labels_all,pred_all)
        return acc, loss_total / len(data_iter),report,confusion
    return acc,loss_total/len(data_iter)

In [13]:
def train(config,model,train_iter,dev_iter,test_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias','LayerNorm.bias','LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params':[p for n,p in param_optimizer 
                   if not any(nd in n for nd in no_decay)],
        'weight_decay':0.01},
        {'params':[p for n,p in param_optimizer
                  if any(nd in n for nd in no_decay)],
        'weight_decay':0.0}
    ]
    optimzer = BertAdam(optimizer_grouped_parameters,
                       lr=config.learning_rate,
                       warmup=0.05,
                       t_total=len(train_iter) * config.num_epochs)
    total_batch = 0
    dev_best_loss = float('inf')
    last_improve = 0
    flag = False
    model.train()
    
    for epoch in range(config.num_epochs):
        print('Epoch [{} / {}]'.format(epoch+1,config.num_epochs))
        for i,(trains,labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs,labels)
            loss.backward()
            optimzer.step()
            
            if total_batch%100 ==0:
                true = labels.data.cpu()
                pred = torch.max(outputs.data,1)[1].cpu()
                train_acc = metrics.accuracy_score(true,pred)
                dev_acc,dev_loss = evaluate(config,model,dev_iter)
                
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.start_dict(),config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6},  \
                Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  \
                Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  \
                        Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # 验证集loss超过1000batch没下降，结束训练
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)

In [14]:
def test(config, model, test_iter):
    # test
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = 'Test Loss: {0:>5.2},  Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

In [48]:
PAD,CLS = '[PAD]','[CLS]'
def build_dataset(config):

    def load_dataset(path, pad_size=32):
        contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                content, label = lin.split('\t')
#                 print(content)
                token = config.tokenizer.tokenize(content)
                token = [CLS] + token
                seq_len = len(token)
                mask = []
                token_ids = config.tokenizer.convert_tokens_to_ids(token)

                if pad_size:
                    if len(token) < pad_size:
                        mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
                        token_ids += ([0] * (pad_size - len(token)))
                    else:
                        mask = [1] * pad_size
                        token_ids = token_ids[:pad_size]
                        seq_len = pad_size
                contents.append((token_ids, int(label), seq_len, mask))
        return contents
    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return train, dev, test

In [87]:
# def build_dataset_csv(config):
csv_path = '/work/tcxia/codes/senti_analysis/data/data.csv'
content = []
data = pd.read_csv(csv_path)
content = data['content'].tolist()
label = data['label'].tolist()

contents = []
for cont in content:
    contents.append("".join(eval(cont)))
    
print(len(contents))
print(len(label))
cont_label = zip(contents,label)
content_total = []
pad_size = 32
for cl in cont_label:
    c = cl[0]
    l = cl[1]
#     print(l)
    token = config.tokenizer.tokenize(c)
    token = [CLS] + token
    seq_len = len(token)
    mask = []
    token_ids = config.tokenizer.convert_tokens_to_ids(token)
    if pad_size:
        if len(token) < pad_size:
            mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
            token_ids += ([0] * (pad_size - len(token)))
        else:
            mask = [1] * pad_size
            token_ids = token_ids[:pad_size]
            seq_len = pad_size
        content_total.append((token_ids, int(l), seq_len, mask))
    
# print(content_total[:5])
random.shuffle(content_total)
# print("=" * 20)
# print(content_total[:5])

train = content_total[:3000]
dev = content_total[3000:3500]
test = content_total[3500:]


3920
3920


In [49]:
class DatasetIterater(object):
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False  # 记录batch数量是否为整数
        if len(batches) % self.n_batches != 0:
            self.residue = True
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)

        # pad前的长度(超过pad_size的设为pad_size)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
        return (x, seq_len, mask), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches

In [50]:
def build_iterator(dataset, config):
    iter = DatasetIterater(dataset, config.batch_size, config.device)
    return iter

In [51]:
dataset = '/work/tcxia/codes/bert-sentiment/THUCNews'  # 数据集
model_name = 'bert'  # bert
# x = import_module('')
# config = x.Config(dataset)
config = Config(dataset)
#print(config.train_path)
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True  # 保证每次结果一样

start_time = time.time()
print("Loading data...")
train_data, dev_data, test_data = build_dataset(config)
train_iter = build_iterator(train_data, config)
dev_iter = build_iterator(dev_data, config)
test_iter = build_iterator(test_data, config)
time_dif = get_time_dif(start_time)
print("Time usage:", time_dif)

# train
model = Model(config).to(config.device)
train(config, model, train_iter, dev_iter, test_iter)





0it [00:00, ?it/s][A[A[A[A



219it [00:00, 2188.99it/s][A[A[A[A

Loading data...






518it [00:00, 2379.41it/s][A[A[A[A



817it [00:00, 2533.45it/s][A[A[A[A



1105it [00:00, 2627.19it/s][A[A[A[A



1376it [00:00, 2651.29it/s][A[A[A[A



1641it [00:00, 2650.60it/s][A[A[A[A



1926it [00:00, 2706.31it/s][A[A[A[A



2181it [00:00, 2580.00it/s][A[A[A[A



2501it [00:00, 2738.07it/s][A[A[A[A



2819it [00:01, 2854.70it/s][A[A[A[A



3103it [00:01, 2819.70it/s][A[A[A[A



3384it [00:01, 2796.52it/s][A[A[A[A



3663it [00:01, 2731.25it/s][A[A[A[A



3941it [00:01, 2744.31it/s][A[A[A[A



4216it [00:01, 2608.12it/s][A[A[A[A



4509it [00:01, 2696.03it/s][A[A[A[A



4814it [00:01, 2792.40it/s][A[A[A[A



5107it [00:01, 2830.45it/s][A[A[A[A



5392it [00:01, 2670.48it/s][A[A[A[A



5679it [00:02, 2726.13it/s][A[A[A[A



5966it [00:02, 2765.29it/s][A[A[A[A



6245it [00:02, 2613.12it/s][A[A[A[A



6533it [00:02, 2687.40it/s][A[A[A[A



6821it [00:02, 2740.82it/s][A[A[A[A



7101it [0

KeyboardInterrupt: 