### 参考https://www.kaggle.com/hawkeoni/pytorch-simple-bert

In [1]:
import os
from typing import Tuple, List

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertModel, AdamW, WarmupLinearSchedule, BertPreTrainedModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from tqdm import tqdm

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters


In [2]:
path = "./input/"
device = torch.device('cpu')
if torch.cuda.is_available():
    print('use cuda')
    device = torch.device('cuda:0')

use cuda


In [3]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# tokenizer = BertTokenizer.from_pretrained('./chinese_L-12_H-768_A-12/bert_model.ckpt.index',from_tf=True)
assert tokenizer.pad_token_id == 0, "Padding value used in masks is set to zero, please change it everywhere"
train_df = pd.read_csv(os.path.join(path, 'ai_challenger_sentiment_analysis_trainingset_20180816/sentiment_analysis_trainingset.csv'))
val_df = pd.read_csv(os.path.join(path, 'ai_challenger_sentiment_analysis_validationset_20180816/sentiment_analysis_validationset.csv'))
# training on a part of data for speed
# train_df = train_df.sample(frac=0.33)
# train_df, val_df = train_test_split(train_df, test_size=0.05)

In [4]:
features_size = 20

In [5]:
class ToxicDataset(Dataset):
    
    def __init__(self, tokenizer, dataframe, device):
        self.device = device
        self.tokenizer = tokenizer
        self.pad_idx = tokenizer.pad_token_id
        self.X = []
        self.Y = []
        self.labels = ['location_traffic_convenience', 'location_distance_from_business_district', 'location_easy_to_find', 'service_wait_time', 'service_waiters_attitude', 'service_parking_convenience', 'service_serving_speed', 'price_level', 'price_cost_effective', 'price_discount', 'environment_decoration', 'environment_noise', 'environment_space', 'environment_cleaness', 'dish_portion', 'dish_taste', 'dish_look', 'dish_recommendation', 'others_overall_experience', 'others_willing_to_consume_again']
        max_tokens = 0
        for i, (row) in tqdm(dataframe.iterrows()):
            tokens = tokenizer.tokenize(row["content"])
            sentence = row["content"]
            if len(tokens) > 400:
                tokens = tokens[0:400]
                # max_tokens += 1
                # continue
                sentence = row["content"][0:400]
#             print(sentence)
            text = tokenizer.encode(sentence, add_special_tokens=True)
            text = torch.LongTensor(text)
            # 将label展开成全联合分布,并平均到20类的可信程度为1/20
            tags = torch.FloatTensor(1,4*features_size)
            start = 0
            for c in row[self.labels]:
                m = (c==np.array([-2,-1,0,1]))
                tags[0,start:(start+4)]=torch.from_numpy(m)
                start+=4
            self.X.append(text)
            self.Y.append(tags/features_size)
#         print(f"skip rows: {max_tokens}, {max_tokens/len(self.X)}")
    
    def __len__(self):
        return len(self.X)

    def __getitem__(self, index: int) -> Tuple[torch.LongTensor, torch.LongTensor]:
        return self.X[index], self.Y[index]

def collate_fn(batch: List[Tuple[torch.LongTensor, torch.LongTensor]]) \
        -> Tuple[torch.LongTensor, torch.LongTensor]:
    x, y = list(zip(*batch))
    x = pad_sequence(x, batch_first=True, padding_value=0)
    y = torch.stack(y)
    return x.to(device), y.to(device)

train_dataset = ToxicDataset(tokenizer, train_df, device)
dev_dataset = ToxicDataset(tokenizer, val_df, device)

105000it [12:06, 144.49it/s]


skip rows: 0, 0.0


15000it [05:24, 46.24it/s]


skip rows: 0, 0.0


In [6]:
BATCH_SIZE = 4
train_sampler = RandomSampler(train_dataset)
dev_sampler = RandomSampler(dev_dataset)
train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn)
dev_iterator = DataLoader(dev_dataset, batch_size=BATCH_SIZE, sampler=dev_sampler, collate_fn=collate_fn)

In [7]:
class BertClassifier(BertPreTrainedModel):
    
    def __init__(self, config):
        super(BertClassifier, self).__init__(config)
        self.bert = BertModel(config)
        self.classifier = nn.Linear(config.hidden_size, features_size*4)
#         self.classifiers = []
#         for idx in range(4):
#             # index 0 map to emotion -2, 1 to -1, 2 to 0,3 to 1
#             self.classifiers.append(nn.Linear(config.hidden_size, 20))
        
    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
                
            labels=None):
        outputs = self.bert(input_ids,
                               attention_mask=attention_mask,
                               token_type_ids=token_type_ids,
                               position_ids=position_ids,
                               head_mask=head_mask)
        cls_output = outputs[1] # batch, hidden
#         out = torch.zeros(4,20)#.to(device)
#         tem = torch.sigmoid(self.classifiers[0](cls_output))
#         print(tem.shape)
#         for row in range(4):
#             print(out[row,:].shape,cls_output.shape)
#             temp = self.classifiers[row](cls_output) # batch, 20
#             out[row,:] = torch.sigmoid(temp)
#         print('1',cls_output.shape)
        cls_output = self.classifier(cls_output)
        cls_output = torch.sigmoid(cls_output)
#         print('2',cls_output.shape)
#         print('label',labels.shape)
        criterion = nn.BCELoss()
        loss = 0
        if labels is not None:
            loss = criterion(cls_output.flatten(), labels)
        return loss, cls_output

model = BertClassifier.from_pretrained('bert-base-chinese').to(device)

In [8]:
def train(model, iterator, optimizer, scheduler):
    model.train()
    total_loss = 0
    for x, y in tqdm(iterator):
        optimizer.zero_grad()
        mask = (x != 0).float()
        y = y.flatten()
        loss, outputs = model(x, attention_mask=mask, labels=y)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    print(f"Train loss {total_loss / len(iterator)}")

def evaluate(model, iterator):
    model.eval()
    pred = []
    true = []
    with torch.no_grad():
        total_loss = 0
        for x, y in tqdm(iterator):
            mask = (x != 0).float()
            y = y.flatten()
            loss, outputs = model(x, attention_mask=mask, labels=y)
#             print('output: ',outputs.shape,' y:',y.shape)
            total_loss += loss
            true += y.cpu().numpy().tolist()
            pred += outputs.cpu().numpy().tolist()
    true = np.argmax(np.array(true).reshape((-1,features_size,4)),axis=2)
    pred = np.argmax(np.array(pred).reshape((-1,features_size,4)),axis=2)

    total_f1 = 0
    for i, name in enumerate(['location_traffic_convenience', 'location_distance_from_business_district', 'location_easy_to_find', 'service_wait_time', 'service_waiters_attitude', 'service_parking_convenience', 'service_serving_speed', 'price_level', 'price_cost_effective', 'price_discount', 'environment_decoration', 'environment_noise', 'environment_space', 'environment_cleaness', 'dish_portion', 'dish_taste', 'dish_look', 'dish_recommendation', 'others_overall_experience', 'others_willing_to_consume_again']):
        f1_value = f1_score(true[:, i], pred[:, i],labels=[0,1,2,3],average ='micro')
        total_f1 += f1_value
        print(f"{name} f1 {f1_value}")
    print(f"Evaluate loss {total_loss / len(iterator)}")
    print(f"average f1: {total_f1/features_size}")

In [9]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
EPOCH_NUM = 2
# triangular learning rate, linearly grows untill half of first epoch, then linearly decays 
warmup_steps = int(0.5 * len(train_iterator))
total_steps = len(train_iterator) * EPOCH_NUM - warmup_steps
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-8)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=total_steps)

In [10]:
for i in range(EPOCH_NUM):
    print('=' * 50, f"EPOCH {i}", '=' * 50)
    train(model, train_iterator, optimizer, scheduler)
    evaluate(model, dev_iterator)



100%|█| 26250/26250 [2:27:06<00:00,  2.97it/s]  


Train loss 0.08241832897095454


100%|█| 3750/3750 [05:29<00:00, 11.38it/s]


location_traffic_convenience f1 0.9438
location_distance_from_business_district f1 0.8927333333333334
location_easy_to_find f1 0.9162666666666667
service_wait_time f1 0.8932666666666667
service_waiters_attitude f1 0.8229333333333333
service_parking_convenience f1 0.9654666666666667
service_serving_speed f1 0.9238666666666666
price_level f1 0.7799333333333334
price_cost_effective f1 0.8630666666666666
price_discount f1 0.8288666666666666
environment_decoration f1 0.8518000000000001
environment_noise f1 0.8474666666666666
environment_space f1 0.8168666666666666
environment_cleaness f1 0.8624666666666667
dish_portion f1 0.7648
dish_taste f1 0.7498
dish_look f1 0.779
dish_recommendation f1 0.8825333333333333
others_overall_experience f1 0.8000000000000002
others_willing_to_consume_again f1 0.823
Evaluate loss 0.05534471571445465
average f1: 0.8503966666666667


100%|█| 26250/26250 [2:26:52<00:00,  2.98it/s]  


Train loss 0.05504928433880919


100%|█| 3750/3750 [05:30<00:00, 11.36it/s]


location_traffic_convenience f1 0.9428
location_distance_from_business_district f1 0.8891333333333333
location_easy_to_find f1 0.9188666666666667
service_wait_time f1 0.8971333333333333
service_waiters_attitude f1 0.8276666666666667
service_parking_convenience f1 0.9667333333333333
service_serving_speed f1 0.9292
price_level f1 0.7906
price_cost_effective f1 0.8727999999999999
price_discount f1 0.8342
environment_decoration f1 0.8584
environment_noise f1 0.8635333333333334
environment_space f1 0.8333333333333334
environment_cleaness f1 0.8661333333333333
dish_portion f1 0.7802666666666667
dish_taste f1 0.7655333333333333
dish_look f1 0.7933333333333333
dish_recommendation f1 0.8871333333333333
others_overall_experience f1 0.8092666666666667
others_willing_to_consume_again f1 0.8293333333333334
Evaluate loss 0.055033281445503235
average f1: 0.85777


In [12]:
output_dir='./model'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# state = {'net':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':2}
# torch.save(state,output_dir+'/torch_point')

('./model\\vocab.txt',
 './model\\special_tokens_map.json',
 './model\\added_tokens.json')

In [13]:
test_tokenizer = BertTokenizer.from_pretrained(output_dir)
test_df = pd.read_csv(os.path.join(path, 'ai_challenger_sentiment_analysis_testa_20180816/sentiment_analysis_testa.csv'))
test_dataset = ToxicDataset(test_tokenizer, test_df, device)

test_sampler = RandomSampler(test_dataset)
test_iterator = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=test_sampler, collate_fn=collate_fn)

15000it [00:59, 250.57it/s]


skip rows: 0, 0.0


In [None]:
test_model = BertClassifier.from_pretrained(output_dir).to(device)
evaluate(test_model,test_iterator)

 74%|▋| 2792/3750 [04:05<01:24, 11.34it/s]