REF: https://leemeng.tw/attack_on_bert_transfer_learning_in_nlp.html

In [1]:
import torch
from transformers import BertTokenizer
from IPython.display import clear_output
import pandas as pd
import numpy as np
import csv

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
I1225 10:51:55.285141 10788 file_utils.py:40] PyTorch version 1.3.1 available.


In [7]:
device = torch.device('cuda:0')
torch.cuda.empty_cache()


## Config 

In [3]:
PRETRAINED_MODEL_NAME = "bert-base-chinese"  # 指定繁簡中文 BERT-BASE 預訓練模型

# 取得此預訓練模型所使用的 tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
clear_output()
print("PyTorch 版本：", torch.__version__)

PyTorch 版本： 1.3.1


## Initilize a Dataset Class to convert data fitting BERT requirements

In [4]:
"""
實作一個可以用來讀取訓練 / 測試集的 Dataset，這是你需要徹底了解的部分。
此 Dataset 每次將 tsv 裡的一筆成對句子轉換成 BERT 相容的格式，並回傳 3 個 tensors：
- tokens_tensor：兩個句子合併後的索引序列，包含 [CLS] 與 [SEP]
- segments_tensor：可以用來識別兩個句子界限的 binary tensor
- label_tensor：將分類標籤轉換成類別索引的 tensor, 如果是測試集則回傳 None
"""
from torch.utils.data import Dataset
 
    
class OurDataset(Dataset):
    # 讀取前處理後的 tsv 檔並初始化一些參數
    def __init__(self, data, tokenizer):
        if isinstance(data,str) and data in ['train','test']:
            self.mode = data
            self.df = pd.read_csv(data + ".tsv", sep="\t").fillna("")
            self.len = len(self.df)
            self.label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
            self.tokenizer = tokenizer  # 我們將使用 BERT tokenizer
        else : 
            self.mode='predict'
            self.df = pd.DataFrame(data,columns=['sentence'])
            self.len = len(self.df)
            self.label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
            self.tokenizer = tokenizer  # 我們將使用 BERT tokenizer
            
    def __getitem__(self, idx):

        if self.mode == "test":
            text,label = self.df.iloc[idx, :].values
            label_tensor = None
            if idx-1 < 0 :
                prevText = ''
                nextText, nextLabel = self.df.iloc[idx+1, :].values
            elif idx+1 == len(self.df):
                prevText, prevLabel = self.df.iloc[idx-1, :].values
                nextText = ''
            else:
                prevText, prevLabel = self.df.iloc[idx-1, :].values
                nextText, nextLabel = self.df.iloc[idx+1, :].values
                
        elif self.mode=='predict':
            text= self.df.iloc[idx,0]
            label_tensor = None  
            if idx-1 < 0 :
                prevText= ''
                nextText= self.df.iloc[idx+1,0]
            elif idx+1 == len(self.df):
                prevText= self.df.iloc[idx-1,0]
                nextText= ''
            else:
                prevText= self.df.iloc[idx-1,0]
                nextText= self.df.iloc[idx+1,0]
                
        else:
            text, label = self.df.iloc[idx, :].values
            label_id = self.label_map[label]
            label_tensor = torch.tensor(label_id) 
            if idx-1 < 0 :
                prevText = ''
                nextText, nextLabel = self.df.iloc[idx+1, :].values   
            elif idx+1 == len(self.df):
                prevText, prevLabel = self.df.iloc[idx-1, :].values
                nextText = ''
            else:
                prevText, prevLabel = self.df.iloc[idx-1, :].values
                nextText, nextLabel = self.df.iloc[idx+1, :].values   
            # 將 label 文字也轉換成索引方便轉換成 tensor
         
            
        # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ["[CLS]"]
        tokens = self.tokenizer.tokenize(text)
        word_pieces += tokens+ ["[SEP]"] 
        len_text = len(word_pieces)
        
         # 第二個句子的 BERT tokens
        prevTokens = self.tokenizer.tokenize(prevText)
        nextTokens = self.tokenizer.tokenize(nextText)
        word_pieces += prevTokens + nextTokens+ ["[SEP]"]
        len_b = len(word_pieces) - len_text
        
        # 將整個 token 序列轉換成索引序列
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
       
        # 將第一句包含 [SEP] 的 token 位置設為 0，其他為 1 表示第二句
        segments_tensor = torch.tensor([0] * len_text + [1] * len_b, dtype=torch.long)

        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len
    
# 初始化一個專門讀取訓練樣本的 Dataset，使用中文 BERT 斷詞
trainset = OurDataset("train" , tokenizer=tokenizer)

## Printing Convered Result

In [5]:
# 選擇第一個樣本
sample_idx = 1

# 將原始文本拿出做比較
text, label = trainset.df.iloc[sample_idx].values
# 利用剛剛建立的 Dataset 取出轉換後的 id tensors
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# 將 tokens_tensor 還原成文本
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = "".join(tokens)

print(f"""[original corpus]
句子 ：{text}
分類  ：{label}

--------------------

[tensors]
tokens_tensor  ：{tokens_tensor}

segments_tensor：{segments_tensor}

label_tensor   ：{label_tensor}

--------------------

[revert tokens_tensors]
{combined_text}
""")

[original corpus]
句子 ：可是定焦也太小气了如果能忍受它缓慢的存储速度的话
分類  ：positive

--------------------

[tensors]
tokens_tensor  ：tensor([ 101, 1377, 3221, 2137, 4193,  738, 1922, 2207, 3698,  749, 1963, 3362,
        5543, 2556, 1358, 2124, 5353, 2714, 4638, 2100,  996, 6862, 2428, 4638,
        6413,  102, 4500, 4638, 3221, 7770, 6862, 6825, 2864, 2339,  868, 4638,
        3198,  952, 3301, 1351,  955,  749, 1378, 5314, 2769,  886, 4500,  749,
         676, 1921,  102])

segments_tensor：tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1])

label_tensor   ：2

--------------------

[revert tokens_tensors]
[CLS]可是定焦也太小气了如果能忍受它缓慢的存储速度的话[SEP]用的是高速连拍工作的时候朋友借了台给我使用了三天[SEP]



## Implementations 

In [6]:
"""
實作可以一次回傳一個 mini-batch 的 DataLoader
這個 DataLoader 吃我們上面定義的 `FakeNewsDataset`，
回傳訓練 BERT 時會需要的 4 個 tensors：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# 這個函式的輸入 `samples` 是一個 list，裡頭的每個 element 都是
# 剛剛定義的 `FakeNewsDataset` 回傳的一個樣本，每個樣本都包含 3 tensors：
# - tokens_tensor
# - segments_tensor
# - label_tensor
# 它會對前兩個 tensors 作 zero padding，並產生前面說明過的 masks_tensors
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(
        tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids


# 初始化一個每次回傳 64 個訓練樣本的 DataLoader
# 利用 `collate_fn` 將 list of samples 合併成一個 mini-batch 是關鍵
BATCH_SIZE = 32
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [101]:
## Just Visualizing Batch
data = next(iter(trainloader))

tokens_tensors, segments_tensors, \
    masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([32, 64]) 
tensor([[ 101, 4500, 4638,  ...,    0,    0,    0],
        [ 101, 1377, 3221,  ...,    0,    0,    0],
        [ 101, 2339,  868,  ...,    0,    0,    0],
        ...,
        [ 101, 5375, 4157,  ...,    0,    0,    0],
        [ 101, 3297, 3209,  ...,    0,    0,    0],
        [ 101, 2242, 2391,  ...,    0,    0,    0]])
------------------------
segments_tensors.shape = torch.Size([32, 64])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
------------------------
masks_tensors.shape    = torch.Size([32, 64])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
------------------------
label_ids.shape        

In [102]:
# 載入一個可以做中文多分類任務的模型，n_class = 3
from transformers import BertForSequenceClassification

PRETRAINED_MODEL_NAME = "bert-base-chinese"
NUM_LABELS = 3

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

clear_output()

# high-level 顯示此模型裡的 modules
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=3, bias=True)


In [103]:
model.config

{
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 3,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 21128
}

In [104]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]

            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions
    
# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)
def get_learnable_params(module):
    return [p for p in module.parameters() if p.requires_grad]
     
model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)

print(f"""
整個分類模型的參數量：{sum(p.numel() for p in model_params)}
線性分類器的參數量：{sum(p.numel() for p in clf_params)}
""")

device: cuda:0
classification acc: 0.29894313034725717

整個分類模型的參數量：102269955
線性分類器的參數量：2307



## Training 

In [105]:
%%time

# 訓練模式
model.train()

# 使用 Adam Optim 更新整個分類模型的參數
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


EPOCHS = 8 
for epoch in range(EPOCHS):
    running_loss = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()


        # 紀錄當前 batch loss
        running_loss += loss.item()
        
    # 計算分類準確率
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))

[epoch 1] loss: 162.221, acc: 0.752
[epoch 2] loss: 118.816, acc: 0.799
[epoch 3] loss: 97.798, acc: 0.802
[epoch 4] loss: 87.248, acc: 0.788
[epoch 5] loss: 80.846, acc: 0.831
[epoch 6] loss: 76.334, acc: 0.840
[epoch 7] loss: 71.036, acc: 0.843
[epoch 8] loss: 67.368, acc: 0.849
Wall time: 10min 33s


## Prediction

In [106]:
# 建立測試集。這邊我們可以用跟訓練時不同的 batch_size，看你 GPU 多大
testset = OurDataset("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=256, 
                        collate_fn=create_mini_batch)

# 用分類模型預測測試集
predictions = get_predictions(model, testloader)
# 用來將預測的 label id 轉回 label 文字

index_map = {v: k for k, v in testset.label_map.items()}
print(index_map)

result = testset.df
temp_list = list()
for x in predictions.tolist():
    temp_list.append(index_map[x])
result['predict'] = temp_list

{0: 'negative', 1: 'neutral', 2: 'positive'}


## Evaluate Performance

In [107]:

def single_label_f1(mat):
    def f1(p,r):
        return 2*p*r / (p+r)
    print('='*50)
    print ('Performance for Each Label')
    for i,label in enumerate(['negative','neutral','postive']):
        predict_count = sum(mat.T[i])
        precision = mat[i][i] / predict_count
        ground_truth_count = sum(mat[i])
        recall = mat[i][i] / ground_truth_count
        print('{:10} | precision :{:.2f}, Recall : {:.2f}, F1 Score : {:.2f}'.format(label,precision,recall,f1(precision,recall)))

In [108]:
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
f1 = f1_score(result['sentiment'],result['predict'],average = 'macro')
print ('Test ALL:Macro F1-Score %.4f'%f1)
mat = confusion_matrix(result['sentiment'],result['predict'])
print(mat)
single_label_f1(mat)

Test ALL:Macro F1-Score 0.6995
[[ 40  10   2]
 [ 50 402  26]
 [ 12  14  60]]
Performance for Each Label
negative   | precision :0.39, Recall : 0.77, F1 Score : 0.52
neutral    | precision :0.94, Recall : 0.84, F1 Score : 0.89
postive    | precision :0.68, Recall : 0.70, F1 Score : 0.69


In [109]:
def my_f1(mat):
    def f1(p,r):
        return 2*p*r / (p+r)
    p,r = 0,0
    for i,label in enumerate(['negative','neutral','postive']):
        predict_count = sum(mat.T[i])
        p += mat[i][i] / predict_count
        ground_truth_count = sum(mat[i])
        r += mat[i][i] / ground_truth_count
    p /=3
    r /=3
    print(f1(p,r))

In [110]:
my_f1(mat)

0.717677870224492


# Predict On Test Data

In [111]:
import csv
data = []
with open("Test_sample.tsv" , encoding= 'utf-8') as tsvfile:
    tsvreader = csv.reader(tsvfile, delimiter="\t")
    for i, line in enumerate(tsvreader):
            data.append(line)
data


[['千呼万唤始出来，',
  '尼康的APSC小相机终于发布了，',
  'COOLPIX A. 你怎么看呢？',
  '我看，尼康是挤牙膏挤惯了啊，',
  '1，外观既没有V1时尚，',
  '也没P7100专业，',
  '反而类似P系列。',
  '2，CMOS炒冷饭。',
  '3，OVF没有任何提示和显示。',
  '（除了框框)',
  '4，28MM镜头是不错，',
  '可是F2.8定焦也太小气了。',
  '5，电池坑爹，',
  '用D800和V1的电池很难吗？',
  '6，考虑到1100美元的定价，',
  '富士X100S表示很欢乐。',
  '***好处是，',
  '可以确定，',
  '尼康会继续大力发展1系列了***另外体积比X100S小也算是A的优势吧***。',
  '等2014年年中跌倒1900左右的时候就可以入手了。'],
 ['进xe之前知道有一部m8.2才万元左右，',
  '十分心动。',
  '都拿在手中了才突然醒悟，',
  '色彩fuji未必比leica差，',
  '结实耐用在数码时代已经不那么重要了，',
  '所以拿xe,',
  '还能多买只老头玩。'],
 ['二选一：',
  '富士X-Pro1+18mmF2和徕卡X？',
  '玩过的小机器里面感觉成像最好的还是适马的DP系列，',
  '可是它的机械性能又实在是不好，',
  '对焦叽叽的叫声让你觉得随时要坏。',
  'NEX5C玩了半年就出掉了主要是边缘成像太差色散严重，',
  'NEX5C玩了半年就出掉了主要是边缘成像太差色散严重，',
  '用转接要好很多但又没有了自动对焦。',
  '徕卡X2出来后我是成都第一批购买的。',
  '它的做工确实精湛镜头也很漂亮就是一件工艺品，',
  '它的做工确实精湛镜头也很漂亮就是一件工艺品，',
  '在实际使用中发现暗光近景不错有一些独特的味道，',
  '但是阳光下的远景分辨率说句不好听的就像手机拍的一样而且紫边也不小，',
  '但是阳光下的远景分辨率说句不好听的就像手机拍的一样而且紫边也不小，',
  '当然徕卡是神物用的不好主要还是我的水平差。',
  '出掉x2已经是11月了成都天气一直不太好DP这阳光机也暂时没用。',
  '12月陪朋友去买尼康D800时看到了富士的XE1就叫老

In [112]:
%%time
f = open('result.tsv','w',encoding='utf-8')
for d in data:
    predictset = OurDataset(d, tokenizer=tokenizer)
    predictloader = DataLoader(predictset, batch_size=256, 
                        collate_fn=create_mini_batch)

    predictions = get_predictions(model, predictloader)

    index_map = {v: k for k, v in predictset.label_map.items()}
    
    if True:
        raw = predictset.df.values.flatten()
        f.write('\t'.join(raw))
        f.write('\n')
    
    result = [index_map[x] for x in predictions.tolist()]
    f.write('\t'.join(result))
    f.write('\n')
f.close()
    

    

Wall time: 135 ms
