https://leemeng.tw/attack_on_bert_transfer_learning_in_nlp.html  

In [1]:
import torch 
from transformers import BertTokenizer, BertModel
from IPython.display import clear_output
import random

from transformers import BertForMaskedLM

import sys
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo  
if not 'bertviz_repo' in sys.path:
    sys.path += ['bertviz_repo']
from bertviz import head_view # view attention
import IPython

import os
import pandas as pd
from torch.utils.data import Dataset
!pip install pysnooper -q
import pysnooper
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertForSequenceClassification

In [2]:
PRETRAINED_MODEL_NAME = 'bert-base-chinese' # bert 預訓練模型
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME) 
clear_output()
print('Pytorch 版本: ', torch.__version__)

Pytorch 版本:  1.10.2+cu102


In [3]:
vocab = tokenizer.vocab
print('字典大小: ', len(vocab))

random_tokens = random.sample(list(vocab), 10)
random_ids = [vocab[t] for t in random_tokens]
print('{0:20}{1:15}'.format('token', 'index'))
print('-'*25)
for t, id in zip(random_tokens, random_ids):
    print('{0:15}{1:10}'.format(t, id))

print('注音符號')
indices = list(range(647,657))
some_pairs = [(t, idx) for t ,idx in vocab.items() if idx in indices] 
for pair in some_pairs:
    print(pair)

字典大小:  21128
token               index          
-------------------------
##廉                 15499
##瞻                 17807
鯨                    7809
姿                    2013
##枸                 16432
逐                    6852
仓                     797
泳                    3807
##ode               10260
##寵                 15244
注音符號
('ㄅ', 647)
('ㄆ', 648)
('ㄇ', 649)
('ㄉ', 650)
('ㄋ', 651)
('ㄌ', 652)
('ㄍ', 653)
('ㄎ', 654)
('ㄏ', 655)
('ㄒ', 656)


<a id = fstkb></a>
## five special tokens in bert
* [CLS]：在做分類任務時其最後一層的 repr. 會被視為整個輸入序列的 repr.  
* [SEP]：有兩個句子的文本會被串接成一個輸入序列，並在兩句之間插入這個 token 以做區隔  
* [UNK]：沒出現在 BERT 字典裡頭的字會被這個 token 取代  
* [PAD]：zero padding 遮罩，將長度不一的輸入序列補齊方便做 batch 運算  
* [MASK]：未知遮罩，僅在預訓練階段會用到  

In [4]:
text = "[CLS]等到潮水[MASK]了，就知道誰沒穿褲子。"
tokens = tokenizer.tokenize(text)
ids = tokenizer.convert_tokens_to_ids(tokens)

print(text)
print(tokens[:10], '...') # words 
print(ids[:10], '...')  # words index

[CLS]等到潮水[MASK]了，就知道誰沒穿褲子。
['[CLS]', '等', '到', '潮', '水', '[MASK]', '了', '，', '就', '知'] ...
[101, 5023, 1168, 4060, 3717, 103, 749, 8024, 2218, 4761] ...


In [5]:
tokens_tensor = torch.tensor([ids]) # 
print('token_tensor', tokens_tensor.shape) # (1, seq_len)
segments_tensors = torch.zeros_like(tokens_tensor)
maskedLM_model = BertForMaskedLM.from_pretrained(PRETRAINED_MODEL_NAME) 
clear_output

maskedLM_model.eval()
with torch.no_grad():
    outputs = maskedLM_model(tokens_tensor, segments_tensors) 
    predictions = outputs[0]
del maskedLM_model

masked_index = 5 # 預測字詞位置
k = 3 # 機率最高前k個
probs, indices = torch.topk(torch.softmax(predictions[0, masked_index], -1), k)
predicted_tokens = tokenizer.convert_ids_to_tokens(indices.tolist()) 
print('probs: ', probs, 
     'indices: ', indices)

print('輸入tokens: ', tokens[:10], '...')
print('-'*50)
for i, (t, p) in enumerate(zip(predicted_tokens, probs) ,1): 
    tokens[masked_index] = t
    print('Top{} ({:2}%) : {}'.format(i, int(p.item()*100), tokens[:10]), '...' ) 
    


token_tensor torch.Size([1, 17])


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


probs:  tensor([0.6731, 0.2515, 0.0284]) indices:  tensor([ 889, 4086, 6882])
輸入tokens:  ['[CLS]', '等', '到', '潮', '水', '[MASK]', '了', '，', '就', '知'] ...
--------------------------------------------------
Top1 (67%) : ['[CLS]', '等', '到', '潮', '水', '來', '了', '，', '就', '知'] ...
Top2 (25%) : ['[CLS]', '等', '到', '潮', '水', '濕', '了', '，', '就', '知'] ...
Top3 ( 2%) : ['[CLS]', '等', '到', '潮', '水', '過', '了', '，', '就', '知'] ...


In [6]:
# bert 內部注意力視覺化
def call_html(): 
    display(IPython.core.display.HTML(
        '''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.8/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''
        )
    )
clear_output()

model_version = 'bert-base-chinese'
model = BertModel.from_pretrained(model_version, output_attentions = True) 
tokenizer = BertTokenizer.from_pretrained(model_version)

sentence_a = '胖虎叫大雄去買漫畫，'
sentence_b = '回來慢了就打他。'

inputs = tokenizer.encode_plus(sentence_a, sentence_b, return_tensors = 'pt', add_special_tokens = True) #return_tensors: pt pytorch, tf tensorflow
token_type_ids = inputs['token_type_ids'] # seg 
input_ids = inputs['input_ids']
attention = model(input_ids, token_type_ids = token_type_ids)[-1] 
input_id_list = input_ids[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(input_id_list)
call_html()

head_view(attention, tokens) # view

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<IPython.core.display.Javascript object>

## bert fine tune
* data = https://www.kaggle.com/c/fake-news-pair-classification-challenge/submissions  
submmit: https://www.kaggle.com/c/fake-news-pair-classification-challenge/submit  
  
* step
1. 準備原始文本數據
2. 將原始文本轉換成 BERT 相容的輸入格式
3. 在 BERT 之上加入新 layer 成下游任務模型
4. 訓練該下游任務模型
5. 對新樣本做推論

### 準備原始文本數據

In [7]:
data_ = pd.read_csv('/home/bettyliao/sentiment/data/fake-news-pair-classification-challenge/train.csv') 

# preprocessing
# remove empty title
empty_title = ((data_['title2_zh'].isnull()) \
              |(data_['title1_zh'].isnull()) \
              |(data_['title2_zh'] == '') \
              |(data_['title2_zh'] == '0')
              )
data_ = data_[~empty_title]
print('remove empty title: ', len(data_))

MAX_LENGTH = 30
data_ = data_[ ~(data_.title1_zh.apply(lambda x: len(x) > MAX_LENGTH))]
data_ = data_[ ~(data_.title2_zh.apply(lambda x: len(x) > MAX_LENGTH))]

SAMPLE_FRAC = 0.01
train = data_.sample(frac = SAMPLE_FRAC, random_state = 9527) 

train = train.reset_index()
train = train.loc[:, ['title1_zh', 'title2_zh', 'label']]
train.columns = ['text_a', 'text_b', 'label']

# save as tsv (bert format)
train.to_csv('/home/bettyliao/sentiment/data/fake-news-pair-classification-challenge/train.tsv', sep = '\t', index = False) 
print('train: ', len(train))
print('train label: \n', train.label.value_counts()/len(train))

test = pd.read_csv('/home/bettyliao/sentiment/data/fake-news-pair-classification-challenge/test.csv') 
test = test.loc[:, ['title1_zh', 'title2_zh', 'id']]
test.columns = ['text_a', 'text_b', 'Id']
test.to_csv('/home/bettyliao/sentiment/data/fake-news-pair-classification-challenge/test.tsv', sep = '\t', index = False) 
print('test: ', len(test))
ratio = round(len(test)/ len(train), 2)
print('test/train = {}'.format(ratio))

remove empty title:  320543
train:  2657
train label: 
 unrelated    0.679338
agreed       0.294317
disagreed    0.026346
Name: label, dtype: float64
test:  80126
test/train = 30.16


### 將原始文本轉換成 BERT 相容的輸入格式
<img src = 'https://leemeng.tw/images/bert/practical_bert_encoding_for_pytorch.jpg'> 
  
* tokens_tensor: 代表識別每個 token 的索引值(use tokenizer)
* segments_tensor:來識別句子界限。第一句為 0，第二句則為 1。另外注意句子間的 *[SEP]* 為 0
* masks_tensor: 用來界定自注意力機制範圍。1 讓 BERT 關注該位置，0 則代表是 padding 不需關注


Dataset 將tsv的句子轉為bert相容格式，回傳三個tensor:
1. tokens_tensor: 句子合併後的所引序列，包含[CLS]、[SEP]
2. segments_tensor: 可以用來辨識句子界限的binary tensor
3. label_tensor: 將分類標籤轉換成類別索引的 tensor, 如果是測試集則回傳 None

In [8]:
class FakeNewsDataset(Dataset):
    os.chdir('/home/bettyliao/sentiment/data/fake-news-pair-classification-challenge/')
    # 初始參數
    def __init__(self, mode, tokenizer):
        assert mode in ['train', 'test'] # dev set
        self.mode = mode
        self.df = pd.read_csv(mode + '.tsv', sep = '\t').fillna('') 
        self.len = len(self.df)
        self.label_map = {'agreed': 0, 'disagreed': 1, 'unrelated': 2} 
        self.tokenizer = tokenizer
    # @pysnooper.snoop() # 列出所有轉換過程
    def __getitem__(self, idx):
        if self.mode == 'test':
            text_a, text_b = self.df.iloc[idx, :2].values
            label_tensor = None # test: no label
        else:
            text_a, text_b, label = self.df.iloc[idx, :].values
            label_id = self.label_map[label]
            label_tensor = torch.tensor(label_id)
        # first sentence
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(text_a)
        word_pieces += tokens_a + ["[SEP]"]
        len_a = len(word_pieces)
        # second sentence
        tokens_b = self.tokenizer.tokenize(text_b)
        word_pieces += tokens_b + ["[SEP]"]
        len_b = len(word_pieces) - len_a
        
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces) 
        tokens_tensor = torch.tensor(ids)
        segments_tensor = torch.tensor([0] * len_a + [1] * len_b, 
                                    dtype = torch.long)
        return (tokens_tensor, segments_tensor, label_tensor)
    def __len__(self):
        return self.len

trainset = FakeNewsDataset('train', tokenizer = tokenizer)        

In [9]:
sample_idx = 0

# 原始文章
text_a, text_b, label = trainset.df.iloc[sample_idx].values 
# tensor
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx] 
# tokens convert to origin
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist()) 
combined_text = ''.join(tokens)

print(f"""
[原始文本]
句子1 :{text_a}
句子2: {text_b}
分類: {label}
------------------------
[Dataset 回傳tensors]
tokens_tensor: {tokens_tensor}
segments_tensor: {segments_tensor}
label_tensor: {label_tensor}
--------------------------
[還原tokens_tensor]
{combined_text}
""")


[原始文本]
句子1 :苏有朋要结婚了，但网友觉得他还是和林心如比较合适
句子2: 好闺蜜结婚给不婚族的秦岚扔花球，倒霉的秦岚掉水里笑哭苏有朋！
分類: unrelated
------------------------
[Dataset 回傳tensors]
tokens_tensor: tensor([ 101, 5722, 3300, 3301, 6206, 5310, 2042,  749, 8024,  852, 5381, 1351,
        6230, 2533,  800, 6820, 3221, 1469, 3360, 2552, 1963, 3683, 6772, 1394,
        6844,  102, 1962, 7318, 6057, 5310, 2042, 5314,  679, 2042, 3184, 4638,
        4912, 2269, 2803, 5709, 4413, 8024,  948, 7450, 4638, 4912, 2269, 2957,
        3717, 7027, 5010, 1526, 5722, 3300, 3301, 8013,  102])
segments_tensor: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1])
label_tensor: 2
--------------------------
[還原tokens_tensor]
[CLS]苏有朋要结婚了，但网友觉得他还是和林心如比较合适[SEP]好闺蜜结婚给不婚族的秦岚扔花球，倒霉的秦岚掉水里笑哭苏有朋！[SEP]



In [10]:
def create_mini_batch(samples):
    # samples contain many tensors
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples]) 
    else: 
        label_ids = None
    tokens_tensors = pad_sequence(tokens_tensors, batch_first = True) # batch tensor  
    segments_tensors = pad_sequence(segments_tensors, batch_first = True) 
    masks_tensors = torch.zeros(tokens_tensors.shape,
                               dtype = torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1) 
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

# 初始化每次回傳64個訓練樣本的dataloader
BATCH_SIZE = 64 # 作者建議是用16、32
trainloader = DataLoader(trainset, batch_size = BATCH_SIZE,
                        collate_fn = create_mini_batch)

In [11]:
data = next(iter(trainloader))

tokens_tensors, segments_tensors, masks_tensors, label_ids = data  

# tokens_tensors, segments_tensors, masks_tensors因長度不同需padding  
print(f"""
tokens_tensors: {tokens_tensors.shape}
{tokens_tensors}
-------------------------------------
segments_tensors: {segments_tensors.shape}
{segments_tensors}
------------------------------------------
masks_tensors: {masks_tensors.shape}
{masks_tensors}
------------------------------------
label_ids.shape = {label_ids.shape}
{label_ids}
""")


tokens_tensors: torch.Size([64, 63])
tensor([[ 101, 5722, 3300,  ...,    0,    0,    0],
        [ 101, 4255, 3160,  ..., 8013,  102,    0],
        [ 101,  711, 2506,  ..., 8013,  102,    0],
        ...,
        [ 101,  671, 2157,  ...,    0,    0,    0],
        [ 101, 1380,  677,  ...,    0,    0,    0],
        [ 101, 2458, 1853,  ...,    0,    0,    0]])
-------------------------------------
segments_tensors: torch.Size([64, 63])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 1, 0],
        [0, 0, 0,  ..., 1, 1, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
------------------------------------------
masks_tensors: torch.Size([64, 63])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
------------------------------------
label_i

## 在 BERT 之上加入新 layer 成下游任務模型

In [12]:
# 中文多分類
PRETRAINED_MODEL_NAME = 'bert-base-chinese'
NUM_LABELS = 3 # 取三分類
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels = NUM_LABELS)  
clear_output()

# high-level 顯示模型裡的modules
# 由上而下的資料流
print("""
name       module
-------------------""")
for name, module in model.named_children():
    if name == 'bert':
        for n, _ in module.named_children():
            print(f'{name} : {n}')
    else:
        print('{:15} {}'.format(name, module))


name       module
-------------------
bert : embeddings
bert : encoder
bert : pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=3, bias=True)


```
class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config, num_labels=2, ...):
        super(BertForSequenceClassification, self).__init__(config)
        self.num_labels = num_labels # number of label
        self.bert = BertModel(config, ...)  # 載入預訓練 BERT
        self.dropout = nn.Dropout(config.hidden_dropout_prob) # set dropout
        # 簡單 linear 層
        self.classifier = nn.Linear(config.hidden_size, num_labels) # add linear
          ...

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, ...):
        # BERT 輸入就是 tokens, segments, masks
        outputs = self.bert(input_ids, token_type_ids, attention_mask, ...)
        ...
        pooled_output = self.dropout(pooled_output)
        # 線性分類器將 dropout 後的 BERT repr. 轉成類別 logits
        logits = self.classifier(pooled_output)

        # 輸入有 labels 的話直接計算 Cross Entropy 回傳，方便！
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        # 有要求回傳注意矩陣的話回傳
        elif self.output_attentions:
            return all_attentions, logits
        # 回傳各類別的 logits
        return logits
model.config # 參數設定，調整bertForSequenceClassification
```

In [13]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions
    
# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("初始 classification acc:", acc)

device: cpu
初始 classification acc: 0.029732781332329695


In [14]:
def get_learnable_params(module):
    return [p for p in module.parameters() if p.requires_grad] 

model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)

print(f'''
整體模型參數： {sum(p.numel() for p in model_params)}
線性模型參數: {sum(p.numel() for p in clf_params)}
''')


整體模型參數： 102269955
線性模型參數: 2307



### 訓練該下游任務模型

In [15]:
%%time
model.train()

optimizer = torch.optim.Adam(model.parameters(), lr = 1e-5) 
EPOCHS = 6  #
for epoch in range(EPOCHS):
    
    running_loss = 0.0
    for data_ in trainloader:
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data_]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids = tokens_tensors, 
                        token_type_ids = segments_tensors, 
                        attention_mask = masks_tensors, 
                        labels = labels)

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()


        # 紀錄當前 batch loss
        running_loss += loss.item()
        
    # 計算分類準確率
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))

[epoch 1] loss: 34.772, acc: 0.807
[epoch 2] loss: 19.561, acc: 0.838
[epoch 3] loss: 14.576, acc: 0.874
[epoch 4] loss: 10.463, acc: 0.928
[epoch 5] loss: 7.743, acc: 0.928
[epoch 6] loss: 6.984, acc: 0.962
CPU times: user 4h 57min 5s, sys: 1min 10s, total: 4h 58min 15s
Wall time: 50min 52s


### 對新樣本做推論

In [16]:
%%time
# 建立測試集。這邊我們可以用跟訓練時不同的 batch_size，看你 GPU 多大
testset = FakeNewsDataset("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=256, 
                        collate_fn=create_mini_batch)

# 用分類模型預測測試集
predictions = get_predictions(model, testloader)

# 用來將預測的 label id 轉回 label 文字
index_map = {v: k for k, v in testset.label_map.items()}

df = pd.DataFrame({"Category": predictions.tolist()})
df['Category'] = df.Category.apply(lambda x: index_map[x])
df_pred = pd.concat([testset.df.loc[:, ["Id"]], 
                          df.loc[:, 'Category']], axis=1)
df_pred.to_csv('/home/bettyliao/sentiment/output/fakenews_pred(1).csv', index = False ) 
df_pred.head()

CPU times: user 7h 31min 20s, sys: 6min 43s, total: 7h 38min 3s
Wall time: 1h 19min 4s


Unnamed: 0,Id,Category
0,321187,unrelated
1,321190,unrelated
2,321189,unrelated
3,321193,unrelated
4,321191,unrelated


In [17]:
predictions = get_predictions(model, trainloader)
df = pd.DataFrame({'predicted': predictions.tolist()})
df['predicted'] = df.predicted.apply(lambda x: index_map[x]) 
df1 = pd.concat([trainset.df, df.loc[:,'predicted']], axis = 1) 
disagreed_tp = ((df1.label == 'disagreed') & \
                (df1.label == df1.predicted) & \
                (df1.text_a.apply(lambda x: True if len(x) < 10 else False)))
df1[disagreed_tp].head()

Unnamed: 0,text_a,text_b,label,predicted
603,海口飞机撒药治白蛾,3月谣言盘点：飞机撒药治白蛾、驾考新规，你中“谣”了吗？,disagreed,disagreed
803,烟王褚时健去世,辟谣：一代烟王褚时健安好！,disagreed,disagreed
952,李宇春跟老外结婚,李宇春被传嫁给78岁老外？春爸被逼亲自辟谣：假的！,disagreed,disagreed
1752,海口飞机撒药治白蛾,紧急辟谣 飞机又来撒药治白蛾了？别再传了，是假的！,disagreed,disagreed
2646,12306数据泄漏,铁路12306 辟谣，称网站未发生用户信息泄漏！,disagreed,disagreed


In [18]:
model_version = 'bert-base-chinese'
finetuned_model = BertModel.from_pretrained(model_version, 
                                           output_attentions = True, state_dict = model.state_dict()) 
sentence_a = "烟王褚时健去世"
sentence_b = "辟谣：一代烟王褚时健安好！"
inputs = tokenizer.encode_plus(sentence_a, sentence_b, return_tensors = 'pt', add_special_tokens = True)
token_type_ids = inputs['token_type_ids']
input_ids = inputs['input_ids']
attention = finetuned_model(input_ids, token_type_ids = token_type_ids)[-1] 
input_id_list = input_ids[0].tolist() # batch index 0
tokens = tokenizer.convert_ids_to_tokens(input_id_list)
call_html()
head_view(attention, tokens)


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<IPython.core.display.Javascript object>