# 引入 library

In [None]:
%%bash
# Install the notebook's Python dependencies; `-q` keeps pip's output quiet.
# NOTE(review): versions are not pinned, so a fresh run may pull newer releases.
pip install transformers tqdm boto3 requests regex -q

In [None]:
import os
import pandas as pd
import pathlib
import json
import torch
from transformers import BertTokenizer
from IPython.display import clear_output

pathlib.Path().absolute()

PosixPath('/content')

# 把 train.json 轉成 dataframe

In [None]:
import string
def preprocess(s):
  """Normalise a raw text into lower-case, purely alphabetic words.

  The input is split on single spaces. Each piece is lower-cased and stripped
  of the characters ``# . , ! ?`` plus newline / carriage return. Pieces that
  are not entirely alphabetic afterwards (including empty pieces) are dropped.

  Args:
    s: raw text.

  Returns:
    The surviving words joined by single spaces.
  """
  # One deletion table replaces the chain of single-character removals.
  removed_chars = str.maketrans('', '', '#.,!?\n\r')
  cleaned = (piece.lower().translate(removed_chars) for piece in s.split(' '))
  return ' '.join(word for word in cleaned if word.isalpha())

In [None]:
# Load the raw training examples. The context manager guarantees the file
# handle is closed (the previous bare `open(...).read()` never closed it).
with open('train.json', 'rb') as train_file:
  df_train = json.load(train_file)

# Clean both texts in place, then append the example's categories (with
# underscores turned into spaces) to the reply so they become model input.
for example in df_train:
  example['text'] = preprocess(example['text'])
  example['reply'] = preprocess(example['reply'])
  example['reply'] += ' ' + ' '.join(
      [category.replace('_', ' ') for category in example['categories']])

In [None]:
# Fraction of the training examples to keep. 1 keeps every row (the sample
# call then only shuffles them); lower it, e.g. 0.01, to see how BERT copes
# with very little labelled data.
SAMPLE_FRAC = 1

# Shuffle / subsample with a fixed seed, keep only the columns the model
# needs, and rename the two text columns to text_a / text_b.
df_train = (
    pd.DataFrame(df_train)
    .sample(frac=SAMPLE_FRAC, random_state=9527)
    .reset_index()
    .loc[:, ['text', 'reply', 'label']]
    .rename(columns={'text': 'text_a', 'reply': 'text_b'})
)

# Persist the processed frame as a tsv so the PyTorch Dataset can read it.
df_train.to_csv("train.tsv", sep="\t", index=False)

print("訓練樣本數：", len(df_train))
df_train.head()

訓練樣本數： 168521


Unnamed: 0,text_a,text_b,label
0,ns was a private underage kid defamed for noth...,if fakenews media gets away with branding a ki...,fake
1,the results speak for themselves the non stop ...,president trump has done a superb job in resto...,fake
2,if ur reading this its too late i already sent...,thank you,real
3,white house news conference today at pm easter...,no do not want,real
4,dad get a tesco delivery slot and was in the g...,love that good on your dad thumbs down,real


# 確認 training baseline

In [None]:
# Share of each label in the training set.
label_counts = df_train["label"].value_counts()
label_counts / len(df_train)

fake    0.811305
real    0.188695
Name: label, dtype: float64

# 弄成 BERT 的輸入格式

In [None]:
"""
實作一個可以用來讀取訓練 / 測試集的 Dataset，這是你需要徹底了解的部分。
此 Dataset 每次將 tsv 裡的一筆成對句子轉換成 BERT 相容的格式，並回傳 3 個 tensors：
- tokens_tensor：兩個句子合併後的索引序列，包含 [CLS] 與 [SEP]
- segments_tensor：可以用來識別兩個句子界限的 binary tensor
- label_tensor：將分類標籤轉換成類別索引的 tensor, 如果是測試集則回傳 None
"""
from torch.utils.data import Dataset
 
    
class FakeNewsDataset(Dataset):
    """Dataset that converts one row of ``<mode>.tsv`` into BERT pair inputs.

    Each item is a tuple ``(tokens_tensor, segments_tensor, label_tensor)``:

    - ``tokens_tensor``: ids of ``[CLS] text_a [SEP] text_b [SEP]``
    - ``segments_tensor``: 0 for ``[CLS] text_a [SEP]``, 1 for ``text_b [SEP]``
    - ``label_tensor``: class index from ``label_map``, or ``None`` in test mode

    Args:
        mode: ``"train"`` or ``"test"``; selects the tsv file to read.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``
            (a BERT tokenizer in this notebook).
        max_len: maximum total sequence length including the three special
            tokens. Longer pairs are truncated, trimming the longer sentence
            first. ``None`` disables truncation. The default of 512 is the
            position limit of standard BERT checkpoints; without a cap an
            over-long pair would fail inside the model.
    """

    def __init__(self, mode, tokenizer, max_len=512):
        assert mode in ["train", "test"]  # a real project would also want a dev split
        assert max_len is None or max_len >= 3, "max_len must leave room for [CLS]/[SEP]/[SEP]"
        self.mode = mode
        # keep_default_na=False stops pandas from turning texts such as
        # "null" or "nan" into missing values (which fillna would then blank).
        self.df = pd.read_csv(mode + ".tsv", sep="\t",
                              keep_default_na=False).fillna("")
        self.len = len(self.df)
        self.label_map = {'real': 0, 'fake': 1}
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _truncate_pair(tokens_a, tokens_b, budget):
        """Return copies of both token lists trimmed so their total <= budget."""
        tokens_a, tokens_b = list(tokens_a), list(tokens_b)
        while len(tokens_a) + len(tokens_b) > budget:
            # Always drop from the longer side so neither sentence vanishes first.
            longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
            longer.pop()
        return tokens_a, tokens_b

    def __getitem__(self, idx):
        """Return ``(tokens_tensor, segments_tensor, label_tensor)`` for row ``idx``."""
        if self.mode == "test":
            text_a, text_b = self.df.iloc[idx, :2].values
            label_tensor = None
        else:
            # Only the first three columns are read, so extra columns are harmless.
            text_a, text_b, label = self.df.iloc[idx, :3].values
            label_tensor = torch.tensor(self.label_map[label])

        tokens_a = self.tokenizer.tokenize(text_a)
        tokens_b = self.tokenizer.tokenize(text_b)
        if self.max_len is not None:
            # Reserve three slots for [CLS] and the two [SEP] tokens.
            tokens_a, tokens_b = self._truncate_pair(
                tokens_a, tokens_b, self.max_len - 3)

        # First segment: [CLS] + sentence A + [SEP]
        word_pieces = ["[CLS]"] + tokens_a + ["[SEP]"]
        len_a = len(word_pieces)

        # Second segment: sentence B + [SEP]
        word_pieces += tokens_b + ["[SEP]"]
        len_b = len(word_pieces) - len_a

        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        # 0 marks the first segment (including its [SEP]), 1 marks the second.
        segments_tensor = torch.tensor([0] * len_a + [1] * len_b,
                                       dtype=torch.long)

        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        """Number of rows in the underlying tsv."""
        return self.len
    
    
# Build the Dataset that reads the training samples, tokenising with the
# `bert-base-uncased` tokenizer (the earlier comment said "Chinese BERT",
# but the checkpoint loaded here is `bert-base-uncased`).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
trainset = FakeNewsDataset("train", tokenizer=tokenizer)

In [None]:
# Pick the first sample.
sample_idx = 0

# Raw row, for comparison with the encoded version.
text_a, text_b, label = trainset.df.iloc[sample_idx].values

# Encoded tensors produced by the Dataset for the same row.
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# Map the ids back to tokens. Join with spaces: the tokens here are English
# word pieces, and joining with an empty string (fine for Chinese characters)
# would glue the whole text into one unreadable string.
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = " ".join(tokens)

# Show the raw row next to what the Dataset returned.
print(f"""[原始文本]
句子 1：{text_a}
句子 2：{text_b}
分類  ：{label}

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：{tokens_tensor}

segments_tensor：{segments_tensor}

label_tensor   ：{label_tensor}

--------------------

[還原 tokens_tensors]
{combined_text}
""")

[原始文本]
句子 1：ns was a private underage kid defamed for nothing news choses what to report and what to state as fact this was not newsworthy and media had reckless disregard for facts there was no excuse it go unpunished is garbage
句子 2：if fakenews media gets away with branding a kid as the perpetrator of a hate crime with no evidence then no one is safe in public anymore ns needs a better legal team judge needs to be fired and media needs to be punished or we all will be freedom of press has limits 
分類  ：fake

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：tensor([  101, 24978,  2001,  1037,  2797,  2104,  4270,  4845, 13366, 14074,
         2094,  2005,  2498,  2739,  4900,  2015,  2054,  2000,  3189,  1998,
         2054,  2000,  2110,  2004,  2755,  2023,  2001,  2025,  2739, 13966,
         1998,  2865,  2018, 18555, 27770,  2005,  8866,  2045,  2001,  2053,
         8016,  2009,  2175,  4895, 14289, 28357,  2003, 13044,   102,  2065,
         8275,  2638,  9333,  2865,

# 實作 mini-batch dataloader

In [None]:
"""
實作可以一次回傳一個 mini-batch 的 DataLoader
這個 DataLoader 吃我們上面定義的 `FakeNewsDataset`，
回傳訓練 BERT 時會需要的 4 個 tensors：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# 這個函式的輸入 `samples` 是一個 list，裡頭的每個 element 都是
# 剛剛定義的 `FakeNewsDataset` 回傳的一個樣本，每個樣本都包含 3 tensors：
# - tokens_tensor
# - segments_tensor
# - label_tensor
# 它會對前兩個 tensors 作 zero padding，並產生前面說明過的 masks_tensors
def create_mini_batch(samples):
    """Collate a list of Dataset samples into one zero-padded mini-batch.

    Args:
        samples: list of ``(tokens_tensor, segments_tensor, label_tensor)``
            tuples; ``label_tensor`` is ``None`` for unlabelled samples.

    Returns:
        ``(tokens_tensors, segments_tensors, masks_tensors, label_ids)``.
        The first three have shape ``(batch_size, max_seq_len_in_batch)``.
        ``label_ids`` has shape ``(batch_size,)``, or is ``None`` when the
        first sample carries no label.
    """
    # Only labelled samples carry a label tensor; the first sample decides
    # for the whole batch.
    if samples[0][2] is None:
        label_ids = None
    else:
        label_ids = torch.stack([sample[2] for sample in samples])

    # Right-pad every sequence with zeros up to the longest one in the batch.
    tokens_tensors = pad_sequence([sample[0] for sample in samples],
                                  batch_first=True)
    segments_tensors = pad_sequence([sample[1] for sample in samples],
                                    batch_first=True)

    # Attention mask: 1 wherever the token id is non-zero, 0 on the padding.
    masks_tensors = (tokens_tensors != 0).to(torch.long)

    return tokens_tensors, segments_tensors, masks_tensors, label_ids


# DataLoader that yields BATCH_SIZE (here 1) training sample(s) per step.
# `collate_fn` is the key piece: it merges the list of samples into one
# padded mini-batch.
BATCH_SIZE = 1
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [None]:
# Pull the first mini-batch from the loader and print each tensor with its
# shape, as a sanity check of the collate function's output.
data = next(iter(trainloader))

tokens_tensors, segments_tensors, \
    masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([1, 108]) 
tensor([[  101, 24978,  2001,  1037,  2797,  2104,  4270,  4845, 13366, 14074,
          2094,  2005,  2498,  2739,  4900,  2015,  2054,  2000,  3189,  1998,
          2054,  2000,  2110,  2004,  2755,  2023,  2001,  2025,  2739, 13966,
          1998,  2865,  2018, 18555, 27770,  2005,  8866,  2045,  2001,  2053,
          8016,  2009,  2175,  4895, 14289, 28357,  2003, 13044,   102,  2065,
          8275,  2638,  9333,  2865,  4152,  2185,  2007, 16140,  1037,  4845,
          2004,  1996,  2566, 22327, 16259,  1997,  1037,  5223,  4126,  2007,
          2053,  3350,  2059,  2053,  2028,  2003,  3647,  1999,  2270,  4902,
         24978,  3791,  1037,  2488,  3423,  2136,  3648,  3791,  2000,  2022,
          5045,  1998,  2865,  3791,  2000,  2022, 14248,  2030,  2057,  2035,
          2097,  2022,  4071,  1997,  2811,  2038,  6537,   102]])
------------------------
segments_tensors.shape = torch.Size([1, 108])
tensor([[0, 0, 0, 0, 0, 

# 載入 BERT

In [None]:
# Load BERT with a sequence-classification head for the real / fake task.
from transformers import BertForSequenceClassification

PRETRAINED_MODEL_NAME = "bert-base-uncased"
# Must equal the number of entries in FakeNewsDataset.label_map
# ({'real': 0, 'fake': 1}). With the previous value of 3 the head had a third
# output with no label, and an argmax of 2 made the later
# `index_map[x]` lookup raise KeyError.
NUM_LABELS = 2

model = BertForSequenceClassification.from_pretrained(
  PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

# Hide the download / initialisation messages.
clear_output()

# High-level listing of the model's modules; `bert` is expanded one level.
print("""
name            module
----------------------""")
for name, module in model.named_children():
  if name == "bert":
      for n, _ in module.named_children():
          print(f"{name}:{n}")
  else:
      print("{:15} {}".format(name, module))


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=3, bias=True)


# 開 train

In [None]:
def get_predictions(model, dataloader, compute_acc=False):
    """Run `model` over `dataloader` and return the predicted class indices.

    Inference runs in eval mode (so dropout is disabled) and without gradient
    tracking. The model's previous train/eval mode is restored afterwards, so
    calling this in the middle of training does not change the training mode.

    Args:
        model: classifier called as ``model(input_ids=..., token_type_ids=...,
            attention_mask=...)`` whose first output is the logits.
        dataloader: iterable of batches ``(tokens, segments, masks, labels)``;
            ``labels`` may be ``None`` when ``compute_acc`` is False.
        compute_acc: also compute accuracy against the labels in each batch.

    Returns:
        ``predictions`` (1-D tensor of class indices, or ``None`` if the
        dataloader yielded nothing), or ``(predictions, acc)`` when
        ``compute_acc`` is True. ``acc`` is 0.0 for an empty dataloader.
    """
    # Follow the model's own device instead of hard-coding "cuda:0".
    device = next(model.parameters()).device
    was_training = model.training
    model.eval()

    batch_predictions = []
    correct = 0
    total = 0

    try:
        with torch.no_grad():
            for data in dataloader:
                # Move every tensor to the model's device; drop a missing label.
                data = [t.to(device) for t in data if t is not None]

                # The first three tensors are tokens, segments and masks.
                # Pass them by keyword so the order cannot be mixed up.
                tokens_tensors, segments_tensors, masks_tensors = data[:3]
                outputs = model(input_ids=tokens_tensors,
                                token_type_ids=segments_tensors,
                                attention_mask=masks_tensors)

                logits = outputs[0]
                _, pred = torch.max(logits, 1)

                if compute_acc:
                    labels = data[3]
                    total += labels.size(0)
                    correct += (pred == labels).sum().item()

                batch_predictions.append(pred)
    finally:
        # Put the model back into whatever mode the caller had it in.
        model.train(was_training)

    # Concatenate once at the end instead of re-allocating on every batch.
    predictions = torch.cat(batch_predictions) if batch_predictions else None

    if compute_acc:
        acc = correct / total if total else 0.0
        return predictions, acc
    return predictions
    
# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

device: cuda:0
classification acc: 0.17799562072382671


In [None]:
%%time

# Fine-tune every parameter of the classifier with Adam.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


EPOCHS = 6
for epoch in range(EPOCHS):
    # Re-enter training mode every epoch, because the accuracy check at the
    # end of the previous epoch switches the model to eval mode.
    model.train()

    running_loss = 0.0
    for data in trainloader:

        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        # Reset the gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass; passing `labels` makes the model return the loss first.
        outputs = model(input_ids=tokens_tensors,
                        token_type_ids=segments_tensors,
                        attention_mask=masks_tensors,
                        labels=labels)

        loss = outputs[0]
        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Accumulate the batch loss; the printed value is the epoch sum.
        running_loss += loss.item()

    # Measure training accuracy with dropout disabled. This also leaves the
    # model in eval mode after the last epoch, which is what inference needs.
    model.eval()
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))

# 開 Test

In [None]:
# Load the evaluation examples. The context manager guarantees the file
# handle is closed (the previous bare `open(...).read()` never closed it).
# Swap 'dev.json' for 'eval.json' to score the other split.
with open('dev.json', 'rb') as test_file:
  df_test = json.load(test_file)

# Same cleaning as for the training data: clean both texts, then append the
# categories (underscores turned into spaces) to the reply.
for example in df_test:
  example['text'] = preprocess(example['text'])
  example['reply'] = preprocess(example['reply'])
  example['reply'] += ' ' + ' '.join(
      [category.replace('_', ' ') for category in example['categories']])

In [None]:
# Keep the two texts plus the identifiers needed for the submission file,
# and rename the text columns to text_a / text_b.
df_test = (
    pd.DataFrame(df_test)
    .reset_index()
    .loc[:, ['text', 'reply', 'idx', 'context_idx']]
    .rename(columns={'text': 'text_a', 'reply': 'text_b'})
)

# Persist the processed frame as a tsv so the PyTorch Dataset can read it.
df_test.to_csv("test.tsv", sep="\t", index=False)

print("測試樣本數：", len(df_test))
df_test.head()

In [None]:
%%time
# Build the test Dataset / DataLoader. The batch size here is independent of
# the one used for training.
testset = FakeNewsDataset(mode="test", tokenizer=tokenizer)
testloader = DataLoader(
    testset,
    batch_size=1,
    collate_fn=create_mini_batch,
)

# Predict a class index for every test row.
predictions = get_predictions(model, testloader)

# Reverse of label_map: class index -> label text.
index_map = {class_id: label_name
             for label_name, class_id in testset.label_map.items()}

# 生成比賽格式(https://sites.google.com/view/covidfake-emoreact-2021/shared-task/submission-format?authuser=0)

In [None]:
# Turn predicted class indices back into label text.
df = pd.DataFrame({"label": [index_map[class_id] for class_id in predictions.tolist()]})

# Put the identifiers next to the predicted labels and write the submission
# file (use 'eval.csv' instead when predicting the eval split).
df_pred = pd.concat([testset.df[["idx", "context_idx"]], df["label"]], axis=1)
df_pred.to_csv('dev.csv', index=False)