<a href="https://colab.research.google.com/github/tinginde/Fakenews_detection_bert/blob/main/fakenews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 資料前處理

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [3]:
df_train = pd.read_csv("train.csv", sep=',')

In [4]:
empty_title = ((df_train['title2_zh'].isnull()) \
               | (df_train['title1_zh'].isnull()) \
               | (df_train['title2_zh'] == '') \
               | (df_train['title2_zh'] == '0'))

In [5]:
df_train = df_train[~empty_title]

In [None]:
df_train

In [6]:
MAX_LENGTH = 30
df_train = df_train[~(df_train.title1_zh.apply(lambda x : len(x)) > MAX_LENGTH)]
df_train = df_train[~(df_train.title2_zh.apply(lambda x : len(x)) > MAX_LENGTH)]

In [7]:
SAMPLE_FRAC = 0.01
df_train = df_train.sample(frac=SAMPLE_FRAC, random_state=9527)

In [8]:
df_train = df_train.reset_index()
df_train = df_train.loc[:, ['title1_zh', 'title2_zh', 'label']]
df_train.columns = ['text_a', 'text_b', 'label']

In [10]:
df_train.to_csv("train.tsv", sep="\t", index=False)

In [None]:
print("訓練樣本數：", len(df_train))
df_train.head()

In [12]:
df_train.label.value_counts() / len(df_train)

unrelated    0.679338
agreed       0.294317
disagreed    0.026346
Name: label, dtype: float64

In [13]:
df_test = pd.read_csv("test.csv", sep=",")

In [14]:
df_test = df_test.loc[:, ["title1_zh", "title2_zh", "id"] ]

In [17]:
df_test.columns = ["text_a", "text_b", "Id"]
df_test.to_csv("test.tsv", sep="\t", index=False)

In [None]:
print("預測樣本數：", len(df_test))
df_test.tail()

In [19]:
ratio = len(df_test) / len(df_train)
print("測試集樣本數 / 訓練集樣本數 = {:.1f} 倍".format(ratio))

測試集樣本數 / 訓練集樣本數 = 30.2 倍


# 實作BERT
## 要把資料轉換成BERT需要的輸入

## 建立Dataset

In [20]:
!pip install transformers

[K     |████████████████████████████████| 2.8 MB 5.2 MB/s 
[K     |████████████████████████████████| 131 kB 44.8 MB/s 
[K     |████████████████████████████████| 636 kB 28.4 MB/s 
[K     |████████████████████████████████| 50 kB 6.0 MB/s 
[K     |████████████████████████████████| 895 kB 48.8 MB/s 
[K     |████████████████████████████████| 3.3 MB 33.0 MB/s 
[K     |████████████████████████████████| 7.9 MB 35.3 MB/s 
[K     |████████████████████████████████| 79 kB 7.1 MB/s 
[K     |████████████████████████████████| 138 kB 46.8 MB/s 
[K     |████████████████████████████████| 127 kB 45.5 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
[?25h

In [21]:
import torch
from transformers import BertTokenizer
from IPython.display import clear_output

PRETRAINED_MODEL_NAME = "bert-base-chinese"  # 指定繁簡中文 BERT-BASE 預訓練模型

# 取得此預訓練模型所使用的 tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

Downloading:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/269k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624 [00:00<?, ?B/s]

In [22]:
from torch.utils.data import Dataset

In [24]:
class FakeNewsDataset(Dataset):
  # 讀取處理後的data，並初始化參數
  def __init__(self, mode, tokenizer):
     assert mode in ["train", "test"] 
     self.mode = mode
     self.df = pd.read_csv(mode+".tsv", sep="\t").fillna("")
     self.len = len(self.df)
     self.label_map = {'agreed': 0, 'disagreed': 1, 'unrelated': 2}
     self.tokenizer = tokenizer

  # 定義一筆回傳訓練與測試的函式
  def __getitem__(self, idx):
    if self.mode == "test":
      text_a, text_b = self.df.iloc[idx, :2].values
      label_tensor = None
    else:
      text_a, text_b, label = self.df.iloc[idx, :].values
      label_id = self.label_map[label]
      label_tensor = torch.tensor(label_id)

    # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
    word_pieces = ["[CLS]"]
    tokens_a = self.tokenizer.tokenize(text_a)
    word_pieces += tokens_a + ["[SEP]"]
    len_a = len(word_pieces)

    # 建立第二個句子的 BERT tokens
    tokens_b = self.tokenizer.tokenize(text_b)
    word_pieces += tokens_b + ["[SEP]"]
    len_b = len(word_pieces) - len_a
    # 將整個 token 序列轉換成索引序列
    ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
    toeken_tensor = torch.tensor(ids)
    # 將第一句包含 [SEP] 的 token 位置設為 0，其他為 1 表示第二句
    segments_tensor = torch.tensor([0]*len_a+ [1]*len_b, dtype = torch.long)

    return (toeken_tensor, segments_tensor, label_tensor)

  def __len__(self):
    return self.len

trainset = FakeNewsDataset("train", tokenizer=tokenizer)

## 建立Dataloader，把data放進來






In [25]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [26]:
def create_mini_batch(samples):
  tokens_tensors = [s[0] for s in samples]
  segments_tensors = [s[1] for s in samples]

  # 測試集有 labels
  if samples[0][2] is not None:
      label_ids = torch.stack([s[2] for s in samples])
  else:
      label_ids = None

  # zero pad 到同一序列長度
  tokens_tensors = pad_sequence(tokens_tensors,batch_first=True)
  segments_tensors = pad_sequence(segments_tensors,batch_first=True)

  # 處理attention masks，zero padding 位置不需要attention，所以其他位置設成1
  masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long)
  masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)

  return tokens_tensors, segments_tensors, masks_tensors, label_ids

初始化一個每次回傳 64 個訓練樣本的 DataLoader
，利用 `collate_fn` 將 list of samples 合併成一個 mini-batch 是關鍵

In [27]:
BATCH_SIZE = 64
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)

拿出一個mini-batch內容來看

In [28]:
data = next(iter(trainloader))

tokens_tensors, segments_tensors, masks_tensors, label_ids = data

In [None]:
print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")

# 載入BERT可做中文多分類(multi-class)模型

In [29]:
from transformers import BertForSequenceClassification

In [30]:
PRETRAINED_MODEL_NAME = "bert-base-chinese"
NUM_LABELS = 3

In [31]:
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

clear_output()

In [102]:
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=3, bias=True)


#預測

In [32]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
    with torch.no_grad():
      for data in dataloader:
        if next(model.parameters()).is_cuda:
          data = [t.to("cuda:0") for t in data if t is not None]
        tokens_tensors, segments_tensors, masks_tensors = data[:3]
        outputs = model(input_ids=tokens_tensors, 
                token_type_ids=segments_tensors, 
                attention_mask=masks_tensors)
        logits = outputs[0]
        # 返回每一行中最大值的那个元素，且返回其索引(也就是哪一個class是最有可能的答案)
        _, pred = torch.max(logits.data, 1)

        # 用來計算訓練集的分類準確率
        if compute_acc:
          labels = data[3]
          # size()函数主要是用来统计矩阵元素个数，或矩阵某一维上的元素个数的函数。 
          total += labels.size(0)
          correct += (pred == labels).sum().item()

        # 將當前 batch 記錄下來
        if predictions is None:
          predictions = pred
        # torch.cat: 讓tensor在設定的dim，cat起來
        else:
          predictions = torch.cat((predictions, pred))
    if compute_acc:
      acc = correct / total
      return predictions, acc
    return predictions

In [33]:
# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

device: cuda:0
classification acc: 0.30033872788859617


# 訓練該下游任務模式

In [34]:
# 訓練模式
model.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [35]:
# 使用 Adam Optim 更新整個分類模型的參數
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [36]:
EPOCHS = 6  # 幸運數字
for epoch in range(EPOCHS):

  running_loss = 0.0
  for data in trainloader:

    tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]
    # 將參數梯度歸零
    optimizer.zero_grad()
    # forward pass
    output = model(input_ids=tokens_tensors, 
            token_type_ids=segments_tensors, 
            attention_mask=masks_tensors,
            labels = labels)
    loss = output[0]
    # backward
    loss.backward()
    optimizer.step()
    # 紀錄當前 batch loss
    running_loss += loss.item()
  _, acc = get_predictions(model, trainloader, compute_acc=True)
  print('[epoch %d] loss:%.3f ,acc:%.3f' % (epoch+1, running_loss, acc))

[epoch 1] loss:30.653 ,acc:0.833
[epoch 2] loss:17.536 ,acc:0.883
[epoch 3] loss:13.395 ,acc:0.924
[epoch 4] loss:9.381 ,acc:0.956
[epoch 5] loss:7.012 ,acc:0.912
[epoch 6] loss:5.931 ,acc:0.979


In [None]:
# 用測試集來取得預測結果
testset = FakeNewsDataset("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=256, collate_fn=create_mini_batch)
predictions = get_predictions(model, testloader)

In [None]:
# 把預測出的label id 換成 文字
index_map = {v:k for k,v in testset.label_map.item()}
df = DataFrame({"Category": predictions.tolist()})
df["Category"] = df.Category.apply(lamda x: index_map[x])
df