In [20]:
import os
import pandas as pd
import numpy as np

from torch.utils.data import Dataset
import torch
from transformers import BertTokenizer,AutoTokenizer
from IPython.display import clear_output
from sklearn.model_selection import train_test_split


def data_append(data, n):
    """Return a new frame in which every row of ``data`` is repeated ``n`` times.

    Rows are repeated consecutively (row 0 ``n`` times, then row 1, ...), the
    result gets a fresh 0..N-1 index, and the column labels are taken from
    ``data`` itself.

    Args:
        data: source ``pandas.DataFrame``.
        n: number of times each row is repeated.

    Returns:
        A new ``pandas.DataFrame`` with ``len(data) * n`` rows.
    """
    newdf = pd.DataFrame(np.repeat(data.values, n, axis=0))
    # Use the argument's own columns; the previous version read the notebook
    # global `df`, which fails (or mislabels) for any other frame.
    newdf.columns = data.columns
    return newdf

# Load the raw corpus and discard every row that contains a missing value.
df = pd.read_csv("512_all.csv").dropna()

# Hold out 20% of the rows for prediction; the fixed seed keeps the split repeatable.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=10)

# The prediction set is stored without labels and gets a running 0..N-1 id instead.
id_test = list(range(len(df_test)))
df_test = df_test.drop(columns="label")
df_test["id"] = id_test

# Export both splits as tab-separated files and report their sizes.
df_train.to_csv("train.tsv", sep="\t", index=False)
print("訓練樣本數：", len(df_train))

df_test.to_csv("test.tsv", sep="\t", index=False)
print("預測樣本數：", len(df_test))

訓練樣本數： 988
預測樣本數： 247


In [22]:
# Display the held-out frame to check that `label` was dropped and `id` was added.
df_test

Unnamed: 0,text_a,id
123,"成為優秀成功領導人,帶領團隊更上巔峰、創造績效。新人茶會：針對新進人員介紹公司文化與現狀,了...",0
824,"推廣客製化服務,達到個人化精準行銷。深耕數位平台客戶,提升外部獲客能力數位化主要策略是以客戶...",1
874,"為全國唯一具有HCE與掃碼付功能的手機信用卡。在智能金融方面,以大數據及數位平台發展為核心,...",2
699,"並辦理盈餘轉增資42億元,發放前(1年度股票股利每股50元及現金股利每股10元。在公司治理方...",3
363,"﻿107年度營業計畫概要經營方針106年全球經濟穩健擴張,美國於去年3度宣布升息,並持續升息...",4
...,...,...
786,"以因應金融科技之發展。充分投資收益性及安全性較佳之金融商品,加強餘裕資金之管理,以賺取利息收...",242
531,以確保本集團之法令遵循制度及作業符合國內外金融監理之潮流與趨勢。強化共同行銷集團共銷效益：1...,243
399,"並提供保戶完整的專業服務。長期持續致力於商品創新,華南產險在「最佳保險專業」及「最佳保險商品...",244
20,"458萬元,稅前淨損為6億2,760萬元,所得稅利益為7億4,964萬元,稅後純益為1億2,...",245


In [13]:
class FakeNewsDataset(Dataset):
    """Map-style dataset that turns rows of ``<mode>.tsv`` into token-id tensors.

    Each item is ``(tokens_tensor, label_tensor)``:

    * ``tokens_tensor`` holds the ids of ``[CLS] + tokens(text) + [SEP]``.
    * ``label_tensor`` is a scalar tensor built from ``label_map`` in "train"
      mode and ``None`` in "test" mode.

    NOTE(review): sequences are not truncated here; texts that tokenize to more
    positions than the model supports must be shortened upstream.
    """

    def __init__(self, mode, tokenizer):
        """Read the preprocessed ``<mode>.tsv`` file and store the tokenizer.

        Args:
            mode: ``"train"`` or ``"test"``; selects the file and the item layout.
            tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        """
        assert mode in ["train", "test"]  # a full training setup would also want a validation split
        self.mode = mode
        # For very large files consider reading in chunks (``iterator=True``).
        self.df = pd.read_csv(mode + ".tsv", sep="\t").fillna("")
        self.len = len(self.df)
        self.label_map = {'Y': 0, 'N': 1}
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return ``(tokens_tensor, label_tensor)`` for row ``idx``."""
        if self.mode == "test":
            # Scalar lookup of the first column. Slicing with `:1` and taking
            # `.values` would hand the tokenizer a one-element array, not the text.
            text_a = self.df.iloc[idx, 0]
            label_tensor = None
        else:
            text_a, label = self.df.iloc[idx, :].values
            # Map the textual label to its integer index so it can become a tensor.
            label_id = self.label_map[label]
            label_tensor = torch.tensor(label_id)

        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(text_a)
        word_pieces += tokens_a + ["[SEP]"]

        # Convert the whole token sequence into vocabulary indices.
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        return (tokens_tensor, label_tensor)

    def __len__(self):
        """Number of rows read from the tsv file."""
        return self.len
    
    
# Load the tokenizer that belongs to the pretrained "bert-base-chinese" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

# Alternative tokenizer kept for experimentation (currently disabled):
# tokenizer = AutoTokenizer.from_pretrained("schen/longformer-chinese-base-4096")
# Build the training dataset; this reads "train.tsv" from the working directory.
trainset = FakeNewsDataset("train", tokenizer=tokenizer)

In [14]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence


def create_mini_batch(samples):
    """Collate ``(tokens_tensor, label_tensor)`` samples into one padded batch.

    Args:
        samples: list of ``(tokens_tensor, label_tensor)`` pairs. Whether the
            batch is labeled is decided from the first sample only; unlabeled
            samples (e.g. from the test set) carry ``None`` as their label.

    Returns:
        ``(tokens_tensors, masks_tensors, label_ids)`` where

        * ``tokens_tensors`` is the batch zero-padded to its longest sequence,
        * ``masks_tensors`` is a long tensor with 1 at every non-zero token id
          and 0 at padding positions,
        * ``label_ids`` is the stacked label tensor, or ``None`` if unlabeled.
    """
    sequences = [sample[0] for sample in samples]

    # Only labeled batches get a label tensor.
    has_labels = samples[0][1] is not None
    label_ids = torch.stack([sample[1] for sample in samples]) if has_labels else None

    # Zero-pad every sequence to the length of the longest one in the batch.
    tokens_tensors = pad_sequence(sequences, batch_first=True)

    # Attention mask: 1 where a real token sits, 0 on the zero padding.
    masks_tensors = (tokens_tensors != 0).long()

    return tokens_tensors, masks_tensors, label_ids



# Number of samples per mini-batch.
BATCH_SIZE = 64
# Batches are assembled and padded by create_mini_batch.
# NOTE(review): no `shuffle=True` is passed, so batches follow the row order of
# the dataset on every pass — confirm this is intended for training.
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)

In [15]:
# Pick the first sample
sample_idx = 0

# Fetch the raw text and label for comparison
text_a, label = trainset.df.iloc[sample_idx].values

# Fetch the converted id tensors through the Dataset built above
tokens_tensor, label_tensor = trainset[sample_idx]

# Map tokens_tensor back to token strings
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
# NOTE: combined_text is not used by the print below
combined_text = "".join(tokens)

# Show the sample before and after conversion; this is a plain print, see the cell output
print(f"""[原始文本]
句子 1：{text_a}
分類  ：{label}

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：{tokens_tensor}

label_tensor   ：{label_tensor}

--------------------

""")

[原始文本]
句子 1：9-7%健傷保險618,297699,552(81,2-6%財責保險1,472,5731,413,22659,347+2%運輸保險642,762690,092(47,3-9%合計7,333,8817,221,241112
分類  ：N

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：tensor([  101,   130,   118,   128,   110,   978,  1003,   924,  7402,  9048,
          117, 11992,  9398,  8160,   117,  8222,  8144,   113,  8424,   117,
          123,   118,   127,   110,  6512,  6519,   924,  7402,   122,   117,
         8264,  8144,   117,  8272,  8805,   117, 12561,   117, 10436,  9632,
          117, 12936,   116,   123,   110,  6880,  6745,   924,  7402,  8308,
         8144,   117,  8399,  8756,  8599,   117,  8141,  8144,   113,  8264,
          117,   124,   118,   130,   110,  1394,  6243,   128,   117, 10745,
          117,  8302,  8408,   117, 10118,   117, 10896,  8452,  8144,   102])

label_tensor   ：1

--------------------




In [16]:
# Walk the whole loader once and print the shape of every tensor in each batch
# as a sanity check on padding and batch sizes. A DataLoader is directly
# iterable, so no explicit iter() or throwaway alias is needed.
for data in trainloader:
    tokens_tensors, masks_tensors, label_ids = data

    print(tokens_tensors.shape)
    print(masks_tensors.shape)
    print(label_ids.shape)

torch.Size([64, 407])
torch.Size([64, 407])
torch.Size([64])
torch.Size([64, 411])
torch.Size([64, 411])
torch.Size([64])
torch.Size([64, 446])
torch.Size([64, 446])
torch.Size([64])
torch.Size([64, 413])
torch.Size([64, 413])
torch.Size([64])
torch.Size([64, 432])
torch.Size([64, 432])
torch.Size([64])
torch.Size([64, 437])
torch.Size([64, 437])
torch.Size([64])
torch.Size([64, 432])
torch.Size([64, 432])
torch.Size([64])
torch.Size([64, 409])
torch.Size([64, 409])
torch.Size([64])
torch.Size([64, 399])
torch.Size([64, 399])
torch.Size([64])
torch.Size([64, 397])
torch.Size([64, 397])
torch.Size([64])
torch.Size([64, 447])
torch.Size([64, 447])
torch.Size([64])
torch.Size([64, 429])
torch.Size([64, 429])
torch.Size([64])
torch.Size([64, 431])
torch.Size([64, 431])
torch.Size([64])
torch.Size([64, 430])
torch.Size([64, 430])
torch.Size([64])
torch.Size([64, 445])
torch.Size([64, 445])
torch.Size([64])
torch.Size([28, 422])
torch.Size([28, 422])
torch.Size([28])


In [17]:
# Pull a single batch from the loader to inspect its contents.
data = next(iter(trainloader))

# Each batch is the 3-tuple produced by create_mini_batch.
tokens_tensors,masks_tensors, label_ids = data

# Print the shape and the values of every tensor in the batch.
print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([64, 407]) 
tensor([[ 101,  130,  118,  ...,    0,    0,    0],
        [ 101, 8632, 2399,  ...,    0,    0,    0],
        [ 101, 2990, 1285,  ...,    0,    0,    0],
        ...,
        [ 101, 3300, 7302,  ...,    0,    0,    0],
        [ 101, 3313,  889,  ...,    0,    0,    0],
        [ 101, 4948, 3513,  ...,    0,    0,    0]])
------------------------
masks_tensors.shape    = torch.Size([64, 407])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
------------------------
label_ids.shape        = torch.Size([64])
tensor([1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,
        1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
        0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1])



In [18]:
from transformers import BertForSequenceClassification,AutoModel

# Name of the pretrained checkpoint to load.
PRETRAINED_MODEL_NAME = "bert-base-chinese"
# PRETRAINED_MODEL_NAME = "schen/longformer-chinese-base-4096"
# Number of target classes; matches the two entries of the dataset's label_map.
NUM_LABELS = 2

# Load the pretrained encoder together with a sequence-classification head.
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)
# Clear whatever the loading step wrote to the cell output.
clear_output()

# High-level listing of the modules inside this model
print("""
name            module
----------------------""")
# For the "bert" child only the names of its sub-modules are printed;
# every other child is printed with its full repr.
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=2, bias=True)


In [19]:
# Inspect the configuration object attached to the loaded model.
model.config

BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

In [None]:
"""
定義一個可以針對特定 DataLoader 取得模型預測結果以及分類準確度的函式
之後也可以用來生成上傳到 Kaggle 競賽的預測結果

2019/11/22 更新：在將 `tokens`、`segments_tensors` 等 tensors
丟入模型時，強力建議指定每個 tensor 對應的參數名稱，以避免 HuggingFace
更新 repo 程式碼並改變參數順序時影響到我們的結果。
"""

def get_predictions(model, dataloader, compute_acc=False):
    """Run ``model`` over ``dataloader`` and collect the predicted class indices.

    The model is put into evaluation mode for the duration of the call, so that
    dropout does not perturb the predictions, and its previous train/eval mode
    is restored before returning (also when an exception is raised).

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            whose first output element is the logits tensor.
        dataloader: iterable yielding ``(tokens_tensors, masks_tensors,
            label_ids)`` batches; ``label_ids`` may be ``None`` when
            ``compute_acc`` is False.
        compute_acc: when True, also compare the predictions with ``label_ids``
            and return the fraction of correct predictions.

    Returns:
        ``predictions`` — a 1-D tensor of class indices (``None`` if the loader
        yielded no batches) — or ``(predictions, acc)`` when ``compute_acc`` is
        True.

    Raises:
        ZeroDivisionError: if ``compute_acc`` is True and no sample was seen.
    """
    batch_preds = []
    correct = 0
    total = 0

    # Follow the device the model actually lives on instead of assuming "cuda:0".
    device = next(model.parameters()).device

    # Remember the caller's mode so a training loop can continue unaffected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Iterate over the whole dataset
            for data in dataloader:
                # Move the tensors to the model's device; unlabeled batches
                # carry None as their label entry, which is dropped here.
                data = [t.to(device) for t in data if t is not None]

                # The first two tensors are the token ids and the attention mask.
                # They are passed by keyword so the call does not depend on the
                # positional order of the model's forward arguments.
                tokens_tensors, masks_tensors = data[:2]
                outputs = model(input_ids=tokens_tensors, attention_mask=masks_tensors)
                logits = outputs[0]
                _, pred = torch.max(logits, 1)

                # Accumulate the counts needed for the accuracy
                if compute_acc:
                    labels = data[2]
                    total += labels.size(0)
                    correct += (pred == labels).sum().item()

                # Keep this batch's predictions; they are concatenated once at the end
                batch_preds.append(pred)
    finally:
        model.train(was_training)

    predictions = torch.cat(batch_preds) if batch_preds else None

    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions
    
# Move the model to the GPU when one is available and measure the
# classification accuracy on the training set
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
# Only the accuracy is needed here; the predictions themselves are discarded.
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

device: cpu


In [None]:
%%time

# Update every parameter of the classification model with the Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


EPOCHS = 3
for epoch in range(EPOCHS):
    # Enter training mode at the start of every epoch: the evaluation at the end
    # of the previous epoch leaves the model in eval mode.
    model.train()

    # Sum (not mean) of the batch losses seen in this epoch.
    running_loss = 0.0
    for data in trainloader:

        tokens_tensors, masks_tensors, labels = [t.to(device) for t in data]

        # Reset the parameter gradients
        optimizer.zero_grad()

        # Forward pass; passing `labels` makes the model return the loss first
        outputs = model(input_ids=tokens_tensors,
                        attention_mask=masks_tensors,
                        labels=labels)

        loss = outputs[0]
        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # Record the current batch loss
        running_loss += loss.item()

    # Measure the classification accuracy in eval mode so that dropout does not
    # randomise the predictions. The model stays in eval mode after the last epoch.
    model.eval()
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))