In [1]:
!pip install transformers tqdm boto3 requests regex -q

[K     |████████████████████████████████| 573kB 3.5MB/s 
[K     |████████████████████████████████| 3.7MB 33.6MB/s 
[K     |████████████████████████████████| 1.0MB 45.0MB/s 
[K     |████████████████████████████████| 890kB 47.0MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [3]:
cd gdrive/My\ Drive/Colab Notebooks

/content/gdrive/My Drive/Colab Notebooks


In [0]:
import pandas as pd

In [5]:
df_train = pd.read_excel("stock_data/stockdata_withdo_new_onecompany_train.xlsx")
df_test = pd.read_excel("stock_data/stockdata_withdo_new_onecompany_test.xlsx")
df_dev = pd.read_excel("stock_data/stockdata_withdo_new_onecompany_dev.xlsx")

ratio = len(df_test) / len(df_train)
print("測試集樣本數 / 訓練集樣本數 = {:.2f} 倍".format(ratio))


測試集樣本數 / 訓練集樣本數 = 0.12 倍


In [0]:
'''
MAX_LENGTH = 300

df_train = df_train[~(df_train.content.apply(lambda x : len(x)) > MAX_LENGTH)]
df_test = df_test[~(df_test.content.apply(lambda x : len(x)) > MAX_LENGTH)]
'''
import pandas as pd
from pandas import DataFrame 
l1 = []
l2 = []

for i, row in df_train.iterrows():
    content = row['content'][:100]
    l1.extend([content])
    l2.extend([row['labeling']])
    
df_train = DataFrame({'content': l1, 'labeling': l2})

l1 = []
l2 = []

for i, row in df_test.iterrows():
    content = row['content'][:100]
    l1.extend([content])
    l2.extend([row['labeling']])
    
df_test = DataFrame({'content': l1, 'labeling': l2})

l1 = []
l2 = []

for i, row in df_dev.iterrows():
    content = row['content'][:100]
    l1.extend([content])
    l2.extend([row['labeling']])
    
df_dev = DataFrame({'content': l1, 'labeling': l2})

In [0]:
# 去除不必要欄位

## (測試用)
# SAMPLE_FRAC = 0.001
# df_train = df_train.sample(frac=SAMPLE_FRAC, random_state=9527)

df_train = df_train.reset_index()
df_train = df_train.loc[:, ['content', 'labeling']]
df_train.columns = ['content', 'labeling']
df_train.to_csv("train.tsv", sep="\t", index=False)

df_test = df_test.loc[:, ['content', 'labeling']]
df_test.columns = ['content', 'labeling']
df_test.to_csv("test.tsv", sep="\t", index=False)

df_dev = df_dev.loc[:, ['content', 'labeling']]
df_dev.columns = ['content', 'labeling']
df_dev.to_csv("dev.tsv", sep="\t", index=False)

In [8]:
print(len(df_train))

11377


In [0]:
"""
實作一個可以用來讀取訓練 / 測試集的 Dataset，這是你需要徹底了解的部分。
此 Dataset 每次將 tsv 裡的一筆成對句子轉換成 BERT 相容的格式，並回傳 3 個 tensors：
- tokens_tensor：兩個句子合併後的索引序列，包含 [CLS] 與 [SEP]
- segments_tensor：可以用來識別兩個句子界限的 binary tensor
- label_tensor：將分類標籤轉換成類別索引的 tensor, 如果是測試集則回傳 None
"""

from torch.utils.data import Dataset
from albert_zh import AlbertTokenizer
import torch

PRETRAINED_MODEL_NAME = "albert_base/vocab.txt"
tokenizer = AlbertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
    
class StockDataset(Dataset):
    # 讀取前處理後的 tsv 檔並初始化一些參數
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "test", "dev"]  # 一般訓練你會需要 dev set
        self.mode = mode
        # 大數據你會需要用 iterator=True
        self.df = pd.read_csv(mode + ".tsv", sep="\t").fillna("")
        self.len = len(self.df)
        self.tokenizer = tokenizer  # 我們將使用 BERT tokenizer
    
    # 定義回傳一筆訓練 / 測試數據的函式
    def __getitem__(self, idx):
        if self.mode == "test":
            content, label = self.df.iloc[idx, :2].values
            label_id = label
            label_tensor = torch.tensor(label_id)
        else:
            content, label = self.df.iloc[idx, :2].values
            # 將 label 文字也轉換成索引方便轉換成 tensor
            label_id = label
            label_tensor = torch.tensor(label_id)
            
        # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(content)
        word_pieces += tokens_a + ["[SEP]"]
        len_a = len(word_pieces)
                
        # 將整個 token 序列轉換成索引序列
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # 將第一句包含 [SEP] 的 token 位置設為 0，其他為 1 表示第二句
        segments_tensor = torch.tensor([0] * len_a , 
                                        dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len
    
    
# 初始化一個專門讀取訓練樣本的 Dataset，使用中文 BERT 斷詞
trainset = StockDataset("train", tokenizer=tokenizer)

devset = StockDataset("dev", tokenizer=tokenizer)

In [10]:
# 選擇第一個樣本
sample_idx = 1

# 將原始文本拿出做比較
content, labeling = trainset.df.iloc[sample_idx].values

# 利用剛剛建立的 Dataset 取出轉換後的 id tensors
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# 將 tokens_tensor 還原成文本
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = "".join(tokens)

# 渲染前後差異，毫無反應就是個 print。可以直接看輸出結果
print(f"""[原始文本]
句子 1：{content}
分類  ：{labeling}

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：{tokens_tensor}

segments_tensor：{segments_tensor}

label_tensor   ：{label_tensor}

--------------------

[還原 tokens_tensors]
{combined_text}
""")


[原始文本]
句子 1： 2 上 週五 亞洲盤 漲跌 互見 台股 在 本 周 二 即將 封關 成交量 下滑 台積電 止穩 中小型股 維繫 市場 多方 氣氛 終場 指數 上漲 1334 點 以 933146 點 作收 成交量 
分類  ：1

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：tensor([ 101,  123,  677, 6867,  758,  765, 3828, 4676, 4039, 6649,  757, 6210,
        1378, 5500, 1762, 3315, 1453,  753, 1315, 2200, 2196, 7302, 2768,  769,
        7030,  678, 3998, 1378, 4948, 7442, 3632, 4952,  704, 2207, 1798, 5500,
        5204, 5258, 2356, 1842, 1914, 3175, 3706, 3702, 5173, 1842, 2900, 3149,
         677, 4039, 9246, 8159, 7953,  809, 8430, 8805, 9340, 7953,  868, 3119,
        2768,  769, 7030,  102])

segments_tensor：tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

label_tensor   ：1

--------------------

[還原 tokens_tensors]
[CLS]2上週五亞洲盤漲跌互見台股在本周二即將封關成交量下滑台積電止穩中小型股維繫市場多方氣氛終場指數上漲133##4點以93##31##46點作收成交量[SEP]



In [0]:
"""
實作可以一次回傳一個 mini-batch 的 DataLoader
這個 DataLoader 吃我們上面定義的 `FakeNewsDataset`，
回傳訓練 BERT 時會需要的 4 個 tensors：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# 這個函式的輸入 `samples` 是一個 list，裡頭的每個 element 都是
# 剛剛定義的 `FakeNewsDataset` 回傳的一個樣本，每個樣本都包含 3 tensors：
# - tokens_tensor
# - segments_tensor
# - label_tensor
# 它會對前兩個 tensors 作 zero padding，並產生前面說明過的 masks_tensors
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 訓練集有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(
        tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids


# 初始化一個每次回傳 64 個訓練樣本的 DataLoader
# 利用 `collate_fn` 將 list of samples 合併成一個 mini-batch 是關鍵
BATCH_SIZE = 16
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

devloader = DataLoader(devset, batch_size=BATCH_SIZE, 
                        collate_fn=create_mini_batch)

In [12]:
data = next(iter(trainloader))

tokens_tensors, segments_tensors, \
    masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([16, 65]) 
tensor([[ 101, 4078, 3828,  ...,    0,    0,    0],
        [ 101,  123,  677,  ..., 7030,  102,    0],
        [ 101, 1369, 1912,  ...,    0,    0,    0],
        ...,
        [ 101, 5445, 3300,  ..., 4245,  102,    0],
        [ 101, 5401, 5500,  ..., 7274, 1993,  102],
        [ 101, 4676, 1248,  ...,    0,    0,    0]])
------------------------
segments_tensors.shape = torch.Size([16, 65])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
------------------------
masks_tensors.shape    = torch.Size([16, 65])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])
------------------------
label_ids.shape        

In [13]:
from albert_zh import AlbertForSequenceClassification, AlbertConfig

PRETRAINED_MODEL_NAME = ""
NUM_LABELS = 3

model_config = AlbertConfig.from_json_file('./albert_base/config.json')
model = AlbertForSequenceClassification.from_pretrained(
    "albert_base",config = model_config)

# model = torch.load('Albert_model/Albert0226.pkl')

#clear_output()

# high-level 顯示此模型裡的 modules
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=2, bias=True)


In [14]:
"""
定義一個可以針對特定 DataLoader 取得模型預測結果以及分類準確度的函式

GPU 跑會有 cuda runtime error 的問題，所以先使用 CPU 跑不會有狀況!

"""

def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
    tp = 0
    tn = 0
    fp = 0
    fn = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()

                tp += (labels * pred).sum().to(torch.float32)
                tn += ((1 - labels) * (1 - pred)).sum().to(torch.float32)
                fp += ((1 - labels) * pred).sum().to(torch.float32)
                fn += (labels * (1 - pred)).sum().to(torch.float32)
                                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total

        #epsilon = 1e-7
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        fscore = (2 * precision * recall) / (precision + recall)
        return predictions, acc, precision, recall, fscore    
    return predictions
  

# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
# _, acc, precision, recall, fscore = get_predictions(model, trainloader, compute_acc=True)
# print("classification acc:", acc)


device: cuda:0


In [15]:
def get_learnable_params(module):
    return [p for p in module.parameters() if p.requires_grad]
     
model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)

print(f"""
整個分類模型的參數量：{sum(p.numel() for p in model_params)}
線性分類器的參數量：{sum(p.numel() for p in clf_params)}
""")


整個分類模型的參數量：10878978
線性分類器的參數量：1538



In [16]:
model.config

{
  "attention_probs_dropout_prob": 0.0,
  "directionality": "bidi",
  "embedding_size": 128,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "ln_type": "postln",
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "pruned_heads": {},
  "share_type": "all",
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 21128
}

In [17]:
%%time

import time

# 訓練模式
model.train()

# 使用 Adam Optim 更新整個分類模型的參數
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

EPOCHS = 100   # 預設為 6
for epoch in range(EPOCHS):
    
    tStart=time.time()
    running_loss = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()

        # 紀錄當前 batch loss
        running_loss += loss.item()
 
    # if (epoch+1) % 5 == 0 and epoch > 0 :
    #     # 計算分類準確率
    #     _, acc, precision, recall, fscore = get_predictions(model, devloader, compute_acc=True)

    #     print('[epoch %d] loss: %.3f, acc: %.3f, precision: %.3f, recall: %.3f, fscore: %.3f' % 
    #           (epoch + 1, running_loss, acc, precision, recall, fscore))
    
    #     tEnd=time.time()
    #     print("\n It cost %f sec"%(tEnd-tStart))

    if (epoch+1) % 5 == 0 and epoch > 0 :
        # 計算分類準確率
        _, acc, precision, recall, fscore = get_predictions(model, devloader, compute_acc=True)

        print('[epoch %d] loss: %.3f, acc: %.3f, precision: %.3f, recall: %.3f, fscore: %.3f' % 
              (epoch + 1, running_loss, acc, precision, recall, fscore))
    
        tEnd=time.time()
        print("\n It cost %f sec"%(tEnd-tStart))

    

[epoch 5] loss: 280.881, acc: 0.606, precision: 0.654, recall: 0.610, fscore: 0.631

 It cost 105.193107 sec
[epoch 10] loss: 118.409, acc: 0.621, precision: 0.642, recall: 0.713, fscore: 0.676

 It cost 104.905291 sec
[epoch 15] loss: 93.395, acc: 0.623, precision: 0.644, recall: 0.713, fscore: 0.677

 It cost 104.880034 sec
[epoch 20] loss: 87.152, acc: 0.624, precision: 0.678, recall: 0.611, fscore: 0.643

 It cost 105.034907 sec
[epoch 25] loss: 75.830, acc: 0.618, precision: 0.662, recall: 0.632, fscore: 0.646

 It cost 104.981254 sec
[epoch 30] loss: 67.326, acc: 0.610, precision: 0.667, recall: 0.588, fscore: 0.625

 It cost 105.037210 sec
[epoch 35] loss: 75.203, acc: 0.637, precision: 0.671, recall: 0.674, fscore: 0.673

 It cost 104.782430 sec
[epoch 40] loss: 66.482, acc: 0.610, precision: 0.665, recall: 0.596, fscore: 0.628

 It cost 104.929706 sec
[epoch 45] loss: 64.591, acc: 0.611, precision: 0.684, recall: 0.550, fscore: 0.610

 It cost 105.050560 sec
[epoch 50] loss: 5

In [18]:
%%time

# 建立測試集
testset = StockDataset("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=32, 
                        collate_fn=create_mini_batch)

_, acc, precision, recall, fscore = get_predictions(model, testloader, compute_acc=True)

print('acc: %.3f, precision: %.3f, recall: %.3f, fscore: %.3f' % 
      (acc, precision, recall, fscore))

acc: 0.619, precision: 0.708, recall: 0.666, fscore: 0.686
CPU times: user 3.65 s, sys: 864 ms, total: 4.52 s
Wall time: 4.52 s


In [0]:
# torch.save(model, 'Albert50_50.pkl')

In [0]:
# torch.save(model.state_dict(), 'Bert0206_params.pkl')