載入模型所必須要的相依套件

In [None]:
import torch
import transformers
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from transformers import BertModel, BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

%matplotlib inline


讀取先前由data_extract.py所處理完的資料，並將資料區分成train(49500筆)與validate(500筆)兩個部分

In [None]:
# Load the pre-processed corpora and shuffle each one.
# NOTE: the shuffles are unseeded, so the rows that end up in each split
# differ from run to run.
TRAIN = pd.read_json("./data/train.json")
TRAIN = TRAIN.sample(frac=1).reset_index(drop=True)
VAL = pd.read_json("./data/test.json")
VAL = VAL.sample(frac=1).reset_index(drop=True)

# Keep the first 500 shuffled test rows for validation and fold the rest into
# the training set. DataFrame.append was removed in pandas 2.0, so pd.concat is
# used instead; ignore_index=True rebuilds the 0..n-1 index exactly like the
# former .reset_index(drop=True).
TRAIN = pd.concat([TRAIN, VAL.iloc[500:]], ignore_index=True)
VAL = VAL.iloc[:500]


選擇所使用的預訓練模型所搭配的分詞器(Tokenizer)

In [None]:
# Name of the pretrained BERT checkpoint whose tokenizer is loaded below.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
# Tokenizer matching the checkpoint above; used by the tokenization and
# encoding cells of this notebook.
TOKENIZER = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

查看一般語句在BERT Tokenizer分詞過後的情形

In [None]:
# Take the first 250 characters of the first training comment as a sample.
text = TRAIN.loc[0, "comment"][:250]

# Split the sample into tokens, then look up each token's vocabulary id.
tokens = TOKENIZER.tokenize(text)
token_ids = TOKENIZER.convert_tokens_to_ids(tokens)

# Show only the first 50 entries of each list to keep the output short.
for preview in (tokens, token_ids):
    print(preview[:50])

['Titanic', 'directed', 'by', 'James', 'Cameron', 'presents', 'a', 'fictional', 'love', 'story', 'on', 'the', 'historical', 'setting', 'of', 'the', 'Titanic', '.', 'The', 'plot', 'is', 'simple', ',', 'non', '##com', '##plicate', '##d', ',', 'or', 'not', 'for', 'those', 'who', 'love', 'plots', 'that', 'twist', 'and', 'turn', 'and', 'keep', 'you', 'in', 'su', '##spense', '.', 'The', 'end', 'of', 'the']
[24342, 2002, 1118, 1600, 6681, 8218, 170, 6725, 1567, 1642, 1113, 1103, 3009, 3545, 1104, 1103, 24342, 119, 1109, 4928, 1110, 3014, 117, 1664, 8178, 21379, 1181, 117, 1137, 1136, 1111, 1343, 1150, 1567, 15836, 1115, 11079, 1105, 1885, 1105, 1712, 1128, 1107, 28117, 21643, 119, 1109, 1322, 1104, 1103]


查看各個特殊Token在BERT分詞器中的編碼

In [None]:
# Print each special token next to its vocabulary id, in the order
# separator, classifier, padding, unknown.
for kind in ("sep", "cls", "pad", "unk"):
    print(getattr(TOKENIZER, f"{kind}_token"), getattr(TOKENIZER, f"{kind}_token_id"))

[SEP] 102
[CLS] 101
[PAD] 0
[UNK] 100


依照資料集中各筆語料分詞過後的長度選擇最大編碼長度(MAX_SEQ_LEN)

In [None]:
# Count the tokens produced for every training comment and store the count
# in a new "token_number" column.
TRAIN["token_number"] = TRAIN["comment"].map(
    lambda comment: len(TOKENIZER.tokenize(comment))
)
# Summary statistics of the token counts (displayed as the cell output).
TRAIN["token_number"].describe()

count    49500.000000
mean       317.248424
std        238.372051
min          8.000000
25%        169.000000
50%        236.000000
75%        386.000000
max       3238.000000
Name: token_number, dtype: float64

In [None]:
# Fixed length of every encoded sequence; passed as `max_length` to the
# tokenizer in the encoding cell.
# NOTE(review): the token-count summary in this notebook reports a 25th
# percentile of 169 tokens, so most comments exceed this length — presumably a
# deliberate speed/memory trade-off; confirm.
MAX_SEQ_LEN = 160

BERT分詞器編碼的範例，其產生物件包含input_ids與attention_mask

In [None]:
# Encode the sample text into fixed-length PyTorch tensors.
# `pad_to_max_length=True` is deprecated in transformers; the explicit
# `padding='max_length'` / `truncation=True` pair is its documented replacement
# (requires transformers >= 3.0) and keeps the result the same: sequences are
# padded up to, or cut down to, MAX_SEQ_LEN.
encoding = TOKENIZER.encode_plus(
  text,
  max_length=MAX_SEQ_LEN,
  add_special_tokens=True,       # add the [CLS] / [SEP] markers
  return_token_type_ids=False,
  padding='max_length',
  truncation=True,
  return_attention_mask=True,
  return_tensors='pt',
)

# Token ids of the (single) encoded sequence, then the attention mask.
print(encoding["input_ids"][0])
print(encoding["attention_mask"])

tensor([[  101, 24342,  2002,  1118,  1600,  6681,  8218,   170,  6725,  1567,
          1642,  1113,  1103,  3009,  3545,  1104,  1103, 24342,   119,  1109,
          4928,  1110,  3014,   117,  1664,  8178, 21379,  1181,   117,  1137,
          1136,  1111,  1343,  1150,  1567, 15836,  1115, 11079,  1105,  1885,
          1105,  1712,  1128,  1107, 28117, 21643,   119,  1109,  1322,  1104,
          1103,  2523,  1169,  1129,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

查看編碼的原型: [CLS]...[SEP]....

In [None]:
# Map the encoded ids back to token strings to inspect the final layout.
decoded_tokens = TOKENIZER.convert_ids_to_tokens(encoding["input_ids"][0])
print(decoded_tokens)

['[CLS]', 'Titanic', 'directed', 'by', 'James', 'Cameron', 'presents', 'a', 'fictional', 'love', 'story', 'on', 'the', 'historical', 'setting', 'of', 'the', 'Titanic', '.', 'The', 'plot', 'is', 'simple', ',', 'non', '##com', '##plicate', '##d', ',', 'or', 'not', 'for', 'those', 'who', 'love', 'plots', 'that', 'twist', 'and', 'turn', 'and', 'keep', 'you', 'in', 'su', '##spense', '.', 'The', 'end', 'of', 'the', 'movie', 'can', 'be', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'