讀取資料
看前五筆資料

In [None]:
import pandas as pd

df = pd.read_csv('./movie_data.csv', encoding='utf-8')
df.head(5)


查看資料筆數和欄位數

In [None]:
print(df.shape)
print(df['sentiment'].value_counts())


pre-processing
1. to lowercase
2. remove url and html tag
3. remove puctuation

In [None]:
from bs4 import BeautifulSoup
import re
import nltk

# to lowercase
df['review'] = df['review'].str.lower()

# remove url and html tag
df['review'] = df['review'].apply(lambda x: BeautifulSoup(x).get_text())
df['review'] = df['review'].apply(lambda x: re.sub(r"http\S+", "", x))

df.head(5)

In [None]:
# remove puctuation
df['review'] = df['review'].apply(lambda x: " ".join([re.sub('[^A-Za-z]+', '', x) for x in str(x).split()]))
df.head(10)

切分資料集 train, val, test，其中train:test=7:3

In [None]:
# from sklearn.model_selection import train_test_split

# x_train, x_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], train_size=0.7, test_size=0.3)
# # x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.7, test_size=0.3)
# print(x_train.head(5))
# print(x_test.head(5))
# print(y_train.head(5))
# print(y_test.head(5))

# dict = {'review':x_test, 'sentiment':y_test}
# df = pd.DataFrame(dict)
# df.to_csv('imdb_test.csv', header=True, index=False)

# # dict = {'review':x_val, 'sentiment':y_val}
# # df = pd.DataFrame(dict)
# # df.to_csv('imdb_val.csv', header=True, index=False)

# dict = {'review':x_train, 'sentiment':y_train}
# df = pd.DataFrame(dict)
# df.to_csv('imdb_train.csv', header=True, index=False)

In [None]:
test = pd.read_csv('imdb_test.csv', encoding='utf-8')
print(test.shape)
print(test['sentiment'].value_counts())

train = pd.read_csv('imdb_train.csv', encoding='utf-8')
print(train.shape)
print(train['sentiment'].value_counts())

# val = pd.read_csv('imdb_val.csv', encoding='utf-8')
# print(val.shape)
# print(val['sentiment'].value_counts())

Dataset and Dataloader

In [1]:
import torch
from transformers import BertTokenizer
import pandas as pd
from torch.utils.data import Dataset

class MovieComment(Dataset):
    def __init__(self, mode, tokenizer):
        assert mode in ['train','val', 'test']
        self.mode = mode
        if self.mode == 'train':
            self.df = pd.read_csv('imdb_train_bert.csv')
        elif self.mode == 'val':
            self.df = pd.read_csv('imdb_val_bert.csv')
        elif self.mode == 'test':
            self.df = pd.read_csv('imdb_test.csv')
        # required_label
        
        self.len = len(self.df)
        self.label_map = {0: 0, 1: 1}
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        text = self.df.iloc[idx, 0]
        label = self.df.iloc[idx, 1]
        label_id = self.label_map[label]
        label_tensor = torch.tensor(label_id)
        word_pieces = ['[CLS]']
        tokens = self.tokenizer.tokenize(text[:510])
        word_pieces += tokens + ['[SEP]']
        len_token = len(word_pieces)

        # 將整個token序列轉換成索引序列
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        segment_tensor = torch.tensor([0]*len_token, dtype = torch.long)
        return(tokens_tensor, segment_tensor, label_tensor)
    
    def __len__(self):
        return self.len


train bert

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, get_cosine_schedule_with_warmup
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import time
from datetime import datetime
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm

PRETRAINED_MODEL_NAME = 'bert-base-uncased'
PRETRAINED_TOKENIZER = 'bert-base-uncased'
NUM_LABELS = 2
BATCH_SIZE = 32
EPOCH = 10

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_TOKENIZER)
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device: ", device)
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    # 測試集有 labels
    if len(samples[0])==3:
        if samples[0][2] is not None:
            label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, batch_first=True)
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

# 利用collate_fn將list of samples 合併成一個 mini-batch是關鍵
trainset = MovieComment('train', tokenizer=tokenizer)
trainloader =  DataLoader(trainset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)
valset = MovieComment('val', tokenizer=tokenizer)
valloader =  DataLoader(trainset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)

def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0

    with torch.no_grad():
        # 遍尋整個資料集
        for data in dataloader:
            # 將所有tensor 移到GPU上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda") for t in data if t is not None]
            # 前3個tensors分別為tokens, segments, masks，建議再將這些tensors丟入model時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, token_type_ids=segments_tensors, attention_mask=masks_tensors)
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()

            # 當前batch記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))

    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions

# 讓模型跑在GPU上並取得訓練集的分類準確率
model = model.to(device)
# bert fine-tune 前的準確率
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

# 計時開始
start_time = time.time()
# 使用Adam Optim更新整個分類模型的參數
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6, weight_decay=1e-2)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_training_steps=EPOCH * len(trainloader), num_warmup_steps=len(trainloader))
for epoch in range(EPOCH):
    print('EPOCH: ', epoch + 1)
    # 訓練模型
    model.train()
    running_loss = 0.0
    for data in tqdm(trainloader):
        tokens_tensors, segments_tensors, masks_tensors, \
        labels = [t.to(device) for t in data]
        # 將參數梯度歸零
        optimizer.zero_grad()
        # forward pass
        outputs = model(input_ids=tokens_tensors, token_type_ids=segments_tensors,\
            attention_mask=masks_tensors, labels=labels)
        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()
        scheduler.step()
        # 紀錄當前batch loss
        running_loss += loss.item()
    #計算分類準確率
    _, train_acc = get_predictions(model, trainloader, compute_acc=True)
    print('[epoch %d] train_loss: %.3f, train_acc: %.3f' %(epoch+1, running_loss, train_acc))
    model.eval()
    _, val_acc = get_predictions(model, valloader, compute_acc=True)
    print('[epoch %d] val_acc: %.3f' %(epoch+1, val_acc))

end_time = time.time()
print('execute_time: ', str(end_time-start_time))

### predict testset
testset = MovieComment('test', tokenizer=tokenizer)
y_true = testset.df.iloc[:,1].values
testloader =  DataLoader(testset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)
prediction = get_predictions(model, testloader, compute_acc=False)
print("======================================")
print('test accuracy: ',accuracy_score(y_true, prediction.tolist()))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

device:  cuda


  0%|          | 0/875 [00:00<?, ?it/s]

classification acc: 0.50425
EPOCH:  1


100%|██████████| 875/875 [03:23<00:00,  4.29it/s]


[epoch 1] train_loss: 546.777, train_acc: 0.804


  0%|          | 0/875 [00:00<?, ?it/s]

[epoch 1] val_acc: 0.812
EPOCH:  2


100%|██████████| 875/875 [03:23<00:00,  4.29it/s]


[epoch 2] train_loss: 353.667, train_acc: 0.846


  0%|          | 0/875 [00:00<?, ?it/s]

[epoch 2] val_acc: 0.854
EPOCH:  3


100%|██████████| 875/875 [03:22<00:00,  4.32it/s]


[epoch 3] train_loss: 308.078, train_acc: 0.860


  0%|          | 0/875 [00:00<?, ?it/s]

[epoch 3] val_acc: 0.868
EPOCH:  4


100%|██████████| 875/875 [03:22<00:00,  4.31it/s]


[epoch 4] train_loss: 283.352, train_acc: 0.872


  0%|          | 0/875 [00:00<?, ?it/s]

[epoch 4] val_acc: 0.880
EPOCH:  5


100%|██████████| 875/875 [03:21<00:00,  4.34it/s]


[epoch 5] train_loss: 266.732, train_acc: 0.878


  0%|          | 0/875 [00:00<?, ?it/s]

[epoch 5] val_acc: 0.889
EPOCH:  6


100%|██████████| 875/875 [03:22<00:00,  4.33it/s]


[epoch 6] train_loss: 254.413, train_acc: 0.884


  0%|          | 0/875 [00:00<?, ?it/s]

[epoch 6] val_acc: 0.895
EPOCH:  7


100%|██████████| 875/875 [03:22<00:00,  4.31it/s]


[epoch 7] train_loss: 248.416, train_acc: 0.888


  0%|          | 0/875 [00:00<?, ?it/s]

[epoch 7] val_acc: 0.898
EPOCH:  8


100%|██████████| 875/875 [03:24<00:00,  4.29it/s]


[epoch 8] train_loss: 242.805, train_acc: 0.890


  0%|          | 0/875 [00:00<?, ?it/s]

[epoch 8] val_acc: 0.901
EPOCH:  9


100%|██████████| 875/875 [03:22<00:00,  4.31it/s]


[epoch 9] train_loss: 239.154, train_acc: 0.889


  0%|          | 0/875 [00:00<?, ?it/s]

[epoch 9] val_acc: 0.902
EPOCH:  10


100%|██████████| 875/875 [03:22<00:00,  4.32it/s]


[epoch 10] train_loss: 239.814, train_acc: 0.890
[epoch 10] val_acc: 0.902
execute_time:  3768.1931476593018
test accuracy:  0.8690666666666667
