In [None]:
!pip install transformers

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import AdamW

import numpy as np
import pandas as pd

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.0 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
# Colab-only helper: ask the user to upload file(s) from their machine.
# NOTE(review): presumably this is how "IMDB Dataset.csv" (read by the next cell)
# reaches the runtime — confirm; the cell cannot run outside Google Colab.
from google.colab import files
uploaded = files.upload()

KeyboardInterrupt: ignored

In [None]:
# Load the IMDB reviews CSV from the current working directory.
df = pd.read_csv("IMDB Dataset.csv")

# Preview the first rows of the frame.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Summary statistics for every column, including the non-numeric ones.
df.describe(include='all')

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [None]:
# Work on a reproducible 20% subsample (fixed seed). The sample is bound to a new
# name instead of overwriting `df`, so re-running this cell always yields the same
# rows rather than taking 20% of an already-reduced frame.
df_sampled = df.sample(frac=0.2, random_state=42)
reviews = df_sampled["review"].to_numpy()

# Encode the sentiment strings as integer class ids: negative -> 0, positive -> 1.
label_series = df_sampled["sentiment"].map({"positive": 1, "negative": 0})
# `.map` yields NaN for any value missing from the mapping; fail loudly here
# instead of passing float/NaN labels on to the model.
if label_series.isna().any():
  raise ValueError("Unexpected value in 'sentiment' column; expected 'positive' or 'negative'.")
labels = label_series.to_numpy()

In [None]:
# Display the encoded label array as a quick sanity check.
labels


array([1, 1, 0, ..., 1, 0, 1])

In [None]:
# Hold out 20% of the sampled reviews for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
  reviews,
  labels,
  test_size=0.2,
  random_state=42,
)

# Show the first training review together with its label.
(X_train[0], y_train[0])

('I saw it tonight and fell asleep in the movie.<br /><br />That is something that I have not done since - I have never fallen asleep at the movies.<br /><br />I LOVE the original and have seen it several times and recommend it to everyone. This may have been the problem but I do not think so, because there were a couple of bright spots that showed if done right they could have made this movie work.<br /><br />Bette was under used and Anne was over used and miscast.<br /><br />I do not know why English or anyone for that matter let this go out in that condition.<br /><br />They billed this as a Sex in the City but better? Not a chance I liked Sex in the City a lot and was disappointed by this movie.<br /><br />So do not waste your money on this movie - go see anything but this!',
 0)

In [None]:
# Number of held-out test examples.
len(y_test)

2000

In [None]:
# Initialise the BERT tokenizer and the sequence-classification model from the
# pretrained "bert-base-uncased" checkpoint. As the warning in the cell output
# says, the classifier head weights are newly initialised and must be trained.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


將電影評論轉換為適合BERT模型輸入的形式

In [None]:
def tokenize_reviews(reviews, labels, max_length=128):
  """Encode review texts into fixed-length BERT inputs.

  Relies on the module-level `tokenizer`. Every review is wrapped with the
  special tokens ([CLS] ... [SEP]), then truncated or padded with [PAD] to
  exactly `max_length` token ids.

  The attention mask tells the model which positions are real tokens (1) and
  which are padding (0). For example, ignoring special tokens, a 3-token text
  padded to length 5 gets the mask [1, 1, 1, 0, 0].

  Args:
    reviews: sequence (list or NumPy array) of review strings.
    labels: class ids aligned one-to-one with `reviews`.
    max_length: fixed sequence length after truncation/padding (default 128,
      the value that was previously hard-coded).

  Returns:
    Tuple `(input_ids, attention_masks, labels)` of tensors; the first two
    have shape `(len(reviews), max_length)`.

  Raises:
    ValueError: if `reviews` and `labels` differ in length.
  """
  texts = list(reviews)
  if len(texts) != len(labels):
    raise ValueError(
      f"reviews and labels must have the same length, got {len(texts)} and {len(labels)}"
    )

  # Nothing to encode: return correctly shaped empty tensors rather than
  # failing inside the tokenizer / tensor construction.
  if not texts:
    empty = torch.empty((0, max_length), dtype=torch.long)
    return empty, empty.clone(), torch.tensor(labels, dtype=torch.long)

  # One batched tokenizer call replaces the per-review encode_plus loop; with
  # return_tensors='pt' the result already has shape (N, max_length).
  encoded = tokenizer(
    texts,
    add_special_tokens=True,     # add [CLS] at the start and [SEP] at the end
    max_length=max_length,
    padding='max_length',        # pad shorter reviews up to max_length
    truncation=True,             # cut longer reviews down to max_length
    return_attention_mask=True,
    return_tensors='pt',
  )
  input_ids = encoded['input_ids']
  attention_masks = encoded['attention_mask']

  # Labels become a tensor too so all three can be wrapped in a TensorDataset.
  labels = torch.tensor(labels)
  return input_ids, attention_masks, labels

In [None]:
# Tokenize the training split. The results get train_-prefixed names so that the
# module-level `labels` array (labels of all sampled reviews, used by the
# train/test split cell) is not overwritten by the train-only tensor.
train_input_ids, train_attention_masks, train_labels = tokenize_reviews(X_train, y_train)

# TensorDataset wraps the aligned tensors so that indexing returns one
# (input_ids, attention_mask, label) triple per review.
dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)

# DataLoader serves mini-batches of 32; shuffle=True reshuffles the data at the
# start of every epoch so batches differ between epochs.
train_dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
# Number of full passes over the training data.
epochs = 2

# AdamW over every model parameter with a learning rate of 2e-5.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

# Cross-entropy criterion object.
# NOTE(review): the training loop in this notebook reads `outputs.loss` from the
# model itself, so `loss_fn` appears to be unused — confirm before removing.
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model to the chosen device; as the cell's last expression, the
# returned module is also displayed (the architecture printout in the output).
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
for epoch in range(epochs):
  # Training mode: some layers (e.g. dropout) behave differently than in eval mode.
  model.train()
  epoch_loss = 0.0
  num_batches = 0
  for batch in train_dataloader:
    # batch_-prefixed names keep the loop from overwriting module-level variables
    # such as `input_ids` / `labels`, which other cells rely on when re-run.
    batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)
    # PyTorch accumulates gradients by default, so reset them for every batch.
    optimizer.zero_grad()
    # Passing `labels` makes the model compute the classification loss itself.
    outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
    loss = outputs.loss
    loss.backward()   # compute the gradients
    optimizer.step()  # update the parameters
    epoch_loss += loss.item()
    num_batches += 1
  # Report the mean training loss so progress is visible across epochs.
  print(f'Epoch {epoch + 1}/{epochs} - mean training loss: {epoch_loss / max(num_batches, 1):.4f}')

In [None]:
# Evaluation
model.eval()  # evaluation mode (e.g. dropout is switched off)
test_input_ids, test_attention_masks, test_labels = tokenize_reviews(X_test, y_test)
test_input_ids, test_attention_masks, test_labels = test_input_ids.to(device), test_attention_masks.to(device), test_labels.to(device)

# Run inference in mini-batches: pushing the whole test set through the model in
# a single forward pass needs far more device memory and can run out of memory.
eval_batch_size = 64
logit_chunks = []
with torch.no_grad():  # no gradients are needed for evaluation
  for start in range(0, test_input_ids.size(0), eval_batch_size):
    stop = start + eval_batch_size
    batch_outputs = model(test_input_ids[start:stop], attention_mask=test_attention_masks[start:stop])
    # Move each chunk of logits to the CPU right away to free device memory.
    logit_chunks.append(batch_outputs.logits.cpu())

# One row of logits per test review; the arg-max column is the predicted class id.
test_logits = torch.cat(logit_chunks, dim=0)
predicted_labels = np.argmax(test_logits.numpy(), axis=1)
accuracy = accuracy_score(y_test, predicted_labels)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.86


In [None]:
# Inference demo on the first few held-out reviews. The inputs are built here
# explicitly so the cell does not depend on names that were never defined.
inf_num = min(5, len(X_test))
inf_reviews = X_test[:inf_num]
inf_inp_ids, inf_att_masks, inf_labels = tokenize_reviews(inf_reviews, y_test[:inf_num])
inf_inp_ids, inf_att_masks, inf_labels = inf_inp_ids.to(device), inf_att_masks.to(device), inf_labels.to(device)

model.eval()  # make sure dropout is off even if this cell runs right after training
with torch.no_grad():
  inf_logits = model(inf_inp_ids, attention_mask=inf_att_masks)

# `predict` shows the raw logits for [class 0, class 1]; the larger one wins.
for i in range(inf_num):
  print(f"""
  review: {inf_reviews[i]}
  predict: {inf_logits.logits[i]}
  ground truth: {inf_labels[i]}
  ------------------------------
  """)

NameError: ignored

In [None]:
import random

# Draw up to 5 distinct, valid indices into the test set. The bound comes from
# the data itself; the previous hard-coded range(2001) could yield index 2000,
# which is out of range for 2000 test reviews.
sample_indices = random.sample(range(len(X_test)), min(5, len(X_test)))

# Only the sampled rows are sent through the model instead of the full test set.
inf_inp_ids = test_input_ids[sample_indices].to(device)
inf_att_masks = test_attention_masks[sample_indices].to(device)
inf_labels = test_labels[sample_indices].to(device)

model.eval()  # make sure dropout is off even if this cell runs right after training
with torch.no_grad():
  inf_logits = model(inf_inp_ids, attention_mask=inf_att_masks)

# `row` indexes the sampled mini-batch, `i` the original position in X_test.
# The review text is printed (not its token ids); `predict` shows the raw
# logits for [class 0, class 1], the larger one being the predicted class.
for row, i in enumerate(sample_indices):
  print(f"""
  review: {X_test[i]}
  predict: {inf_logits.logits[row]}
  ground truth: {inf_labels[row]}
  ------------------------------
  """)


  review: tensor([  101,  2023,  2052,  1005,  2310,  2042,  1037,  1008,  2307,  1008,
         4333,  2143,  1012,  1996,  3772,  2428,  2003,  2204,  1010,  2012,
         2560,  1999,  1037,  2298,  5003,  1010,  1045,  1005,  1049,  2725,
         2428,  2502,  3772,   999,  4066,  1997,  2126,  1012,  1026,  7987,
         1013,  1028,  1026,  7987,  1013,  1028,  2673,  2003,  4121,  1012,
         2296,  2240,  2003, 13769,   999,  2296,  3496,  2003, 10909,  2011,
         2529, 10576,   999,  1026,  7987,  1013,  1028,  1026,  7987,  1013,
         1028,  3262,  1010,  1045,  2371,  2066, 18201,  4726,  1012,  2664,
         1010,  2066,  2151,  3345, 12006,  1010,  1045,  2481,  1005,  1056,
         7697,  2026,  2159,  2185,  1012,  2023,  7982,  2453,  1005,  2310,
         2499,  2006,  1996,  2754,  1010,  2348,  1045,  4797,  2009,  1012,
         2006,  1996,  3898,  1010,  2009,  2001, 18856, 23128,  1010,  5292,
        21890, 26154,  1010, 28425,  2098,  1998,  34

總結：
predict 顯示的是模型輸出的 logits：[類別 0（負面）的分數, 類別 1（正面）的分數]，這些數值不是機率；數值較大的那一個類別即為模型的預測（若需要機率，需再對 logits 做 softmax）。
ground truth 為正確答案（真實標籤）。

In [None]:
# Ask PyTorch to release the unused GPU memory held by its caching allocator.
# NOTE(review): tensors that are still referenced (model, test tensors) stay allocated.
torch.cuda.empty_cache()