<a href="https://colab.research.google.com/github/tonywu8428/Course-Project-for-Deep-Learning/blob/main/NLP_Assignment3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets transformers[torch]


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.wh

In [None]:
import pandas as pd
import re

# Load the data
# The paths point into Google Drive, so Drive must already be mounted at /content/drive.
train_file_path = '/content/drive/MyDrive/Colab Notebooks/train.txt'
dev_file_path = '/content/drive/MyDrive/Colab Notebooks/dev.txt'


def _read_lines(path):
    """Return every line of the text file at `path` (newlines kept).

    The encoding is fixed to UTF-8 so the result does not depend on the
    platform's default locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return file.readlines()


train_lines = _read_lines(train_file_path)
dev_lines = _read_lines(dev_file_path)

# Clean the data
def process_lines(lines):
    """Turn bracketed, label-annotated lines into plain sentences and labels.

    A line is kept only if it starts with ``(<digit>``; that digit becomes the
    sentence-level label. Every ``(<digits>`` marker and every ``)`` is then
    stripped and whitespace is collapsed to single spaces. Lines whose text is
    empty after stripping are dropped.

    Args:
        lines: iterable of raw text lines.

    Returns:
        A ``(texts, labels)`` pair of equally long lists: cleaned sentences
        and their integer labels.
    """
    texts, labels = [], []

    for raw in lines:
        head = re.match(r'\((\d)', raw)
        if not head:
            # No leading "(<digit>": not an annotated line, skip it.
            continue
        label = int(head.group(1))

        # Drop the "(<digits>" openers first, then the closing parentheses.
        without_markers = re.sub(r'\)', '', re.sub(r'\(\d+', '', raw))
        sentence = ' '.join(without_markers.split())
        if not sentence:
            continue

        texts.append(sentence)
        labels.append(label)

    return texts, labels

# Parse both splits into parallel (texts, labels) lists.
train_texts, train_labels = process_lines(train_lines)
dev_texts, dev_labels = process_lines(dev_lines)

# One row per sentence: the cleaned text plus its label as parsed from the file.
clean_train_data = pd.DataFrame(dict(text=train_texts, label=train_labels))
clean_dev_data = pd.DataFrame(dict(text=dev_texts, label=dev_labels))

# Inspect the cleaned data
print(clean_train_data.head())
print(clean_dev_data.head())


                                                text  label
0  The Rock is destined to be the 21st Century 's...      3
1  The gorgeously elaborate continuation of `` Th...      4
2  Singer\/composer Bryan Adams contributes a sle...      3
3  You 'd think by now America would have had eno...      2
4               Yet the act is still charming here .      3
                                                text  label
0  It 's a lovely film with lovely performances b...      3
1  No one goes unindicted here , which is probabl...      2
2  And if you 're not nearly moved to tears by a ...      3
3                   A warm , funny , engaging film .      4
4  Uses sharp humor and insight into human nature...      4


In [None]:
# Simplify the labels
def simplify_labels(label):
    """Collapse a fine-grained label into three sentiment classes.

    Args:
        label: original label value.

    Returns:
        0 (negative) for labels 0 and 1, 1 (neutral) for label 2,
        and 2 (positive) for anything else.
    """
    if label in [0, 1]:
        return 0  # negative
    return 1 if label == 2 else 2  # neutral, otherwise positive

# Collapse the labels to 3 classes in a way that is safe to re-run.
# Mapping 'label' onto itself is not idempotent: running the cell a second time
# would turn neutral (1) into negative (0) and positive (2) into neutral (1).
# The original labels are therefore kept once in 'fine_label', and 'label' is
# always derived from that column.
def _collapse_labels(frame):
    """Derive the 3-class 'label' column of `frame` from its original labels, in place."""
    if 'fine_label' not in frame.columns:
        frame['fine_label'] = frame['label']
    frame['label'] = frame['fine_label'].apply(simplify_labels)


_collapse_labels(clean_train_data)
_collapse_labels(clean_dev_data)


In [None]:
# Show the first rows of the training frame as a sanity check.
clean_train_data.head()

Unnamed: 0,text,label
0,The Rock is destined to be the 21st Century 's...,2
1,The gorgeously elaborate continuation of `` Th...,2
2,Singer\/composer Bryan Adams contributes a sle...,2
3,You 'd think by now America would have had eno...,1
4,Yet the act is still charming here .,2


In [None]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer

# Choose the BERT checkpoint and load its matching tokenizer.
BERT_MODEL = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

# Custom dataset class
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item, on demand.

    Args:
        texts: indexable sequence of sentences.
        labels: indexable sequence of integer class labels, aligned with `texts`.
        tokenizer: callable tokenizer; it is invoked with the keyword arguments
            shown in `__getitem__` and must return a mapping containing
            'input_ids' and 'attention_mask' tensors.
        max_len: length every encoded sentence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the encoded example at index `item`.

        Returns:
            dict with the raw 'text', the 1-D 'input_ids' and 'attention_mask'
            tensors, and 'labels' as a scalar long tensor.
        """
        text = self.texts[item]
        label = self.labels[item]
        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same arguments for a single sentence.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            # flatten() turns the tokenizer's tensor output into a 1-D tensor per example.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Maximum sequence length, batch size and number of target classes
MAX_LEN = 70
BATCH_SIZE = 16
num_labels = 3


def _make_dataset(frame):
    """Wrap the 'text' and 'label' columns of `frame` in a SentimentDataset."""
    return SentimentDataset(
        texts=frame.text.to_numpy(),
        labels=frame.label.to_numpy(),
        tokenizer=tokenizer,
        max_len=MAX_LEN,
    )


# Build the training and development datasets
train_dataset = _make_dataset(clean_train_data)
dev_dataset = _make_dataset(clean_dev_data)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Load the BERT classifier and move it to the GPU when one is available, else the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained(
    BERT_MODEL,
    num_labels=num_labels,
).to(device)

# Training configuration
# Mixed precision needs a CUDA device. Enabling it unconditionally makes the
# notebook fail on a CPU-only runtime, even though the model itself is placed
# on the CPU in that case, so it is switched on only when CUDA is present.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    save_strategy="epoch",  # save a checkpoint at the end of every epoch
    learning_rate=1e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    dataloader_num_workers=4,
    fp16=use_fp16,  # mixed-precision training, GPU only
    no_cuda=False,
)

# Metric function used during evaluation
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        p: object exposing `predictions` (per-class scores, argmax taken over
            axis 1) and `label_ids` (the reference labels).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)
    # Result is (precision, recall, f1, support); support is not reported.
    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': weighted[2],
        'precision': weighted[0],
        'recall': weighted[1]
    }

# Create the Trainer: it ties together the model, the training arguments,
# the train/dev datasets and the metric function used at evaluation time.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train the model
trainer.train()

# # Save the model and tokenizer (currently disabled)
# model.save_pretrained('sentiment_model')
# tokenizer.save_pretrained('sentiment_tokenizer')

# Run a final evaluation on the dev set and print the resulting metrics
eval_results = trainer.evaluate()
print(f"Eval results: {eval_results}")


  self.pid = os.fork()


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6982,0.700826,0.71208,0.651119,0.673866,0.71208
2,0.5606,0.728888,0.713896,0.682935,0.674146,0.713896
3,0.4052,0.786468,0.715713,0.695619,0.685979,0.715713
4,0.3161,0.852816,0.709355,0.700179,0.695797,0.709355
5,0.2796,0.878015,0.715713,0.706772,0.702986,0.715713


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


Eval results: {'eval_loss': 0.8780147433280945, 'eval_accuracy': 0.7157129881925522, 'eval_f1': 0.7067721838395583, 'eval_precision': 0.7029857722108162, 'eval_recall': 0.7157129881925522, 'eval_runtime': 1.2424, 'eval_samples_per_second': 886.191, 'eval_steps_per_second': 55.538, 'epoch': 5.0}
