In [433]:
import torch
import numpy as np
import pandas as pd
import transformers
from sklearn.model_selection import train_test_split
from transformers import AutoModel, BertTokenizerFast
from tqdm import tqdm
torch.cuda.is_available()

True

In [434]:
CVAP_all_SD_df = pd.read_csv('../EmoBank/ChineseEmoBank/CVAP_SD/CVAP_all_SD.csv', encoding= 'utf-8',sep="\t")
df = CVAP_all_SD_df.drop(['No.','Valence_SD', 'Arousal_SD'], axis= 1)
df = df.iloc[:10,:]
df = df.reset_index(drop = True)

df.loc[1]

Phrase           更加小心
Valence_Mean    5.111
Arousal_Mean    7.188
Name: 1, dtype: object

In [435]:

# 提取特徵和標籤
#X = df[['Valence_Mean', 'Arousal_Mean']]
#y = df['Phrase'] # 如果您的數據集中有標籤列，請替換 'label_column_name' 為您的標籤列名稱
x = df['Phrase']
y = df[['Valence_Mean']] # 如果您的數據集中有標籤列，請替換 'label_column_name' 為您的標籤列名稱

# 將數據集分成訓練集和測試集，以 80:20 的比例分割
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
print(x_train)


0     怪八卦
7    更為不利
2    格外小心
9    最為不利
4    極為小心
3    特別小心
6     怪小偷
Name: Phrase, dtype: object


In [436]:
bert = AutoModel.from_pretrained('bert-base-chinese')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')

# from transformers import (
#   BertTokenizerFast,
#   AutoModel,
# )

# tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
# bert = AutoModel.from_pretrained('ckiplab/bert-base-chinese')

In [437]:
train_idx = x_train.dropna().index
test_idx = x_test.dropna().index

train_tokens = tokenizer.batch_encode_plus(x_train[train_idx].to_list(),
                                           max_length = 50,
                                           pad_to_max_length = True,
                                           truncation = True)
test_tokens = tokenizer.batch_encode_plus(x_test[test_idx].to_list(),
                                           max_length = 50,
                                           pad_to_max_length = True,
                                           truncation = True)
print(y_train['Valence_Mean'])
# y_train = y_train.reset_index(drop = True)
y_train.loc[0]

0    3.450
7    2.375
2    3.950
9    2.500
4    3.989
3    5.000
6    2.900
Name: Valence_Mean, dtype: float64


Valence_Mean    3.45
Name: 0, dtype: float64

In [438]:
train_seq = torch.tensor(train_tokens['input_ids'])
train_mask = torch.tensor(train_tokens['attention_mask'])
# print([i for i in y_train['Valence_Mean']])
train_y = torch.tensor([i for i in y_train['Valence_Mean']])

test_seq = torch.tensor(test_tokens['input_ids'])
test_mask = torch.tensor(test_tokens['attention_mask'])
test_y = torch.tensor([i for i in y_test['Valence_Mean']])

In [439]:
from torch.utils.data import TensorDataset, RandomSampler, DataLoader

In [440]:
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
trainloader = DataLoader(train_data, 
                         sampler = train_sampler,
                         batch_size = 32)

test_data = TensorDataset(test_seq, test_mask, test_y)
test_sampler = RandomSampler(test_data)
testloader = DataLoader(test_data, 
                         sampler = test_sampler,
                         batch_size = 32)

In [441]:
for param in bert.parameters():
    param.requires_grad = False

In [442]:
from torch import nn
from transformers import AdamW
import torch.nn.functional as F
from sklearn.utils.class_weight import compute_class_weight

In [443]:
class BertRegressor(nn.Module):
    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.fc1 = nn.Linear(768, 1)  # output one continuous value
    
    def forward(self, sent_id, mask):
        _, cls_hs = self.bert(sent_id, attention_mask=mask)
        return self.fc1(cls_hs).squeeze()  # remove the last dimension of size 1

In [444]:
model = BertRegressor(bert)
model = model.cuda()

In [445]:
optimizer = AdamW(model.parameters(), lr=1e-5)

In [446]:
# def loss_fn(outputs, targets):
#     return F.mse_loss(outputs, targets)

In [447]:
criterion = nn.MSELoss()

In [448]:
from tqdm import tqdm

In [449]:
epochs = 10

for e in range(epochs):   
    train_loss = 0.0
    for batch in tqdm(trainloader):
        batch = [i.cuda() for i in batch]
        sent_id, masks, labels = batch

        optimizer.zero_grad()
        preds = model(sent_id, masks)
        loss = criterion(preds, labels)
        train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
    print(f'Epoch:{e+1}\t\tTraining Loss: {train_loss / len(trainloader)}')

100%|██████████| 1/1 [00:00<00:00, 43.48it/s]


Epoch:1		Training Loss: 15.035850524902344


100%|██████████| 1/1 [00:00<00:00, 41.66it/s]


Epoch:2		Training Loss: 14.987773895263672


100%|██████████| 1/1 [00:00<00:00, 45.46it/s]


Epoch:3		Training Loss: 14.939763069152832


100%|██████████| 1/1 [00:00<00:00, 41.66it/s]


Epoch:4		Training Loss: 14.89183235168457


100%|██████████| 1/1 [00:00<00:00, 37.04it/s]


Epoch:5		Training Loss: 14.843974113464355


100%|██████████| 1/1 [00:00<00:00, 45.45it/s]


Epoch:6		Training Loss: 14.796195983886719


100%|██████████| 1/1 [00:00<00:00, 47.62it/s]


Epoch:7		Training Loss: 14.748494148254395


100%|██████████| 1/1 [00:00<00:00, 47.62it/s]


Epoch:8		Training Loss: 14.700871467590332


100%|██████████| 1/1 [00:00<00:00, 47.62it/s]


Epoch:9		Training Loss: 14.65333080291748


100%|██████████| 1/1 [00:00<00:00, 47.62it/s]

Epoch:10		Training Loss: 14.605866432189941





In [450]:
pred_label = []
true_label = []
for batch in tqdm(testloader):
    batch = [i.cuda() for i in batch]
    sent_id, masks, labels = batch

    preds = model(sent_id, masks)
    #pred_label.extend(torch.argmax(preds, axis = 1).cpu())
    pred_label.extend(preds.cpu())
    true_label.extend(labels.cpu())

100%|██████████| 1/1 [00:00<00:00, 66.66it/s]
