In [217]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [218]:
# load the datasets
df = pd.read_excel("./train.xlsx")

In [219]:
df.head()

Unnamed: 0,name,text,BACE_Blank,DIS_Blank,IGB_Blank,SC_Blank,TASK_Blank,PWR_Blank,CC_Blank
0,01-安东尼·布林肯/2010年9月7日布林肯关于美伊关系的演讲.txt,Celeste gentlemen let me welcome you to as the...,1,2,1,4,4,1,3
1,01-安东尼·布林肯/2014年4月7日布林肯在哥伦比亚大学进行关于人道主义行动的演讲.txt,So let me just make a very quick her welcome t...,2,4,1,2,3,2,4
2,01-安东尼·布林肯/2015布林肯接受印度快报采访.txt,I'm going to be speaking today to antony bilki...,1,3,1,4,3,1,4
3,01-安东尼·布林肯/2016年4月布林肯特别演讲.txt,So I I have a special opportunity to uh huh mi...,1,2,1,3,3,1,2
4,01-安东尼·布林肯/2016年6月30日布林肯发表关于加入CSIS的演讲.txt,Could more.Everyone welcome to the center for ...,1,2,1,4,1,1,3


In [220]:
# checking for null values
df.isnull().sum()

name          0
text          0
BACE_Blank    0
DIS_Blank     0
IGB_Blank     0
SC_Blank      0
TASK_Blank    0
PWR_Blank     0
CC_Blank      0
dtype: int64

In [221]:
# tokenize the text using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [222]:
# define helper function for tokenization
def tokenize_text(text):
    return tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        return_token_type_ids=True,
        truncation=True,
        return_tensors='pt'
    )

In [223]:
# tokenize texts in corpus using BERT tokenizer
# 模型的输入将是标记化的文本标记和注意力掩码。
df['input_ids'] = [tokenize_text(text) for text in tqdm(df['text'])]

100%|██████████| 157/157 [00:03<00:00, 48.94it/s]


In [239]:
# perform train validation split
df_train, df_val = train_test_split(df, test_size=0.2, random_state=2024)

df_train.head()

Unnamed: 0,name,text,BACE_Blank,DIS_Blank,IGB_Blank,SC_Blank,TASK_Blank,PWR_Blank,CC_Blank,input_ids
92,07-麻生太郎/3.txt,first Taro Aso. I decided to dissolve the hous...,3,1,1,2,4,1,4,"[[tensor(101), tensor(2034), tensor(16985), te..."
152,15-约瑟夫·拜登/拜登2020感恩节讲稿.txt,My fellow Americans:Thanksgiving is a special ...,2,2,1,1,1,2,1,"[[tensor(101), tensor(2026), tensor(3507), ten..."
35,03-福田康夫/5.txt,It is said that it is 14 times in this year th...,1,1,1,4,3,1,4,"[[tensor(101), tensor(2009), tensor(2003), ten..."
23,01-安倍晋三/6.txt,Good morning to you all. When President Juncke...,4,1,1,1,2,4,1,"[[tensor(101), tensor(2204), tensor(2851), ten..."
10,01-安倍晋三/10.txt,Welcome to Yokohama. My thanks go to President...,4,1,1,1,3,4,3,"[[tensor(101), tensor(6160), tensor(2000), ten..."


In [240]:
df_train['CC_Blank'].value_counts()

CC_Blank
4    40
3    30
2    28
1    27
Name: count, dtype: int64

In [241]:
df_val['CC_Blank'].value_counts()

CC_Blank
1    12
2     8
4     6
3     6
Name: count, dtype: int64

In [242]:
# get tokens from train and validation sets
df_train_tokens = torch.from_numpy(np.vstack(df_train['input_ids']))
df_val_tokens = torch.from_numpy(np.vstack(df_val['input_ids']))
df_train_tokens

tensor([[  101,  2034, 16985,  ...,  2001,  2019,   102],
        [  101,  2026,  3507,  ...,  1011,  2146,   102],
        [  101,  2009,  2003,  ...,  1045,  2052,   102],
        ...,
        [  101,  2651,  1010,  ...,  1996,  2601,   102],
        [  101, 15876,  9743,  ...,  2001,  2055,   102],
        [  101,  2206,  1996,  ...,  7803, 17371,   102]])

In [243]:
# get attention masks from train and validation sets
df_train_attention_masks = torch.where(df_train_tokens!=0, 1, 0)
df_val_attention_masks = torch.where(df_val_tokens!=0, 1, 0)
df_train_attention_masks

tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])

In [244]:
df_train['attention_masks'] = [i for i in df_train_attention_masks]
df_val['attention_masks'] = [i for i in df_val_attention_masks]

df_train.head()

Unnamed: 0,name,text,BACE_Blank,DIS_Blank,IGB_Blank,SC_Blank,TASK_Blank,PWR_Blank,CC_Blank,input_ids,attention_masks
92,07-麻生太郎/3.txt,first Taro Aso. I decided to dissolve the hous...,3,1,1,2,4,1,4,"[[tensor(101), tensor(2034), tensor(16985), te...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
152,15-约瑟夫·拜登/拜登2020感恩节讲稿.txt,My fellow Americans:Thanksgiving is a special ...,2,2,1,1,1,2,1,"[[tensor(101), tensor(2026), tensor(3507), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
35,03-福田康夫/5.txt,It is said that it is 14 times in this year th...,1,1,1,4,3,1,4,"[[tensor(101), tensor(2009), tensor(2003), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
23,01-安倍晋三/6.txt,Good morning to you all. When President Juncke...,4,1,1,1,2,4,1,"[[tensor(101), tensor(2204), tensor(2851), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
10,01-安倍晋三/10.txt,Welcome to Yokohama. My thanks go to President...,4,1,1,1,3,4,3,"[[tensor(101), tensor(6160), tensor(2000), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."


In [245]:
# set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [246]:
device

device(type='cuda')

In [247]:
# define the neural network model
class DisasterSentimentModel(nn.Module):
    def __init__(self, num_labels1):
        super(DisasterSentimentModel, self).__init__()
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        for param in self.bert_layer.parameters():
            param.requires_grad = False
        self.dropout_layer = nn.Dropout(0.3)
        
        self.dense_layer1 = nn.Linear(self.bert_layer.config.hidden_size, 512)
        self.classifier_layer1 = nn.Linear(512, num_labels1)
        

    def forward(self, input_ids, attention_mask=None):
        bert_output = self.bert_layer(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bert_output[0]
        pooled_output = pooled_output[:, 0]
        
        output1 = self.dense_layer1(pooled_output)
        output1 = nn.ReLU()(output1)
        output1 = self.dropout_layer(output1)
        output1 = torch.softmax(self.classifier_layer1(output1), dim=-1)

        #return output1, output2, output3, output4, output5, output6, output7
        return output1

In [248]:
# Create data loader for Multi-Task Learning
batch_size = 16

input_ids = torch.from_numpy(np.vstack(df_train['input_ids'].values))
attention_masks = torch.from_numpy(np.vstack(df_train['attention_masks'].values))
d1_label = torch.from_numpy(np.vstack(df_train['CC_Blank'].values))

train_data = torch.utils.data.TensorDataset(input_ids,
                                                attention_masks,
                                                d1_label,
                                                )

train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

In [249]:
def single_task_train(model, dataloader, max_epochs, print_loss=False):
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    for epoch in range(max_epochs):
        model.train()
        total_loss = 0.0

        for batch_idx, batch in enumerate(dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_masks, d1_label = batch

            # 将标签从二维转为一维，并进行 0-based 映射
            d1_label = d1_label.view(-1).long() - 1

            # 前向传播
            outputs = model(input_ids, attention_masks)

            # 调试输出
            print(f"Model output shape: {outputs.shape}, Target shape: {d1_label.shape}")

            # 计算损失
            loss = loss_fn(outputs, d1_label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            if print_loss:
                print(f"Batch {batch_idx + 1}/{len(dataloader)}, Loss: {loss.item():.4f}")

        print(f"Epoch {epoch + 1}/{max_epochs}, Average Loss: {total_loss / len(dataloader):.4f}")


In [250]:
# define new model to train on both d1 and d2 labels
md = DisasterSentimentModel(num_labels1=4).to(device)
single_task_train(md, train_loader, 10, True)



Model output shape: torch.Size([16, 4]), Target shape: torch.Size([16])
Batch 1/8, Loss: 1.3912
Model output shape: torch.Size([16, 4]), Target shape: torch.Size([16])
Batch 2/8, Loss: 1.3893
Model output shape: torch.Size([16, 4]), Target shape: torch.Size([16])
Batch 3/8, Loss: 1.3941
Model output shape: torch.Size([16, 4]), Target shape: torch.Size([16])
Batch 4/8, Loss: 1.3939
Model output shape: torch.Size([16, 4]), Target shape: torch.Size([16])
Batch 5/8, Loss: 1.3734
Model output shape: torch.Size([16, 4]), Target shape: torch.Size([16])
Batch 6/8, Loss: 1.3828
Model output shape: torch.Size([16, 4]), Target shape: torch.Size([16])
Batch 7/8, Loss: 1.3887
Model output shape: torch.Size([13, 4]), Target shape: torch.Size([13])
Batch 8/8, Loss: 1.3770
Epoch 1/10, Average Loss: 1.3863
Model output shape: torch.Size([16, 4]), Target shape: torch.Size([16])
Batch 1/8, Loss: 1.3659
Model output shape: torch.Size([16, 4]), Target shape: torch.Size([16])
Batch 2/8, Loss: 1.3562
Model o

In [251]:
from sklearn.metrics import f1_score, accuracy_score

def evaluate(model, dataloader, return_labels=False):
    """
    评估模型性能，计算 F1 分数和准确率。
    
    参数:
        model: 要评估的模型。
        dataloader: 数据加载器。
        return_labels: 如果为 True，则返回预测标签和实际标签。
        
    返回:
        f1: F1 分数。
        accuracy: 准确率。
        如果 return_labels 为 True，则返回 (f1, accuracy, predictions, true_labels)。
    """
    model.eval()  # 设置模型为评估模式
    all_predictions = []
    all_true_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_masks, d1_label = batch

            # 将标签从二维转为一维，并进行 0-based 映射
            d1_label = d1_label.view(-1).long() - 1

            # 模型输出
            outputs = model(input_ids, attention_masks)

            # 获取预测类别
            predictions = torch.argmax(outputs, dim=1)

            # 收集预测和真实标签
            all_predictions.extend(predictions.cpu().numpy())
            all_true_labels.extend(d1_label.cpu().numpy())

    # 计算 F1 分数和准确率
    f1 = f1_score(all_true_labels, all_predictions, average='weighted')
    accuracy = accuracy_score(all_true_labels, all_predictions)
    print(f"F1 Score: {f1:.4f}, Accuracy: {accuracy:.4f}")

    if return_labels:
        return f1, accuracy, all_predictions, all_true_labels
    return f1, accuracy


In [252]:
# Create data loader for Multi-Task Learning
batch_size = 16

input_ids = torch.from_numpy(np.vstack(df_val['input_ids'].values))
attention_masks = torch.from_numpy(np.vstack(df_val['attention_masks'].values))
d1_label = torch.from_numpy(np.vstack(df_val['CC_Blank'].values))

val_data = torch.utils.data.TensorDataset(input_ids,
                                                attention_masks,
                                                d1_label)

val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, shuffle=True)

In [253]:
f1, accuracy, all_predictions, all_true_labels = evaluate(md, val_loader, return_labels=True)

F1 Score: 0.2809, Accuracy: 0.3438
