In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

1. Load Data

In [4]:
# load the datasets
df = pd.read_excel("./train.xlsx")

In [5]:
df.head()

Unnamed: 0,name,text,BACE_Blank,DIS_Blank,IGB_Blank,SC_Blank,TASK_Blank,PWR_Blank,CC_Blank
0,01-安东尼·布林肯/2010年9月7日布林肯关于美伊关系的演讲.txt,Celeste gentlemen let me welcome you to as the...,1,2,1,4,4,1,3
1,01-安东尼·布林肯/2014年4月7日布林肯在哥伦比亚大学进行关于人道主义行动的演讲.txt,So let me just make a very quick her welcome t...,2,4,1,2,3,2,4
2,01-安东尼·布林肯/2015布林肯接受印度快报采访.txt,I'm going to be speaking today to antony bilki...,1,3,1,4,3,1,4
3,01-安东尼·布林肯/2016年4月布林肯特别演讲.txt,So I I have a special opportunity to uh huh mi...,1,2,1,3,3,1,2
4,01-安东尼·布林肯/2016年6月30日布林肯发表关于加入CSIS的演讲.txt,Could more.Everyone welcome to the center for ...,1,2,1,4,1,1,3


2. Data Preprocessing

In [6]:
# checking for null values
df.isnull().sum()

name          0
text          0
BACE_Blank    0
DIS_Blank     0
IGB_Blank     0
SC_Blank      0
TASK_Blank    0
PWR_Blank     0
CC_Blank      0
dtype: int64

In [7]:
# tokenize the text using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [81]:
# define helper function for tokenization
def tokenize_text(text):
    return tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        return_token_type_ids=True,
        truncation=True,
        return_tensors='pt'
    )

In [82]:
# tokenize texts in corpus using BERT tokenizer
# 模型的输入将是标记化的文本标记和注意力掩码。
df['input_ids'] = [tokenize_text(text) for text in tqdm(df['text'])]

100%|██████████| 157/157 [00:03<00:00, 47.14it/s]


In [83]:
df.head()

Unnamed: 0,name,text,BACE_Blank,DIS_Blank,IGB_Blank,SC_Blank,TASK_Blank,PWR_Blank,CC_Blank,input_ids
0,01-安东尼·布林肯/2010年9月7日布林肯关于美伊关系的演讲.txt,Celeste gentlemen let me welcome you to as the...,1,2,1,4,4,1,3,"[[tensor(101), tensor(21113), tensor(11218), t..."
1,01-安东尼·布林肯/2014年4月7日布林肯在哥伦比亚大学进行关于人道主义行动的演讲.txt,So let me just make a very quick her welcome t...,2,4,1,2,3,2,4,"[[tensor(101), tensor(2061), tensor(2292), ten..."
2,01-安东尼·布林肯/2015布林肯接受印度快报采访.txt,I'm going to be speaking today to antony bilki...,1,3,1,4,3,1,4,"[[tensor(101), tensor(1045), tensor(1005), ten..."
3,01-安东尼·布林肯/2016年4月布林肯特别演讲.txt,So I I have a special opportunity to uh huh mi...,1,2,1,3,3,1,2,"[[tensor(101), tensor(2061), tensor(1045), ten..."
4,01-安东尼·布林肯/2016年6月30日布林肯发表关于加入CSIS的演讲.txt,Could more.Everyone welcome to the center for ...,1,2,1,4,1,1,3,"[[tensor(101), tensor(2071), tensor(2062), ten..."


In [117]:
# perform train validation split
df_train, df_val = train_test_split(df, test_size=0.2, random_state=2023)

df_train.head()

Unnamed: 0,name,text,BACE_Blank,DIS_Blank,IGB_Blank,SC_Blank,TASK_Blank,PWR_Blank,CC_Blank,input_ids
23,01-安倍晋三/6.txt,Good morning to you all. When President Juncke...,4,1,1,1,2,4,1,"[[tensor(101), tensor(2204), tensor(2851), ten..."
14,01-安倍晋三/17.txt,Thank you very much for your introduction. Pri...,4,1,1,1,1,4,2,"[[tensor(101), tensor(4067), tensor(2017), ten..."
21,01-安倍晋三/27.txt,Happy new year to everyone. I hope that you al...,4,1,1,1,1,4,1,"[[tensor(101), tensor(3407), tensor(2047), ten..."
57,05-菅直人/13.txt,"We, the leaders of Japan, the People's Republi...",1,1,1,4,2,1,1,"[[tensor(101), tensor(2057), tensor(1010), ten..."
132,10-小渕恵三/9.txt,"First, I will tell you about the accident of t...",3,1,1,2,2,4,3,"[[tensor(101), tensor(2034), tensor(1010), ten..."


In [118]:
df_train['DIS_Blank'].value_counts()

DIS_Blank
1    89
4    16
2    12
3     8
Name: count, dtype: int64

In [119]:
df_val['DIS_Blank'].value_counts()

DIS_Blank
1    21
2     7
4     3
3     1
Name: count, dtype: int64

In [120]:
# get tokens from train and validation sets
df_train_tokens = torch.from_numpy(np.vstack(df_train['input_ids']))
df_val_tokens = torch.from_numpy(np.vstack(df_val['input_ids']))
df_train_tokens

tensor([[  101,  2204,  2851,  ...,  2900,  1998,   102],
        [  101,  4067,  2017,  ...,  2005,  2129,   102],
        [  101,  3407,  2047,  ..., 10575,  1998,   102],
        ...,
        [  101,  2061,  1045,  ...,  2057,  2134,   102],
        [  101,  2343, 22072,  ...,  3713,  7580,   102],
        [  101,  2057,  2018,  ...,  3314,  1010,   102]])

In [121]:
# get attention masks from train and validation sets
df_train_attention_masks = torch.where(df_train_tokens!=0, 1, 0)
df_val_attention_masks = torch.where(df_val_tokens!=0, 1, 0)
df_train_attention_masks

tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])

In [122]:
df_train['attention_masks'] = [i for i in df_train_attention_masks]
df_val['attention_masks'] = [i for i in df_val_attention_masks]

df_train.head()

Unnamed: 0,name,text,BACE_Blank,DIS_Blank,IGB_Blank,SC_Blank,TASK_Blank,PWR_Blank,CC_Blank,input_ids,attention_masks
23,01-安倍晋三/6.txt,Good morning to you all. When President Juncke...,4,1,1,1,2,4,1,"[[tensor(101), tensor(2204), tensor(2851), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
14,01-安倍晋三/17.txt,Thank you very much for your introduction. Pri...,4,1,1,1,1,4,2,"[[tensor(101), tensor(4067), tensor(2017), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
21,01-安倍晋三/27.txt,Happy new year to everyone. I hope that you al...,4,1,1,1,1,4,1,"[[tensor(101), tensor(3407), tensor(2047), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
57,05-菅直人/13.txt,"We, the leaders of Japan, the People's Republi...",1,1,1,4,2,1,1,"[[tensor(101), tensor(2057), tensor(1010), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."
132,10-小渕恵三/9.txt,"First, I will tell you about the accident of t...",3,1,1,2,2,4,3,"[[tensor(101), tensor(2034), tensor(1010), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t..."


In [123]:
# set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [124]:
device

device(type='cuda')

3. Model Building

In [125]:
# define the neural network model
class DisasterSentimentModel(nn.Module):
    def __init__(self, num_labels1, num_labels2, num_labels4, num_labels5, num_labels6, num_labels7):
        super(DisasterSentimentModel, self).__init__()
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        for param in self.bert_layer.parameters():
            param.requires_grad = False
        self.dropout_layer = nn.Dropout(0.3)
        
        self.dense_layer1 = nn.Linear(self.bert_layer.config.hidden_size, 512)
        self.classifier_layer1 = nn.Linear(512, num_labels1)
        
        self.dense_layer2 = nn.Linear(self.bert_layer.config.hidden_size, 512)
        self.classifier_layer2 = nn.Linear(512, num_labels2)

        #self.dense_layer3 = nn.Linear(self.bert_layer.config.hidden_size, 512)
        #self.classifier_layer3 = nn.Linear(512, num_labels3)
        
        self.dense_layer4 = nn.Linear(self.bert_layer.config.hidden_size, 512)
        self.classifier_layer4 = nn.Linear(512, num_labels4)

        self.dense_layer5 = nn.Linear(self.bert_layer.config.hidden_size, 512)
        self.classifier_layer5 = nn.Linear(512, num_labels5)

        self.dense_layer6 = nn.Linear(self.bert_layer.config.hidden_size, 512)
        self.classifier_layer6 = nn.Linear(512, num_labels6)

        self.dense_layer7 = nn.Linear(self.bert_layer.config.hidden_size, 512)
        self.classifier_layer7 = nn.Linear(512, num_labels7)

        

    def forward(self, input_ids, attention_mask=None):
        bert_output = self.bert_layer(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bert_output[0]
        pooled_output = pooled_output[:, 0]
        
        output1 = self.dense_layer1(pooled_output)
        output1 = nn.ReLU()(output1)
        output1 = self.dropout_layer(output1)
        output1 = torch.softmax(self.classifier_layer1(output1), dim=-1)
        
        output2 = self.dense_layer2(pooled_output)
        output2 = nn.ReLU()(output2)
        output2 = self.dropout_layer(output2)
        output2 = torch.softmax(self.classifier_layer2(output2), dim=-1)

        #output3 = self.dense_layer1(pooled_output)
        #output3 = nn.ReLU()(output3)
        #output3 = self.dropout_layer(output3)
        #output3 = torch.softmax(self.classifier_layer1(output3), dim=-1)
        
        output4 = self.dense_layer2(pooled_output)
        output4 = nn.ReLU()(output4)
        output4 = self.dropout_layer(output4)
        output4 = torch.softmax(self.classifier_layer2(output4), dim=-1)

        output5 = self.dense_layer1(pooled_output)
        output5 = nn.ReLU()(output5)
        output5 = self.dropout_layer(output5)
        output5 = torch.softmax(self.classifier_layer1(output5), dim=-1)
        
        output6 = self.dense_layer2(pooled_output)
        output6 = nn.ReLU()(output6)
        output6 = self.dropout_layer(output6)
        output6 = torch.softmax(self.classifier_layer2(output6), dim=-1)

        output7 = self.dense_layer2(pooled_output)
        output7 = nn.ReLU()(output7)
        output7 = self.dropout_layer(output7)
        output7 = torch.softmax(self.classifier_layer2(output7), dim=-1)

        #return output1, output2, output3, output4, output5, output6, output7
        return output1, output2, output4, output5, output6, output7

In [126]:
# Create data loader for Multi-Task Learning
batch_size = 16

input_ids = torch.from_numpy(np.vstack(df_train['input_ids'].values))
attention_masks = torch.from_numpy(np.vstack(df_train['attention_masks'].values))
d1_label = torch.from_numpy(np.vstack(df_train['BACE_Blank'].values))
d2_label = torch.from_numpy(np.vstack(df_train['DIS_Blank'].values))
#d3_label = torch.from_numpy(np.vstack(df_train['IGB_Blank'].values))
d4_label = torch.from_numpy(np.vstack(df_train['SC_Blank'].values))
d5_label = torch.from_numpy(np.vstack(df_train['TASK_Blank'].values))
d6_label = torch.from_numpy(np.vstack(df_train['PWR_Blank'].values))
d7_label = torch.from_numpy(np.vstack(df_train['CC_Blank'].values))

train_data = torch.utils.data.TensorDataset(input_ids,
                                                attention_masks,
                                                d1_label,
                                                d2_label,
                                                #d3_label,
                                                d4_label,
                                                d5_label,
                                                d6_label,
                                                d7_label)

train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

4. We want to train M for both T1 and T2 on D12 by minimizing a weighted loss λ1 * l1 + λ2 * l2

In [127]:
def mtl_train(model, dataloader, max_epochs, print_loss=False):
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    for epoch in range(max_epochs):
        model.train()
        total_loss = 0.0

        for batch_idx, batch in enumerate(dataloader):
            batch = tuple(t.to(device) for t in batch)
            #input_ids, attention_masks, d1_label, d2_label, d3_label, d4_label, d5_label, d6_label, d7_label = batch
            input_ids, attention_masks, d1_label, d2_label, d4_label, d5_label, d6_label, d7_label = batch

            # 标签映射
            d1_label, d2_label, d4_label, d5_label, d6_label, d7_label = [
                label - 1 for label in [d1_label, d2_label, d4_label, d5_label, d6_label, d7_label]
            ]
            #d3_label = (d3_label - 1) // 3

            # 前向传播
            outputs = model(input_ids, attention_masks)

            # 计算损失
            losses = []
            for output, label in zip(outputs, [d1_label, d2_label, d4_label, d5_label, d6_label, d7_label]):
                label = label.squeeze().long()  # 确保目标为 1D 且是 LongTensor
                losses.append(loss_fn(output, label))
            
            total_task_loss = sum(losses)

            # 反向传播
            optimizer.zero_grad()
            total_task_loss.backward()
            optimizer.step()

            total_loss += total_task_loss.item()

            if print_loss:
                print(f"Batch {batch_idx + 1}/{len(dataloader)}, Loss: {total_task_loss.item():.4f}")

        print(f"Epoch {epoch + 1}/{max_epochs}, Average Loss: {total_loss / len(dataloader):.4f}")



In [128]:
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm
import torch

def mtl_evaluate(model, dataloader, return_labels=False):
    model.eval()

    # 初始化存储容器，每个任务单独存储预测值和真实标签
    task_preds = {f"d{i}_preds": [] for i in range(1, 8)}
    task_labels = {f"d{i}_labels": [] for i in range(1, 8)}

    # 遍历验证数据集
    for batch in tqdm(dataloader):
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_masks, d1_label, d2_label, d4_label, d5_label, d6_label, d7_label = batch

        # 标签映射
        d1_label, d2_label, d4_label, d5_label, d6_label, d7_label = [
            label - 1 for label in [d1_label, d2_label, d4_label, d5_label, d6_label, d7_label]
        ]
        #d3_label = (d3_label - 1) // 3

        # 前向传播
        with torch.no_grad():
            outputs = model(input_ids, attention_masks)

        # 分别处理每个任务
        labels = [d1_label, d2_label, d4_label, d5_label, d6_label, d7_label]
        for i in range(6):
            task_label = labels[i].flatten()
            valid_mask = ~torch.isnan(task_label)  # 筛选有效标签
            valid_indices = torch.arange(len(task_label)).to(device)[valid_mask]

            if len(valid_indices) > 0:  # 若存在有效样本
                task_output = outputs[i][valid_indices]
                task_pred = torch.argmax(task_output, axis=1).detach().cpu().numpy().tolist()
                # 存储预测值和真实标签
                task_preds[f"d{i+1}_preds"] += task_pred
                task_labels[f"d{i+1}_labels"] += task_label[valid_indices].cpu().numpy().tolist()

    # 计算每个任务的 F1 和 Accuracy
    results = {}
    for i in range(6):
        preds = task_preds[f"d{i+1}_preds"]
        labels = task_labels[f"d{i+1}_labels"]
        
        if len(preds) > 0:
            f1 = f1_score(labels, preds, average='weighted')
            accuracy = accuracy_score(labels, preds)
            results[f"Task {i+1}"] = {"F1 Score": f1, "Accuracy": accuracy}
            print(f"Task {i+1} - F1 Score: {f1:.4f}, Accuracy: {accuracy:.4f}")
        else:
            results[f"Task {i+1}"] = {"F1 Score": None, "Accuracy": None}
            print(f"Task {i+1} - No valid samples for evaluation.")

    # 根据需求返回标签和预测值
    if return_labels:
        return task_labels, task_preds
    else:
        return results


In [129]:
# define new model to train on both d1 and d2 labels
md = DisasterSentimentModel(num_labels1=4, num_labels2=4, num_labels4=4, num_labels5=4, num_labels6=4, num_labels7=4).to(device)
mtl_train(md, train_loader, 10, True)



Batch 1/8, Loss: 8.3519
Batch 2/8, Loss: 8.3201
Batch 3/8, Loss: 8.3042
Batch 4/8, Loss: 8.2552
Batch 5/8, Loss: 8.2509
Batch 6/8, Loss: 8.2192
Batch 7/8, Loss: 8.1710
Batch 8/8, Loss: 8.1510
Epoch 1/10, Average Loss: 8.2529
Batch 1/8, Loss: 8.1326
Batch 2/8, Loss: 8.0668
Batch 3/8, Loss: 8.1360
Batch 4/8, Loss: 8.0968
Batch 5/8, Loss: 8.0006
Batch 6/8, Loss: 8.0366
Batch 7/8, Loss: 7.8501
Batch 8/8, Loss: 8.0341
Epoch 2/10, Average Loss: 8.0442
Batch 1/8, Loss: 7.7954
Batch 2/8, Loss: 7.9753
Batch 3/8, Loss: 8.0188
Batch 4/8, Loss: 8.0438
Batch 5/8, Loss: 7.7973
Batch 6/8, Loss: 7.9398
Batch 7/8, Loss: 7.9486
Batch 8/8, Loss: 8.0289
Epoch 3/10, Average Loss: 7.9435
Batch 1/8, Loss: 7.7845
Batch 2/8, Loss: 7.8501
Batch 3/8, Loss: 7.9170
Batch 4/8, Loss: 8.0477
Batch 5/8, Loss: 7.7496
Batch 6/8, Loss: 7.9188
Batch 7/8, Loss: 8.0713
Batch 8/8, Loss: 7.8701
Epoch 4/10, Average Loss: 7.9011
Batch 1/8, Loss: 7.9100
Batch 2/8, Loss: 7.6671
Batch 3/8, Loss: 7.5833
Batch 4/8, Loss: 8.0146
Batc

In [130]:
# Create data loader for Multi-Task Learning
batch_size = 16

input_ids = torch.from_numpy(np.vstack(df_val['input_ids'].values))
attention_masks = torch.from_numpy(np.vstack(df_val['attention_masks'].values))
d1_label = torch.from_numpy(np.vstack(df_val['BACE_Blank'].values))
d2_label = torch.from_numpy(np.vstack(df_val['DIS_Blank'].values))
#d3_label = torch.from_numpy(np.vstack(df_val['IGB_Blank'].values))
d4_label = torch.from_numpy(np.vstack(df_val['SC_Blank'].values))
d5_label = torch.from_numpy(np.vstack(df_val['TASK_Blank'].values))
d6_label = torch.from_numpy(np.vstack(df_val['PWR_Blank'].values))
d7_label = torch.from_numpy(np.vstack(df_val['CC_Blank'].values))

val_data = torch.utils.data.TensorDataset(input_ids,
                                                attention_masks,
                                                d1_label,
                                                d2_label,
                                                #d3_label,
                                                d4_label,
                                                d5_label,
                                                d6_label,
                                                d7_label)

val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, shuffle=True)

In [131]:
# Calculate accuracy and print the result
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
labels, preds = mtl_evaluate(md, val_loader, return_labels=True)

100%|██████████| 2/2 [00:00<00:00, 34.51it/s]

Task 1 - F1 Score: 0.3717, Accuracy: 0.4062
Task 2 - F1 Score: 0.5200, Accuracy: 0.6562
Task 3 - F1 Score: 0.3333, Accuracy: 0.5000
Task 4 - F1 Score: 0.1792, Accuracy: 0.2500
Task 5 - F1 Score: 0.2347, Accuracy: 0.4062
Task 6 - F1 Score: 0.1000, Accuracy: 0.2500



