### Load Data

In [61]:
# Import Packages
import numpy as np
import pandas as pd

# Load original data in pandas
data_tran = pd.read_json('data2/data_tran.json', orient='records', lines=True)
data_test = pd.read_json('data2/data_test.json', orient='records', lines=True)

n_tran = data_tran.shape[0]
n_test = data_test.shape[0]

# Load feature data in numpy
x_tran_coauthors = np.load("data2/x_tran_coauthors.npy")
x_tran_venue_a   = np.load("data2/x_tran_venue_a.npy")
x_tran_venue_b   = np.load("data2/x_tran_venue_b.npy")
x_tran_text_a    = np.load("data2/x_tran_text_a.npy")
x_tran_text_b    = np.load("data2/x_tran_text_b.npy")

x_test_coauthors = np.load("data2/x_test_coauthors.npy")
x_test_venue_a   = np.load("data2/x_test_venue_a.npy")
x_test_venue_b   = np.load("data2/x_test_venue_b.npy")
x_test_text_a    = np.load("data2/x_test_text_a.npy")
x_test_text_b    = np.load("data2/x_test_text_b.npy")

x_tran_title_doc2vec = np.load('data2/x_tran_title_doc2vec.npy')
x_test_title_doc2vec = np.load('data2/x_test_title_doc2vec.npy')

x_tran_abstract_doc2vec = np.load('data2/x_tran_abstract_doc2vec.npy')
x_test_abstract_doc2vec = np.load('data2/x_test_abstract_doc2vec.npy')

y_tran_basic = np.load("data2/y_tran.npy")
x_tran_basic = np.concatenate((x_tran_coauthors, x_tran_venue_a, x_tran_venue_b, x_tran_text_a, x_tran_text_b), axis=1)
x_test_basic = np.concatenate((x_test_coauthors, x_test_venue_a, x_test_venue_b, x_test_text_a, x_test_text_b), axis=1)

In [62]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Train-validation split
idxs = np.random.permutation(n_tran)
inxs_prop = int(0.85 * n_tran)
idxs_tran_indices = idxs[:inxs_prop]
idxs_vald_indices = idxs[inxs_prop:]

x_tran_a = torch.tensor(x_tran_basic[idxs_tran_indices], dtype=torch.float32).to(device)
x_vald_a = torch.tensor(x_tran_basic[idxs_vald_indices], dtype=torch.float32).to(device)
x_test_a = torch.tensor(x_test_basic, dtype=torch.float32).to(device)

y_tran   = torch.tensor(y_tran_basic[idxs_tran_indices], dtype=torch.float32).to(device)
y_vald   = torch.tensor(y_tran_basic[idxs_vald_indices], dtype=torch.float32).to(device)
y_test   = torch.zeros((x_test_a.shape[0], y_tran.shape[1]), dtype=torch.float32)

data_tran_new = data_tran.iloc[idxs_tran_indices].reset_index(drop=True)
data_vald_new = data_tran.iloc[idxs_vald_indices].reset_index(drop=True)
data_test_new = data_test

In [63]:
import json

with open("data2/x_tran_title_word_vectors.json", "r") as f:
    word_vectors_dict_title = json.load(f)

with open("data2/x_tran_abstract_word_vectors.json", "r") as f:
    word_vectors_dict_abstract = json.load(f)

word_vectors_dict_title = {word: np.array(vector) for word, vector in word_vectors_dict_title.items()}
word_vectors_dict_abstract = {word: np.array(vector) for word, vector in word_vectors_dict_abstract.items()}

### Dataset & DataLoader

In [64]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

class ModelDataset(Dataset):
    def __init__(self, x1, y, data, word_vectors_dict_title, word_vectors_dict_abstract):
        self.x1 = x1
        self.y  = y
        self.data = data
        self.word_vectors_dict_title    = word_vectors_dict_title
        self.word_vectors_dict_abstract = word_vectors_dict_abstract

    def __len__(self):
        return len(self.data)

    def text_to_vector(self, text, word_vectors_dict):
        vectors = [word_vectors_dict.get(word, np.zeros_like(next(iter(word_vectors_dict.values())))) for word in text.split()]
        return torch.tensor(np.stack(vectors), dtype=torch.float32)

    def __getitem__(self, idx):
        
        x1 = self.x1[idx].clone().detach().float()
        x2 = self.text_to_vector(self.data.iloc[idx]['title_text'],    self.word_vectors_dict_title).to(device)
        x3 = self.text_to_vector(self.data.iloc[idx]['abstract_text'], self.word_vectors_dict_abstract).to(device)

        y = self.y[idx].clone().detach().float()
        
        return x1, x2, x3, y


In [65]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # 解包每个 batch 元素：x1, x2, x3, y
    x1_batch, x2_batch, x3_batch, y_batch = zip(*batch)

    # 将 x1 转为张量 (基础特征)
    x1_batch = torch.stack(x1_batch)

    # 使用 pad_sequence 填充 x2 和 x3
    x2_batch = pad_sequence(x2_batch, batch_first=True)
    x3_batch = pad_sequence(x3_batch, batch_first=True)

    # 计算 x2 和 x3 的长度
    x2_lengths = torch.tensor([len(seq) for seq in x2_batch])
    x3_lengths = torch.tensor([len(seq) for seq in x3_batch])

    # 将 y 转为张量
    y_batch = torch.stack(y_batch)

    return x1_batch, x2_batch, x3_batch, x2_lengths, x3_lengths, y_batch

In [66]:
dataset_tran = ModelDataset(x_tran_a, y_tran, data_tran_new, word_vectors_dict_title, word_vectors_dict_abstract)
dataset_vald = ModelDataset(x_vald_a, y_vald, data_vald_new, word_vectors_dict_title, word_vectors_dict_abstract)
dataset_test = ModelDataset(x_test_a, y_test, data_test_new, word_vectors_dict_title, word_vectors_dict_abstract)

datalod_tran = DataLoader(dataset_tran, batch_size=32, shuffle=True, collate_fn=collate_fn)
datalod_vald = DataLoader(dataset_vald, batch_size=32, shuffle=True, collate_fn=collate_fn)
datalod_test = DataLoader(dataset_test, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [83]:
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils

class FNNLSTM(nn.Module):
    def __init__(self, input_dim1, output_dim):
        super(FNNLSTM, self).__init__()

        # 定义 model1 的全连接层
        self.model1 = nn.Sequential(
            nn.Linear(input_dim1, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 50),
            nn.ReLU()
        )

        # 定义 LSTM 层
        self.lstm_title = nn.LSTM(input_size=50, hidden_size=50, batch_first=True)
        self.lstm_abstract = nn.LSTM(input_size=100, hidden_size=100, batch_first=True)

        # 定义输出层
        self.fc = nn.Linear(200, output_dim)  # 50 + 100 + 50 = 200
        self.sigmoid = nn.Sigmoid()

    def forward(self, x1, x2, x2_lengths, x3, x3_lengths):
        # model1 前向传播
        x1_out = self.model1(x1)  # shape: (batch_size, 50)

        # 对 x2 进行 pack，并通过 LSTM
        x2_packed = rnn_utils.pack_padded_sequence(x2, x2_lengths, batch_first=True, enforce_sorted=False)
        x2_out, (x2_hidden, _) = self.lstm_title(x2_packed)
        x2_hidden = x2_hidden[-1]  # 获取最后一层的 hidden state

        # 对 x3 进行 pack，并通过 LSTM
        x3_packed = rnn_utils.pack_padded_sequence(x3, x3_lengths, batch_first=True, enforce_sorted=False)
        x3_out, (x3_hidden, _) = self.lstm_abstract(x3_packed)
        x3_hidden = x3_hidden[-1]  # 获取最后一层的 hidden state

        # 拼接所有特征
        x = torch.cat([x1_out, x2_hidden, x3_hidden], dim=1)  # shape: (batch_size, 200)

        # 通过全连接层
        x = self.fc(x)
        x = self.sigmoid(x)
        return x


In [84]:
input_dim1 = x_tran_a.shape[1]
output_dim = y_tran.shape[1] 

model = FNNLSTM(input_dim1, output_dim).to(device)
criterion = nn.BCELoss() 
optimizer = optim.Adam(model.parameters(), lr=0.00001, weight_decay=0.00001)

In [85]:
from sklearn.metrics import precision_score, recall_score, f1_score

def calculate_metrics(pred_label, true_label):
    pred_label = pred_label.int()
    true_label = true_label.int()
    pc = precision_score(true_label.cpu(), pred_label.cpu(), average='macro', zero_division=0)
    rc = recall_score(true_label.cpu(), pred_label.cpu(), average='macro', zero_division=0)
    f1 = f1_score(true_label.cpu(), pred_label.cpu(), average='macro', zero_division=0)
    return pc, rc, f1

In [86]:
class EarlyStopping:
    def __init__(self, patience=10, delta=0.001):
        self.patience = patience
        self.delta = delta
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, train_loss):
        if self.best_loss is None or train_loss < self.best_loss - self.delta:
            self.best_loss = train_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

early_stopping = EarlyStopping(patience=10, delta=0.001)

In [87]:
# 开始训练
epochs = 50000

for epoch in range(epochs):
    model.train() 
    total_loss = 0 

    for batch in datalod_tran:
        x1, x2, x3, x2_lengths, x3_lengths, y = batch
        x1, x2, x3, y = x1.to(device), x2.to(device), x3.to(device), y.to(device)

        # 前向传播
        outputs = model(x1, x2, x2_lengths, x3, x3_lengths)

        # 前向传播
        outputs = model(x1, x2, x3)
        loss = criterion(outputs, y.float())

        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(datalod_tran)  # 计算平均损失

    # 每 100 个 epoch 打印训练和验证结果
    if (epoch + 1) % 100 == 0:
        model.eval()  # 设置为评估模式

        y_tran_pred_prob, y_tran_pred_labl = [], []
        y_vald_pred_prob, y_vald_pred_labl = [], []

        with torch.no_grad():
            # 遍历训练数据集
            for batch in datalod_tran:
                x1_tran, x2_tran, x3_tran, y_tran_batch = batch
                x1_tran, x2_tran, x3_tran, y_tran_batch = (
                    x1_tran.to(device), x2_tran.to(device), 
                    x3_tran.to(device), y_tran_batch.to(device)
                )

                tran_outputs = model(x1_tran, x2_tran, x3_tran)
                y_tran_pred_prob.append(tran_outputs.cpu())
                y_tran_pred_labl.append(y_tran_batch.cpu())

            # 遍历验证数据集
            for batch in datalod_vald:
                x1_vald, x2_vald, x3_vald, y_vald_batch = batch
                x1_vald, x2_vald, x3_vald, y_vald_batch = (
                    x1_vald.to(device), x2_vald.to(device), 
                    x3_vald.to(device), y_vald_batch.to(device)
                )

                vald_outputs = model(x1_vald, x2_vald, x3_vald)
                y_vald_pred_prob.append(vald_outputs.cpu())
                y_vald_pred_labl.append(y_vald_batch.cpu())

            # 合并所有批次的结果
            y_tran_pred_prob = torch.cat(y_tran_pred_prob, dim=0)
            y_tran_pred_labl = torch.cat(y_tran_pred_labl, dim=0)
            y_vald_pred_prob = torch.cat(y_vald_pred_prob, dim=0)
            y_vald_pred_labl = torch.cat(y_vald_pred_labl, dim=0)

            # 计算指标
            tran_pc, tran_rc, tran_f1 = calculate_metrics(
                (y_tran_pred_prob > 0.5).int(), y_tran_pred_labl
            )
            vald_pc, vald_rc, vald_f1 = calculate_metrics(
                (y_vald_pred_prob > 0.5).int(), y_vald_pred_labl
            )

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")
        print(f"Train - Precision: {tran_pc:.4f}, Recall: {tran_rc:.4f}, F1 Score: {tran_f1:.4f}")
        print(f"Val   - Precision: {vald_pc:.4f}, Recall: {vald_rc:.4f}, F1 Score: {vald_f1:.4f}")
        print()

        early_stopping(avg_loss)
        if early_stopping.early_stop:
            print("Early Stop!")
            break

# 测试集预测
model.eval()
y_test_pred_prob, y_test_pred_labl = [], []

with torch.no_grad():
    for batch in datalod_test:
        x1_test, x2_test, x3_test, _ = batch  # 测试集无标签
        x1_test, x2_test, x3_test = (
            x1_test.to(device), x2_test.to(device), x3_test.to(device)
        )

        test_outputs = model(x1_test, x2_test, x3_test)
        y_test_pred_prob.append(test_outputs.cpu())

    # 合并所有批次的结果
    y_test_pred_prob = torch.cat(y_test_pred_prob, dim=0)
    y_test_pred_labl = (y_test_pred_prob > 0.5).int()

# 打印测试集预测结果的形状
print(f"Test Prediction Shape: {y_test_pred_labl.shape}")


RuntimeError: shape '[20000, 1]' is invalid for input of size 10000

In [77]:
x3_lengths

tensor([484, 484, 484, 484, 484, 484, 484, 484, 484, 484, 484, 484, 484, 484,
        484, 484, 484, 484, 484, 484, 484, 484, 484, 484, 484, 484, 484, 484,
        484, 484, 484, 484])

In [None]:
def generate_output_csv(x_test_a, y_test_pred_labl):
    
    result = []
    
    for i, row in enumerate(y_test_pred_labl):
        if ((x_test_a[i, :100] < 1).all() or (x_test_a[i, 100:200] == 0).all() or (x_test_a[i, 200:300] == 0).all() or (x_test_a[i, 300:400] == 0).all() or (x_test_a[i, 400:500] == 0).all()):
            result.append("-1")
        elif row.sum() == 0 or row[100] == 1:
            result.append("-1")
        else:
            indices = [str(idx) for idx, val in enumerate(row) if val == 1]
            result.append(" ".join(indices))
    
    result_df = pd.DataFrame({"ID": range(len(result)), "Predict": result})
    
    return result_df

generate_output_csv(x_test_a, y_test_pred_labl).to_csv("result_method2.csv", index=False)