In [1]:
import torch.nn as nn
import numpy as np
import torch
from transformers import AutoModel
from transformers import AutoTokenizer

class MyModel(nn.Module):
    def __init__(self, embed_dim, journal_size):
        super(MyModel, self).__init__()
        self.bert = AutoModel.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
        
        # self.tokenizer = AutoTokenizer.from_pretrained("scibert_scivocab_uncased")
        self.atten = nn.MultiheadAttention(embed_dim=embed_dim,num_heads=8,dropout=0.1)
        self.liner_query = nn.Linear(embed_dim, embed_dim)
        self.liner_key = nn.Linear(embed_dim, embed_dim)
        self.liner_value = nn.Linear(embed_dim, embed_dim)
        self.liner1 = nn.Linear(journal_size, journal_size)
        self.relu = nn.ReLU()
        self.liner2 = nn.Linear(journal_size + embed_dim*2, 2)
        self.softmax = nn.Softmax(dim=1)

    def get_sentence_feature(self, input_ids):
        outputs = self.bert(input_ids)
        pooled_output = outputs[1]
        # print('pooled_output.shape:',pooled_output.shape)
        return pooled_output

    def forward(self, journal, title, abasruct):
#         print('journal.shape',journal.shape)
#         print('title.shape',title.shape)
#         print('abasruct.shape',abasruct.shape)
        title_vector = self.get_sentence_feature(title).unsqueeze(0)
        abasruct_vector = self.get_sentence_feature(abasruct).unsqueeze(0)
#         print('title.shape',title_vector.shape)
#         print('abasruct.shape',abasruct_vector.shape)
#         title_query_vector, title_key_vector, title_value_vector = self.liner_query(title_vector),self.liner_key(title_vector),self.liner_value(title_vector)
#         title_atten,_ = self.atten(title_query_vector, title_key_vector, title_value_vector)
#         abasruct_query_vector, abasruct_key_vector, abasruct_value_vector = self.liner_query(abasruct_vector),self.liner_key(abasruct_vector),self.liner_value(abasruct_vector)
#         abasruct_atten,_ = self.atten(abasruct_query_vector, abasruct_key_vector, abasruct_value_vector)
        journal_vector = self.liner1(journal)
        journal_vector = self.relu(journal_vector)
        # print('journal_vector.shape',journal_vector.shape)
#         print('title_atten.shape',title_atten.shape)
#         print('abasruct_atten.shape',abasruct_atten.shape)
        feature = torch.cat((journal_vector, title_vector.squeeze(0), abasruct_vector.squeeze(0)), 1)
        # feature = torch.cat((journal_vector, title_vector, abasruct_vector), 1)
        out = self.liner2(feature)
        output = self.softmax(out)
        return output




In [2]:
!nvidia-smi

Tue Jul 25 06:16:03 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.102.04   Driver Version: 450.102.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  On   | 00000000:00:08.0 Off |                    0 |
| N/A   30C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  On   | 00000000:00:09.0 Off |                    0 |
| N/A   25C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Defaul

In [3]:
import torch.nn.functional as F
def focal_loss(
    inputs: torch.Tensor,
    targets: torch.Tensor,
    alpha: float = 0.80,#0.40
    gamma: float = 2,
    reduction: str = "mean",
) -> torch.Tensor:
    """
    Args:
        inputs: A float tensor of arbitrary shape.
                The predictions which have been sigmod for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs
                (0 for the negative class and 1 for the positive class).
        alpha: (optional) Weighting factor in range (0,1) to balance
                positive vs negative examples. Default = -1 (no weighting).
        gamma: Exponent of the modulating factor (1 - p_t) to
               balance easy vs hard examples.
        reduction: 'none' | 'mean' | 'sum'
                 'none': No reduction will be applied to the output.
                 'mean': The output will be averaged.
                 'sum': The output will be summed.
    Returns:
        Loss tensor with the reduction option applied.
    """
    inputs = inputs.float()
    targets = targets.float()
    p = inputs
    ce_loss = F.binary_cross_entropy(inputs, targets, reduction="none")
    p_t = p * targets + (1 - p) * (1 - targets)
    loss = ce_loss * ((1 - p_t) ** gamma)

    if alpha >= 0:
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss

    if reduction == "mean":
        loss = loss.mean()
    elif reduction == "sum":
        loss = loss.sum()

    return loss

In [None]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import torch.nn as nn
import pandas as pd
import numpy as np
import os
from sklearn.metrics import classification_report


device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'

batch_size = 8
lr = 2e-5
EPOCHS = 18

def shuffle_dataset(journal, title, abstruct, label):
    length = len(journal)
    rng = np.random.default_rng(12345)
    index = np.arange(length)
    # print(index)
    rng.shuffle(index)
    # print(index)
    return journal[index], title[index], abstruct[index], label[index]

def evaluate(model, test_journal, test_title, test_abstruct, test_Y):
    model.eval()
    pred = []
    logits = []
    true_Y = []
    length = len(test_Y)
    with torch.no_grad():
        for i in range(0, len(test_Y), batch_size):
        # for i in range(0, 10, batch_size):
            logit = model(test_journal[i:min(len(test_Y),i+batch_size)], test_title[i:min(len(test_Y),i+batch_size)], test_abstruct[i:min(len(test_Y),i+batch_size)]).cpu()
            logits.extend(logit)
            y_pred = torch.argmax(logit, dim=1).cpu()
            pred.extend(y_pred)
            true_Y.extend(test_Y[i:min(len(test_Y),i+batch_size)].cpu())
        # print(true_Y)
        # print(logits)
        # print(pred)
        true_label = []
        prob = []
        pred_label = []
        for i in range(len(true_Y)):
          true_label.append(true_Y[i].item())
          prob.append(logits[i][1].item())
          pred_label.append(pred[i].item())
        # print(true_label)
        # print(prob)
        # print(pred_label)

    return {
        'label':true_label, 'proba':prob,
        'AUC':roc_auc_score(true_label, prob),
        'classification_report':classification_report(true_label, pred_label,digits=4)
    }


def main():
    # 按0和1生成训练集和测试集
    X_0 = pd.read_excel('no_key_data/X_0.xlsx').values
    Y_0 = pd.read_excel('no_key_data/Y_0.xlsx').values
    X_1 = pd.read_excel('no_key_data/X_1.xlsx').values
    Y_1 = pd.read_excel('no_key_data/Y_1.xlsx').values
    title_0 = np.load('no_key_data/token_title_0.npy')
    title_1 = np.load('no_key_data/token_title_1.npy')
    abstruct_0 = np.load('no_key_data/token_abstruct_0_512.npy')
    abstruct_1 = np.load('no_key_data/token_abstruct_1_512.npy')
    
    rs = np.random.RandomState(42)
    L = list(rs.randint(0, len(X_0), int(7/3*len(X_1))))
    X_0 = X_0[L]
    Y_0 = Y_0[L]
    title_0=title_0[L]
    abstruct_0 = abstruct_0[L]
    
    journal_train_X_0, journal_test_X_0, train_Y_0,test_Y_0 = train_test_split(X_0, Y_0, train_size=0.80, random_state=42)
    journal_train_X_1, journal_test_X_1, train_Y_1, test_Y_1 = train_test_split(X_1, Y_1, train_size=0.80, random_state=42)
    title_train_X_0, title_test_X_0, _,_ = train_test_split(title_0, Y_0, train_size=0.80, random_state=42)
    title_train_X_1, title_test_X_1, _,_ = train_test_split(title_1, Y_1, train_size=0.80, random_state=42)
    abstruct_train_X_0, abstruct_test_X_0, _,_ = train_test_split(abstruct_0, Y_0, train_size=0.80, random_state=42)
    abstruct_train_X_1, abstruct_test_X_1, _,_ = train_test_split(abstruct_1, Y_1, train_size=0.80, random_state=42)
    
    test_journal = torch.from_numpy(np.vstack((journal_test_X_1, journal_test_X_0))).float().to(device)
    test_title = torch.from_numpy(np.vstack((title_test_X_1, title_test_X_0))).to(device)
    test_abstruct = torch.from_numpy(np.vstack((abstruct_test_X_1, abstruct_test_X_0))).to(device)
    test_Y = torch.from_numpy(np.vstack((test_Y_1, test_Y_0))).to(device)
    test_journal, test_title, test_abstruct, test_Y = shuffle_dataset(test_journal, test_title, test_abstruct, test_Y)


    model = MyModel(embed_dim=768, journal_size=test_journal.shape[1]).to(device)
#     model.load_state_dict(torch.load("./res/new_model_2_0.75.pt")) 

    optimizer = AdamW(model.parameters(), lr=lr)

    journal_train_X_0 = torch.from_numpy(journal_train_X_0).float().to(device)
    title_train_X_0 = torch.from_numpy(title_train_X_0).to(device)
    abstruct_train_X_0 = torch.from_numpy(abstruct_train_X_0).to(device)
    train_Y_0 = torch.from_numpy(train_Y_0).to(device)

    journal_train_X_1 = torch.from_numpy(journal_train_X_1).float().float().to(device)
    title_train_X_1 = torch.from_numpy(title_train_X_1).to(device)
    abstruct_train_X_1 = torch.from_numpy(abstruct_train_X_1).to(device)
    train_Y_1 = torch.from_numpy(train_Y_1).to(device)

    for epoch in range(EPOCHS):
        model.train()
        journal_train_X_0, title_train_X_0, abstruct_train_X_0, train_Y_0 = shuffle_dataset(journal_train_X_0, title_train_X_0, abstruct_train_X_0, train_Y_0)
        train_journal = torch.cat((journal_train_X_1, journal_train_X_0))
        # print('journal_train_X_1.shape:',journal_train_X_1.shape)
        train_title = torch.cat((title_train_X_1, title_train_X_0))
        train_abstruct = torch.cat((abstruct_train_X_1, abstruct_train_X_0))
        train_Y = torch.cat((train_Y_1, train_Y_0))
        train_journal, train_title, train_abstruct, train_Y = shuffle_dataset(train_journal, train_title, train_abstruct, train_Y)
        
        for i in range(0, len(train_Y), batch_size):
        # for i in range(0, 10, batch_size):
            model.zero_grad()
            logits = model(train_journal[i:min(len(train_Y),i+batch_size)], train_title[i:min(len(train_Y),i+batch_size)], train_abstruct[i:min(len(train_Y),i+batch_size)])
            loss = focal_loss(logits[:,1].unsqueeze(1), train_Y[i:i+batch_size])
            loss.backward()
            if((i/batch_size)%20 == 0):
              print('i = ', i)
              print('loss = ', loss.item())
            optimizer.step()
        if(epoch%3 == 0):
            
              print('evaluating...')

              val_metrics = evaluate(model, test_journal, test_title, test_abstruct, test_Y)

              print('epochs:',epoch)
              print('AUC = ',val_metrics['AUC'])
              print(val_metrics['classification_report'])

              torch.save(model.state_dict(), './res/baseline_add_{}.pt'.format(epoch))
          

if __name__ == '__main__':
    if not os.path.exists('./res'):
        os.mkdir('./res')
    main()


Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


i =  0
loss =  0.07178083807229996
i =  160
loss =  0.06557983160018921
i =  320
loss =  0.06010665372014046
i =  480
loss =  0.05433017015457153
i =  640
loss =  0.0788726881146431
i =  800
loss =  0.05840091407299042
i =  960
loss =  0.06309198588132858
i =  1120
loss =  0.034366074949502945
i =  1280
loss =  0.04041740298271179
i =  1440
loss =  0.06364269554615021
i =  1600
loss =  0.049605630338191986
i =  1760
loss =  0.06821352243423462
i =  1920
loss =  0.031270116567611694
i =  2080
loss =  0.036096442490816116
i =  2240
loss =  0.06218455359339714
i =  2400
loss =  0.050485312938690186
i =  2560
loss =  0.02544092759490013
evaluating...
epochs: 0
AUC =  0.9004126489089732
             precision    recall  f1-score   support

          0     0.9736    0.5501    0.7030       469
          1     0.4790    0.9652    0.6403       201

avg / total     0.8252    0.6746    0.6842       670

i =  0
loss =  0.041654523462057114
i =  160
loss =  0.06687920540571213
i =  320
loss =  0.02

i =  2400
loss =  3.416671461309306e-05
i =  2560
loss =  8.36934668768663e-06
i =  0
loss =  5.941875497228466e-05
i =  160
loss =  7.186869424913311e-06
i =  320
loss =  4.613743385561975e-06
i =  480
loss =  3.783064676099457e-05
i =  640
loss =  6.646783731412143e-05
i =  800
loss =  1.7306209656453575e-06
i =  960
loss =  0.00035974124330095947
i =  1120
loss =  1.1579713827813976e-05
i =  1280
loss =  1.6781647218522266e-06
i =  1440
loss =  1.6728556147427298e-05
i =  1600
loss =  0.0004579097731038928
i =  1760
loss =  3.527995431795716e-05
i =  1920
loss =  0.0007518625934608281
i =  2080
loss =  3.404631934245117e-05
i =  2240
loss =  0.016193941235542297
i =  2400
loss =  0.00010628430754877627
i =  2560
loss =  1.8887962141889147e-05
i =  0
loss =  0.0002606530033517629
i =  160
loss =  6.142697384348139e-05
i =  320
loss =  0.00010568288416834548
i =  480
loss =  0.0005668308003805578
i =  640
loss =  0.00022354690008796751
i =  800
loss =  0.00016957466141320765
i =  960
