In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install pytorch
!pip install sklearn
!pip install thop

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m71.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m67.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m58.1 MB/s[0m eta [36m0:00:0

In [3]:
import torch.nn as nn
import numpy as np
import torch
from transformers import AutoModel
from transformers import AutoTokenizer

class MyModel(nn.Module):
    def __init__(self, embed_dim, journal_size):
        super(MyModel, self).__init__()
        tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",padding ='max_length',max_length = 512,truncation=True)
        self.bert = AutoModel.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")


        # self.tokenizer = AutoTokenizer.from_pretrained("scibert_scivocab_uncased")
        self.atten = nn.MultiheadAttention(embed_dim=embed_dim,num_heads=8,dropout=0.1)
        self.liner_query = nn.Linear(embed_dim, embed_dim)
        self.liner_key = nn.Linear(embed_dim, embed_dim)
        self.liner_value = nn.Linear(embed_dim, embed_dim)
        self.liner1 = nn.Linear(journal_size, journal_size)
        self.relu = nn.ReLU()
        self.liner2 = nn.Linear(journal_size+embed_dim, 2)
        self.softmax = nn.Softmax(dim=1)

    def get_sentence_feature(self, input_ids):
        outputs = self.bert(input_ids)
        pooled_output = outputs[1]
        # print('pooled_output.shape:',pooled_output.shape)
        return pooled_output

    def forward(self, journal, title, abasruct):
#         print('journal.shape',journal.shape)
#         print('title.shape',title.shape)
#         print('abasruct.shape',abasruct.shape)
#         title_vector = self.get_sentence_feature(title).unsqueeze(0)
        abasruct_vector = self.get_sentence_feature(abasruct).unsqueeze(0)
#         print('title.shape',title_vector.shape)
#         print('abasruct.shape',abasruct_vector.shape)
#         title_query_vector, title_key_vector, title_value_vector = self.liner_query(title_vector),self.liner_key(title_vector),self.liner_value(title_vector)
#         title_atten,_ = self.atten(title_query_vector, title_key_vector, title_value_vector)
        abasruct_query_vector, abasruct_key_vector, abasruct_value_vector = self.liner_query(abasruct_vector),self.liner_key(abasruct_vector),self.liner_value(abasruct_vector)
        abasruct_atten,_ = self.atten(abasruct_query_vector, abasruct_key_vector, abasruct_value_vector)
        journal_vector = self.liner1(journal)
        journal_vector = self.relu(journal_vector)
        # print('journal_vector.shape',journal_vector.shape)
#         print('title_atten.shape',title_atten.shape)
#         print('abasruct_atten.shape',abasruct_atten.shape)
        feature = torch.cat((journal_vector,abasruct_atten.squeeze(0)), 1)
        # feature = torch.cat((journal_vector, title_vector, abasruct_vector), 1)
        out = self.liner2(feature)
        output = self.softmax(out)
        return output




In [4]:
import torch.nn.functional as F
def focal_loss(
    inputs: torch.Tensor,
    targets: torch.Tensor,
    alpha: float = 0.80,#0.40
    gamma: float = 2,
    reduction: str = "mean",
) -> torch.Tensor:
    """
    Args:
        inputs: A float tensor of arbitrary shape.
                The predictions which have been sigmod for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs
                (0 for the negative class and 1 for the positive class).
        alpha: (optional) Weighting factor in range (0,1) to balance
                positive vs negative examples. Default = -1 (no weighting).
        gamma: Exponent of the modulating factor (1 - p_t) to
               balance easy vs hard examples.
        reduction: 'none' | 'mean' | 'sum'
                 'none': No reduction will be applied to the output.
                 'mean': The output will be averaged.
                 'sum': The output will be summed.
    Returns:
        Loss tensor with the reduction option applied.
    """
    inputs = inputs.float()
    targets = targets.float()
    p = inputs
    ce_loss = F.binary_cross_entropy(inputs, targets, reduction="none")
    p_t = p * targets + (1 - p) * (1 - targets)
    loss = ce_loss * ((1 - p_t) ** gamma)

    if alpha >= 0:
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss

    if reduction == "mean":
        loss = loss.mean()
    elif reduction == "sum":
        loss = loss.sum()

    return loss

In [9]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import torch.nn as nn
import pandas as pd
import numpy as np
import os
from sklearn.metrics import classification_report, confusion_matrix
from thop import profile
import time


device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'

batch_size = 16
lr = 3e-5
EPOCHS = 18

def shuffle_dataset(journal, title, abstruct, label):
    length = len(journal)
    rng = np.random.default_rng(12345)
    index = np.arange(length)
    # print(index)
    rng.shuffle(index)
    # print(index)
    return journal[index], title[index], abstruct[index], label[index]

def evaluate(model, test_journal, test_title, test_abstruct, test_Y):
    model.eval()
    pred = []
    logits = []
    true_Y = []
    length = len(test_Y)
    with torch.no_grad():
        for i in range(0, len(test_Y), batch_size):
        # for i in range(0, 10, batch_size):
            logit = model(test_journal[i:min(len(test_Y),i+batch_size)], test_title[i:min(len(test_Y),i+batch_size)], test_abstruct[i:min(len(test_Y),i+batch_size)]).cpu()
            logits.extend(logit)
            y_pred = torch.argmax(logit, dim=1).cpu()
            pred.extend(y_pred)
            true_Y.extend(test_Y[i:min(len(test_Y),i+batch_size)].cpu())
        # print(true_Y)
        # print(logits)
        # print(pred)
        true_label = []
        prob = []
        pred_label = []
        for i in range(len(true_Y)):
          true_label.append(true_Y[i].item())
          prob.append(logits[i][1].item())
          pred_label.append(pred[i].item())
        # print(true_label)
        # print(prob)
        # print(pred_label)

        # 计算specificity、sensitivity
        tn, fp, fn, tp = confusion_matrix(true_label, pred_label).ravel()
        specificity = tn / (tn + fp)
        sensitivity = tp / (tp + fn)

    return {
        'label':true_label, 'proba':prob,
        'AUC':roc_auc_score(true_label, prob),
        'classification_report':classification_report(true_label, pred_label, digits=4),
        'specificity': specificity,
        'sensitivity': sensitivity
    }


def main():
    # 按0和1生成训练集和测试集
    X_0 = pd.read_excel('/content/drive/MyDrive/pajo_data/X_0.xlsx').values
    Y_0 = pd.read_excel('/content/drive/MyDrive/pajo_data/Y_0.xlsx').values
    X_1 = pd.read_excel('/content/drive/MyDrive/pajo_data/X_1.xlsx').values
    Y_1 = pd.read_excel('/content/drive/MyDrive/pajo_data/Y_1.xlsx').values
    title_0 = np.load('/content/drive/MyDrive/pajo_data/token_title_0.npy')
    title_1 = np.load('/content/drive/MyDrive/pajo_data/token_title_1.npy')
    abstruct_0 = np.load('/content/drive/MyDrive/pajo_data/token_abstruct_0_512.npy')
    abstruct_1 = np.load('/content/drive/MyDrive/pajo_data/token_abstruct_1_512.npy')


    rs = np.random.RandomState(42)
    L = list(rs.randint(0, len(X_0), int(7/3*len(X_1))))
    X_0 = X_0[L]
    Y_0 = Y_0[L]
    title_0=title_0[L]
    abstruct_0 = abstruct_0[L]

    journal_train_X_0, journal_test_X_0, train_Y_0,test_Y_0 = train_test_split(X_0, Y_0, train_size=0.80, random_state=42)
    journal_train_X_1, journal_test_X_1, train_Y_1, test_Y_1 = train_test_split(X_1, Y_1, train_size=0.80, random_state=42)
    title_train_X_0, title_test_X_0, _,_ = train_test_split(title_0, Y_0, train_size=0.80, random_state=42)
    title_train_X_1, title_test_X_1, _,_ = train_test_split(title_1, Y_1, train_size=0.80, random_state=42)
    abstruct_train_X_0, abstruct_test_X_0, _,_ = train_test_split(abstruct_0, Y_0, train_size=0.80, random_state=42)
    abstruct_train_X_1, abstruct_test_X_1, _,_ = train_test_split(abstruct_1, Y_1, train_size=0.80, random_state=42)

    test_journal = torch.from_numpy(np.vstack((journal_test_X_1, journal_test_X_0))).float().to(device)
    test_title = torch.from_numpy(np.vstack((title_test_X_1, title_test_X_0))).to(device)
    test_abstruct = torch.from_numpy(np.vstack((abstruct_test_X_1, abstruct_test_X_0))).to(device)
    test_Y = torch.from_numpy(np.vstack((test_Y_1, test_Y_0))).to(device)
    test_journal, test_title, test_abstruct, test_Y = shuffle_dataset(test_journal, test_title, test_abstruct, test_Y)


    model = MyModel(embed_dim=768, journal_size=test_journal.shape[1]).to(device)
#     model.load_state_dict(torch.load("./res/new_model_2_0.75.pt"))

    optimizer = AdamW(model.parameters(), lr=lr)

    journal_train_X_0 = torch.from_numpy(journal_train_X_0).float().to(device)
    title_train_X_0 = torch.from_numpy(title_train_X_0).to(device)
    abstruct_train_X_0 = torch.from_numpy(abstruct_train_X_0).to(device)
    train_Y_0 = torch.from_numpy(train_Y_0).to(device)

    journal_train_X_1 = torch.from_numpy(journal_train_X_1).float().float().to(device)
    title_train_X_1 = torch.from_numpy(title_train_X_1).to(device)
    abstruct_train_X_1 = torch.from_numpy(abstruct_train_X_1).to(device)
    train_Y_1 = torch.from_numpy(train_Y_1).to(device)

    for epoch in range(EPOCHS):
        model.train()
        start_time = time.time()
        journal_train_X_0, title_train_X_0, abstruct_train_X_0, train_Y_0 = shuffle_dataset(journal_train_X_0, title_train_X_0, abstruct_train_X_0, train_Y_0)
        train_journal = torch.cat((journal_train_X_1, journal_train_X_0))
        # print('journal_train_X_1.shape:',journal_train_X_1.shape)
        train_title = torch.cat((title_train_X_1, title_train_X_0))
        train_abstruct = torch.cat((abstruct_train_X_1, abstruct_train_X_0))
        train_Y = torch.cat((train_Y_1, train_Y_0))
        train_journal, train_title, train_abstruct, train_Y = shuffle_dataset(train_journal, train_title, train_abstruct, train_Y)

        for i in range(0, len(train_Y), batch_size):
        # for i in range(0, 10, batch_size):
            model.zero_grad()
            logits = model(train_journal[i:min(len(train_Y),i+batch_size)], train_title[i:min(len(train_Y),i+batch_size)], train_abstruct[i:min(len(train_Y),i+batch_size)])
            loss = focal_loss(logits[:,1].unsqueeze(1), train_Y[i:i+batch_size])
            loss.backward()
            if((i/batch_size)%20 == 0):
              print('i = ', i)
              print('loss = ', loss.item())
            optimizer.step()

        end_time = time.time()

        # 计算每秒钟的训练样本数（即MLOPS）
        num_samples = len(train_Y)
        training_time = end_time - start_time
        samples_per_second = num_samples / training_time

        if(epoch%3 == 0):

              print('evaluating...')

              val_metrics = evaluate(model, test_journal, test_title, test_abstruct, test_Y)

              print('epochs:',epoch)
              print('PAJO-AJ AUC = ',val_metrics['AUC'])
              print('Specificity = ', val_metrics['specificity'])
              print('Sensitivity = ', val_metrics['sensitivity'])
              # 输出模型信息
              flops, params = profile(model, inputs=(train_journal[0:batch_size], train_title[0:batch_size], train_abstruct[0:batch_size]))
              print("Flops: {:.2f}".format(flops))
              print("MLOPS: {:.2f}".format(samples_per_second))
              print(val_metrics['classification_report'])

              torch.save(model.state_dict(), './res/abs_jou_pumbed_2_0.80_{}.pt'.format(epoch))


if __name__ == '__main__':
    if not os.path.exists('./res'):
        os.mkdir('./res')
    main()


Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


i =  0
loss =  0.08098867535591125
i =  320
loss =  0.06670258939266205
i =  640
loss =  0.05806796997785568
i =  960
loss =  0.05874984711408615
i =  1280
loss =  0.03966303914785385
i =  1600
loss =  0.05389435961842537
i =  1920
loss =  0.03696205094456673
i =  2240
loss =  0.03640926256775856
i =  2560
loss =  0.03566804155707359
evaluating...
epochs: 0
PAJO-AJ AUC =  0.8851690375415034
Specificity =  0.8038379530916845
Sensitivity =  0.8258706467661692
[INFO] Register count_normalization() for <class 'torch.nn.modules.normalization.LayerNorm'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.dropout.Dropout'>.
[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.
[INFO] Register zero_ops() for <class 'torch.nn.modules.activation.ReLU'>.
[INFO] Register count_softmax() for <class 'torch.nn.modules.activation.Softmax'>.
Flops: 696451625552.00
MLOPS: 9.88
              precision    recall  f1-score   support

           0     0.9150    0.8038    0.8558 

In [None]:
from sklearn import metrics
import numpy as np
import warnings
warnings.filterwarnings(action = 'ignore')

def best_yuzhi_aimed_at_1(preda,y_test):
    #preda为预测为1类的概率,输入形式为narray
    precisions = []
    recalls = []
    for i in np.arange(0.01,1,0.01):
        y_pred = np.where(preda<i,0,1)
        TN,FP,FN,TP = metrics.confusion_matrix(y_test,y_pred).ravel()
        precision = TP/(TP+FP)
        recall = TP/(TP+FN)
        precisions.append(precision)
        recalls.append(recall)
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    f1_scores = (2 * precisions * recalls) / (precisions + recalls)
    best_f1_score = np.max(f1_scores[np.isfinite(f1_scores)])
    best_f1_score_index = np.argmax(f1_scores[np.isfinite(f1_scores)])
    return best_f1_score, np.arange(0.01,1,0.01)[best_f1_score_index]
#调用示例
y_test=np.array([0,0,1,1,1,0,1,0,1,0,1,1])
preda = np.array([0.2,0.3,0.4,0.44,0.45,0.56,0.3,0.1,0.7,0.9,0.13,0.5])
print(best_yuzhi_aimed_at_1(preda,y_test))
#输出
(0.7777777777777778, 0.11)#（1类最佳f1值和对应的阈值）

In [None]:
from matplotlib import pyplot as plt
def plot_picture(y_test,probas):
    # y_test测试集
    # probas预测概率
    CVD = pd.DataFrame()
    CVD['正样本'] = y_test
    CVD['score'] = probas
    CVD = CVD.sort_values(by='score')
    cvd_risk = CVD.reset_index(drop=True)
    print(cvd_risk)
    H = len(cvd_risk)
    h = int(H / 10)
    cvd = []
    risk = []
    h0 = 0
    risk_count = 0
    cvd_count = 0
    for i in range(len(cvd_risk)):
        if h0 + h > i + 1:
            risk_count = risk_count + cvd_risk.loc[i, "score"]
            if cvd_risk.loc[i, "正样本"] == 1:
                cvd_count = cvd_count + 1
        else:
            h0 = h0 + h
            cvd.append(round(cvd_count / h, 3))
            risk.append(round(risk_count / h, 3))
            risk_count = 0
            cvd_count = 0

    labels = ['10', '9', '8', '7', '6', '5', '4', '3', '2', '1']
    cvd.reverse()

    x = np.arange(len(labels))  # the label locations
    width = 0.8  # the width of the bars

    fig, ax = plt.subplots()
    rects1 = ax.bar(x, cvd, width, color='royalblue')
    # rects2 = ax.bar(x + width / 2, risk, width, label='Estimated', color='indianred')

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel('Proportion of Positive Samples')
    ax.set_xlabel('Decile of Estimated Score')
    # ax.set_title('Observed Vs Estimated')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()
    plt.axhline(y=np.mean(cvd),  linestyle='--', color='black')
    def autolabel(rects):
        """Attach a text label above each bar in *rects*, displaying its height."""
        for rect in rects:
            height = rect.get_height()
            ax.annotate('{}'.format(height),
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 2),  # 3 points vertical offset
                        textcoords="offset points",
                        ha='center', va='bottom')

    autolabel(rects1)
    # autolabel(rects2)
    fig.tight_layout()
    plt.savefig("柱状图.png")
    plt.show()


In [None]:
def yuzhi(preda,door=0.1):
    predict=[]
    for i in range(len(preda)):
        if preda[i] < door:
            predict.append(0)
        else:
            predict.append(1)
    return predict

In [None]:
#按阈值最大
from sklearn.metrics import roc_auc_score,roc_curve,auc
from numpy import argmax
def find_optimal_cutoff(tpr,fpr,threshold):
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = threshold[optimal_idx]
    return optimal_threshold

def best_confusion_matrix(y_test, y_test_predprob):
    """
        根据真实值和预测值（预测概率）的向量来计算混淆矩阵和最优的划分阈值

        Args:
            y_test:真实值
            y_test_predprob：预测值

        Returns:
            返回最佳划分阈值和混淆矩阵
        """
    fpr, tpr, thresholds = roc_curve(y_test, y_test_predprob, pos_label=1)
    cutoff = find_optimal_cutoff(tpr,fpr,thresholds)
    y_pred = yuzhi(y_test_predprob,cutoff)
    print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))
    print(metrics.classification_report(y_true=y_test, y_pred=y_pred))
    TN,FP,FN,TP = metrics.confusion_matrix(y_test,y_pred).ravel()
    return cutoff,TN,FN,FP,TP
best_confusion_matrix(test_Y_input,preda)

In [None]:
precisions, recalls, thresholds = precision_recall_curve(test_Y_input,predict)

# 拿到最优结果以及索引
f1_scores = (2 * precisions * recalls) / (precisions + recalls)
best_f1_score = np.max(f1_scores[np.isfinite(f1_scores)])
best_f1_score_index = np.argmax(f1_scores[np.isfinite(f1_scores)])

# 阈值
best_f1_score, thresholds[best_f1_score_index]

In [None]:
!unzip data.zip

Archive:  data.zip
   creating: data/
  inflating: data/token_abstruct_0_400.npy  
  inflating: data/token_abstruct_1_400.npy  
  inflating: data/token_title_0.npy  
  inflating: data/token_title_1.npy  
  inflating: data/X_0.xlsx           
  inflating: data/X_1.xlsx           
  inflating: data/Y_0.xlsx           
  inflating: data/Y_1.xlsx           
   creating: data/__pycache__/
  inflating: data/__pycache__/dataset.cpython-37.pyc  


In [None]:
!pip install pytorch_metric_learning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_metric_learning
  Downloading pytorch_metric_learning-1.5.2-py3-none-any.whl (111 kB)
[K     |████████████████████████████████| 111 kB 5.0 MB/s 
Installing collected packages: pytorch-metric-learning
Successfully installed pytorch-metric-learning-1.5.2


In [None]:
!pip install sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 47.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

In [None]:
!unzip data.zip

Archive:  data.zip
   creating: data/
  inflating: data/token_abstruct_0_400.npy  
  inflating: data/token_abstruct_1_400.npy  
  inflating: data/token_title_0.npy  
  inflating: data/token_title_1.npy  
  inflating: data/X_0.xlsx           
  inflating: data/X_1.xlsx           
  inflating: data/Y_0.xlsx           
  inflating: data/Y_1.xlsx           
   creating: data/__pycache__/
  inflating: data/__pycache__/dataset.cpython-37.pyc  
