In [None]:
from pytorch_lightning  import seed_everything
import numpy as np
import torch

import pandas as pd
from processing_utils import *
from pytorch_widedeep.preprocessing import TabPreprocessor
from pytorch_lightning.loggers import TensorBoardLogger
from models import *
from model_utils import *

# Fix the random seed once, before any data handling or model construction.
seed  = 625
# workers=True is forwarded to Lightning's seed_everything.
# NOTE(review): per the Lightning docs seed_everything already seeds Python, NumPy and
# PyTorch, so the two explicit calls below look redundant — confirm before removing.
seed_everything(seed, workers=True)
np.random.seed(seed)
torch.cuda.manual_seed_all(seed)

In [None]:
# Load the modelling table from a pickle in the working directory.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
df = pd.read_pickle('model_data.pkl')

In [None]:
# Build the entity vocabulary from the full frame.
# NOTE(review): presumably reads the 'words' column, which the next cell pops from df —
# if so this cell must run first; confirm in the helper.
entity_vocab = get_vocab(df)

In [None]:
# Pull the non-tabular inputs and the target out of df. pop() removes each column from
# df in place, so re-running this cell without reloading df raises KeyError.
words = df.pop('words')
# Column '术前诊断' (pre-operative diagnosis); missing entries become the placeholder "无" ("none").
text =  df.pop('术前诊断').fillna("无")
# Column '肺部并发症' (pulmonary complications) is the prediction target, taken as an array.
y = df.pop('肺部并发症').values

In [None]:
# get_text_model returns a pair; `text` is rebound to the second element, so re-running
# this cell feeds the already-processed value back into the helper.
# NOTE(review): presumably pre_model is a pretrained text encoder and the new `text` its
# encoded input — confirm in the helper module.
pre_model, text = get_text_model(text)

In [None]:
# Convert the popped 'words' column to ids with the entity vocabulary.
# NOTE(review): exact output structure is defined by get_entity_id — confirm in the helper.
words_ids = get_entity_id(words, entity_vocab)

In [None]:
# Split the remaining columns of df into two groups.
# NOTE(review): names suggest categorical vs. continuous columns — confirm in cat_cont_split.
cat, cont = cat_cont_split(df) 
# df is rebound to the helper's return value, so the pre-filter frame is no longer reachable.
# NOTE(review): if remove_outliers drops rows, y/words/text (extracted earlier) would no
# longer align with df — confirm it only clips/replaces values.
df = remove_outliers(df, cont) 

In [None]:
# Call `binning` once per position in `cont`, with method "cart" and dtype "numerical".
# The return value is discarded, so the helper presumably updates df in place — TODO confirm.
method = "cart"
for num in range(len(cont)):
    # Re-assigned every iteration; the value never changes.
    dtype="numerical"
    binning(df, cont, num, method, y, dtype)

In [None]:
# Every remaining column of df is passed as an embedding column.
# NOTE(review): `embed_cols` / `for_transformer` look like the older pytorch_widedeep
# keyword names — pin the library version or confirm against the installed release.
tab_preprocessor = TabPreprocessor(embed_cols=df.columns,  
                                    for_transformer=True
                                )
# Fit the preprocessor on the whole frame and encode it in one step.
# NOTE(review): fitting happens before the train/test split in the next cell — confirm
# that is intended.
X_tab = tab_preprocessor.fit_transform(df)

In [None]:
# Split tabular features, text, entity ids and labels into train and test parts.
# NOTE(review): 13904 is an unexplained magic number — presumably the row position of the
# chronological train/test boundary; confirm in time_split and give it a named constant.
X_tab_train, X_tab_test, y_train_valid, y_test, text_train, text_test, words_ids_train, words_ids_test = time_split(X_tab, text, words_ids, y, 13904)

In [None]:
# Hyper-parameters read by the training / evaluation cells below.
# Batch size handed to the data-loader helper, and the epoch count for the trainer.
b_size, epoch = 1024, 500
# Optimiser settings.
lr, weight_decay = 3e-5, 0.01
# NOTE(review): `agd` is passed to get_trainer in the commented-out training code —
# presumably gradient accumulation steps; confirm.
agd = 1
# NOTE(review): get_pt's default checkpoint directory is named "...dropout=0.5" while this
# value is 0.6 — confirm which value the saved checkpoints were trained with.
dropout = 0.6

In [13]:
# Rebuild the 5-fold split and keep one evaluation loader per fold.
# random_state repeats the global seed (625), so the folds are the same on every run.
# NOTE(review): KFold is not imported explicitly in the visible import cell — presumably
# it arrives through one of the star imports; confirm.
kf = KFold(n_splits=5, shuffle = True, random_state = 625)
# `results` is not used by the active code in this cell.
results = []
# Fold counter (1-based); only the commented-out run name below reads it.
n = 0
# One loader per fold, in fold order; later zipped with the checkpoints in get_model_result.
data_loaders = []
for train_index, valid_index in kf.split(X_tab_train):
    
    n += 1

    # Only the third returned loader is kept; the first two are used solely by the
    # commented-out trainer.fit call.
    data_loader_train, data_loader_valid, data_loader_valid_test = get_data_loader(
            X_tab_train, y_train_valid,train_index, valid_index, b_size     
    )
    
    data_loaders.append(data_loader_valid_test)
# Training code, currently disabled (kept for reference):
#     pt_path = "model_checkpoint/Tabular"
#     pt_name = str(n)+"_lr="+str(lr)+"_b_size="+str(b_size)+"_agd="+str(agd)+"_dropout="+str(dropout)
#     logger = TensorBoardLogger(pt_path, name = pt_name)
#     model = Tabular(dropout = dropout, lr = lr,column_idx=tab_preprocessor.column_idx ,embed_input=tab_preprocessor.embeddings_input)
#     trainer = get_trainer(agd, logger, epoch)
#     trainer.fit(model, data_loader_train, data_loader_valid)

In [None]:
# kf = KFold(n_splits=5, shuffle = True, random_state = 625)
# results = []
# n = 0
# data_loaders = []
# for train_index, valid_index in kf.split(X_tab_train):
    
#     n += 1

#     y_train_valid[train_index]
#     data_loader_train, data_loader_valid, data_loader_valid_test, data_loader_test = get_Dataloader(
#         X_tab_train, text_train, np.array(words_ids_train), y_train_valid, train_index, valid_index, b_size, 
#         X_tab_test, text_test, words_ids_test, y_test
#     )
    
#     data_loaders.append(data_loader_valid_test)
#     pt_path = "model_checkpoint/test"
#     pt_name = str(n)+"_lr="+str(lr)+"_b_size="+str(b_size)+"_agd="+str(agd)+"_dropout="+str(dropout)
#     logger = TensorBoardLogger(pt_path, name = pt_name)
#     model = Tabular_text_entity(use_res = True, use_transformer = True,vocab_len = 17372, pre_model = pre_model, dropout = dropout, weight_decay = weight_decay, lr = lr,column_idx=tab_preprocessor.column_idx ,embed_input=tab_preprocessor.embeddings_input)
#     trainer = get_trainer(agd, logger, epoch)
#     trainer.fit(model, data_loader_train, data_loader_valid)
#     torch.cuda.empty_cache()
#     break

In [None]:
import numpy as np
import seaborn as sns; sns.set_theme()
# Heat-map of the [0][0] slice of the model's stored attention weights.
# The global NumPy RNG is deliberately NOT re-seeded here: the notebook seeds it once in
# the first cell, and resetting it mid-notebook would change every later random draw.
# NOTE(review): no active cell above defines a notebook-level `model` (the training loops
# are commented out and get_model_result keeps its model local), so on a fresh
# Restart & Run All this line raises NameError — bind `model` explicitly first.
ax = sns.heatmap(model.attention_weights.cpu().detach().numpy()[0][0])

In [14]:
import os
def get_pt(root='/home/yx/肺部并发症预测/model_checkpoint/Tabular/',
           run_suffix='_lr=3e-05_b_size=1024_agd=1_dropout=0.5',
           n_folds=5,
           version='version_0'):
    """Collect the checkpoint files of every cross-validation fold, in fold order.

    For each fold ``i`` in ``1..n_folds`` the directory
    ``<root>/<i><run_suffix>/<version>/checkpoints`` is walked and every file found is
    added to the result. Called without arguments it looks in the same directories as
    before.

    Args:
        root: Directory that contains one sub-directory per fold.
            NOTE(review): the default is an absolute, machine-specific path — pass a
            project-relative path when running elsewhere.
        run_suffix: Run name without the leading fold number.
            NOTE(review): the default says ``dropout=0.5`` while the notebook's
            ``dropout`` setting is 0.6 — confirm which one the checkpoints belong to.
        n_folds: Number of folds to scan.
        version: Name of the logger version sub-directory.

    Returns:
        list[str]: Checkpoint paths ordered by fold, then by file name. A fold whose
        directory does not exist contributes nothing (os.walk yields no entries).
    """
    ckpts = []
    for fold in range(1, n_folds + 1):
        ckpt_dir = os.path.join(root, str(fold) + run_suffix, version, 'checkpoints')
        for filepath, dirnames, filenames in os.walk(ckpt_dir):
            # os.walk lists entries in arbitrary filesystem order; sort so the result
            # (and hence the checkpoint/loader pairing downstream) is reproducible.
            dirnames.sort()
            for filename in sorted(filenames):
                ckpts.append(os.path.join(filepath, filename))
    return ckpts

In [15]:
def get_predict(data_loader_test, model):
    """Run `model` over a data loader and collect class-1 scores and labels.

    Every sample of every batch is scored, rather than only the first sample of each
    batch. With a batch size of 1 the result is unchanged.

    Args:
        data_loader_test: Iterable of batches; ``batch[0]`` is the model input tensor
            and ``batch[1]`` the label tensor.
        model: A torch module. It is moved to the GPU when CUDA is available (CPU
            otherwise) and switched to eval mode; both changes persist after the call.
            Assumes ``model(X)`` returns a tensor indexed as (sample, class) holding
            log-scores, as the exponentiation below implies — TODO confirm.

    Returns:
        tuple[list, list]: ``(P, Y)`` where ``P`` holds ``exp`` of the class-1 output
        for each sample and ``Y`` the matching labels, in loader order.
    """
    # Fall back to the CPU so evaluation also works on machines without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    P = []
    Y = []
    with torch.no_grad():
        for data in data_loader_test:
            X = data[0].to(device)
            # Keep every label in the batch, not just element 0.
            Y.extend(data[1].tolist())
            log_p = model(X)
            # Column 1 holds the positive-class output; exp() undoes the log.
            P.extend(torch.exp(log_p[:, 1]).tolist())
    return P, Y

In [16]:
# Collect the saved checkpoint paths for all folds (see get_pt for the directory layout).
ckpts = get_pt()

In [18]:
def get_model_result(ckpts, data_loaders):
    """Evaluate each checkpoint on its own fold's loader and summarise the metrics.

    ``ckpts[i]`` is paired with ``data_loaders[i]``. A ``Tabular`` model is restored
    from each checkpoint using the notebook-level ``dropout``, ``lr`` and
    ``tab_preprocessor`` settings, scored with ``get_predict``, and the per-fold
    metrics from ``get_metrics`` are combined by ``get_result``.

    Args:
        ckpts: Checkpoint paths, one per fold, in fold order.
        data_loaders: Evaluation loaders, one per fold, in the same order.

    Returns:
        Whatever ``get_result`` returns for the list of per-fold metrics
        (presumably a summary DataFrame — confirm in the helper).

    Raises:
        ValueError: If the number of checkpoints and loaders differ. zip() would
            silently truncate and pair checkpoints with the wrong fold's data.
    """
    ckpts = list(ckpts)
    data_loaders = list(data_loaders)
    if len(ckpts) != len(data_loaders):
        raise ValueError(
            "expected one checkpoint per data loader, got "
            f"{len(ckpts)} checkpoint(s) and {len(data_loaders)} loader(s)"
        )
    results = []
    for pt, data_loader in zip(ckpts, data_loaders):
        model = Tabular.load_from_checkpoint(pt, dropout = dropout, lr = lr, column_idx=tab_preprocessor.column_idx, embed_input=tab_preprocessor.embeddings_input)
        P, Y = get_predict(data_loader, model)
        results.append(get_metrics(Y, P))
    return get_result(results)

In [20]:
# Score every fold: ckpts[i] is evaluated on data_loaders[i], so both lists must be in
# the same fold order.
# ckpts = get_pt('/home/yx/3090/project/P_prediction/肺部并发症预测/model_checkpoint/')
df_result = get_model_result(ckpts, data_loaders)

precision:    0.60455 	 recall:    0.64602 	 f1:    0.62460 	 accuracy:    0.87343 	 aucprc:    0.61837 	 aucroc:    0.87769 	 NPV:    0.92997 	 Specificity:    0.91799 	 
precision:    0.49423 	 recall:    0.69930 	 f1:    0.57915 	 accuracy:    0.84286 	 aucprc:    0.57410 	 aucroc:    0.86350 	 NPV:    0.94023 	 Specificity:    0.86947 	 
precision:    0.49191 	 recall:    0.68934 	 f1:    0.57413 	 accuracy:    0.83747 	 aucprc:    0.54481 	 aucroc:    0.85090 	 NPV:    0.93623 	 Specificity:    0.86581 	 
precision:    0.50275 	 recall:    0.69192 	 f1:    0.58236 	 accuracy:    0.85832 	 aucprc:    0.55073 	 aucroc:    0.87552 	 NPV:    0.94502 	 Specificity:    0.88637 	 
precision:    0.59389 	 recall:    0.64916 	 f1:    0.62030 	 accuracy:    0.87986 	 aucprc:    0.60112 	 aucroc:    0.87875 	 NPV:    0.93629 	 Specificity:    0.92122 	 


In [21]:
# Show the cross-validation summary; in the output below, row labels 均值 / 置信区间-左 /
# 置信区间-右 read "mean" / "confidence interval, left" / "confidence interval, right".
df_result

Unnamed: 0,precision,recall,f1,accuracy,aucprc,aucroc,NPV,Specificity
均值,0.537467,0.675149,0.596106,0.858388,0.577827,0.86927,0.937547,0.892174
置信区间-左,0.487818,0.652845,0.574831,0.842191,0.550024,0.858801,0.932672,0.869173
置信区间-右,0.587117,0.697453,0.617382,0.874584,0.60563,0.879739,0.942421,0.915174


In [None]:
# Tag the summary rows with the model family and text setting.
# '无' means "none" — presumably marks that no text input was used for this run; confirm.
df_result['model'] = 'deep-learning'
df_result['text'] = '无'