In [1]:
import pandas as pd
import numpy as np
import math
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter

from itertools import product
from collections import OrderedDict, namedtuple
from easydl import clear_output
from IPython.display import display
import torch.utils.data as Data

import time
import matplotlib.pyplot as plt

[easydl] tensorflow not available!


In [2]:
data = pd.read_csv('E:/hxf_prediction/data_mimic/hxf/mimic0710/hxf/SelfData/data/total_data_xinjiao_20210125.csv', encoding='utf-8')
print('原数据size', data.shape)
X, y = np.array(data.drop(labels=['label'], axis=1)), np.array(data['label'].apply(int))
print('原数据X_size:%s, y_size:%s' % (X.shape, y.shape))
print('原数据y分布', Counter(y))
X_text, X_digital = X[:, 0:768], X[:, 805:]
flag = 'feature_5'
print('文本数据', X_text.shape)
print('数值数据', X_digital.shape)
X_sub = np.hstack((X_text, X_digital))
print('分割后X_size:%s, y_size:%s' % (X_sub.shape, y.shape))

原数据size (1132, 816)
原数据X_size:(1132, 815), y_size:(1132,)
原数据y分布 Counter({0: 876, 1: 256})
文本数据 (1132, 768)
数值数据 (1132, 10)
分割后X_size:(1132, 778), y_size:(1132,)


In [3]:
SMO = SMOTE(random_state=666)
X_res,y_res = SMO.fit_resample(X_sub,y)
print('插值后y分布', Counter(y_res))

插值后y分布 Counter({0: 876, 1: 876})


In [4]:
class Network(nn.Module):
    def __init__(self, text_in_features, digital_in_features):
        super(Network, self).__init__()
        self.bn_text = nn.BatchNorm1d(text_in_features)
        self.bn_digital = nn.BatchNorm1d(digital_in_features)
        self.fc_text_1 = nn.Linear(in_features=text_in_features, out_features=128)
        self.fc_digital_1 = nn.Linear(in_features=digital_in_features, out_features=64)
        self.fc_connect_1 = nn.Linear(in_features=192, out_features=256)
        self.fc_connect_2 = nn.Linear(in_features=256, out_features=128)
        self.fc_connect_3 = nn.Linear(in_features=128, out_features=64)
        self.fc_connect_4 = nn.Linear(in_features=64, out_features=2)

    def forward(self, text_input, digital_input):
        text_input_bn = self.bn_text(text_input)
        text_1 = self.fc_text_1(text_input_bn)
        text_1 = F.softsign(text_1)
        
        digital_input_bn = self.bn_digital(digital_input)
        digital_1 = self.fc_digital_1(digital_input_bn)
        digital_1 = F.tanh(digital_1)

        t_d_connect = torch.cat([text_1, digital_1], 1)
        t_d_connect = F.tanh(self.fc_connect_1(t_d_connect))
        t_d_connect = F.tanh(self.fc_connect_2(t_d_connect))
        t_d_connect = F.tanh(self.fc_connect_3(t_d_connect))
        t_d_connect = nn.Dropout(p=0.5)(t_d_connect)
        t_d_connect = self.fc_connect_4(t_d_connect)
        return t_d_connect

In [5]:
import pandas as pd
import numpy as np
import os
import json
import datetime

def save_result(model, run_data):
    """
    运行结果保存
        默认文件路径 ./run_data
        默认模型路径 ./model
    :param model: 模型
    :param run_data: 运行数据
    """
    result_dir = './run_data'
    model_dir = './model'
    name = 'result'
    
    if not os.path.exists(result_dir):
        os.mkdir(result_dir)
        
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    
    time_index = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # 保存运行文件
    run_data_path = os.path.join(result_dir, name)
    pd.DataFrame(run_data).to_csv(f'{run_data_path}_{time_index}.csv', index=False)
    with open(f'{run_data_path}_{time_index}.json', 'w', encoding='utf-8') as f:
        json.dump(run_data, f, ensure_ascii=False, indent=4)
    
    # 保存运行模型
    model_path = os.path.join(model_dir, name)
    torch.save(model.state_dict(), f'{model_path}_{time_index}.pt')

In [6]:
class RunBuilder:
    @staticmethod
    def get_run(params):  # 静态方法，不需要实例化

        Run = namedtuple('Run', params.keys())
        runs = []
        for v in product(*params.values()):
            runs.append(Run(*v))

        return runs

In [7]:
train_params = OrderedDict(
    lr = [.01, .001],
    batch_size = [50, 100],
    shuffle = [False]
#     device = ['cuda'],
#     num_workers = [1]  # 有多少子进程被用来加载数据 默认为0即在主进程中加载数据 可以利用多核CPU的特点指定num_workers个数 提前将数据加载到内存中
)

test_params = OrderedDict(
    lr = [np.nan],
    batch_size = [50],
    shuffle = [False]
#     device = ['cuda'],
#     num_workers = [1]  # 有多少子进程被用来加载数据 默认为0即在主进程中加载数据 可以利用多核CPU的特点指定num_workers个数 提前将数据加载到内存中
)
num_workers = 1
device = 'cuda'

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=np.random.seed())
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1226, 778) (526, 778) (1226,) (526,)


In [8]:
run_count = 0
run_data = []

model = None
hightest_accuracy = 0

test_run = next(iter(RunBuilder.get_run(test_params)))

for run in RunBuilder.get_run(train_params):
    
    device = torch.device(device)
    network = Network(X_text.shape[1], X_digital.shape[1]).to(device)
    train_loader = Data.DataLoader(
        Data.TensorDataset(torch.tensor(X_train).to(torch.float32), torch.tensor(y_train)),
        batch_size=run.batch_size,
        num_workers=num_workers,
        shuffle=run.shuffle
    )
    test_loader = Data.DataLoader(
        Data.TensorDataset(torch.tensor(X_test).to(torch.float32), torch.tensor(y_test)),
        batch_size=test_run.batch_size,
        num_workers=num_workers,
        shuffle=test_run.shuffle
    )
    optimizer = optim.Adam(network.parameters(), lr=run.lr)
    
    run_start_time = time.time()
    run_count += 1
    epoch_count = 0
    test_epoch_count = 0
    tb = SummaryWriter(comment=f'-{run}-{flag}')

    for epoch in range(20):
        
        epoch_start_time = time.time()
        epoch_count += 1
        epoch_loss = 0
        epoch_correct_num = 0
        epoch_precision_score = 0
        epoch_recall_score = 0
        epoch_f1_score = 0
        epoch_auc_score = 0
        for batch in train_loader:
            
            X_batch_train = batch[0].to(device)
            labels_train = batch[1].to(device)
            X_text_train, X_digital_train = X_batch_train[:, :768], X_batch_train[:, 768:]
            preds = network(X_text_train, X_digital_train)     # 前向传播 根据权重参数进行预测 
            loss = F.cross_entropy(preds, labels_train)  # 计算损失 构建计算图

            optimizer.zero_grad()                  # pytorch会积累梯度 在优化每个batch的权重的梯度之前将之前权重的梯度置为0
            loss.backward()                        # 在最后一个张量上调用反向传播方法 在计算图中计算权重梯度
            optimizer.step()                       # 使用预先设置的learning_rate的梯度来更新权重参数

            epoch_loss += loss.item() * train_loader.batch_size
            epoch_correct_num += preds.argmax(dim=1).eq(labels_train).sum().item()
            epoch_precision_score += precision_score(labels_train.to('cpu'), preds.argmax(dim=1).to('cpu'))
            epoch_recall_score += recall_score(labels_train.to('cpu'), preds.argmax(dim=1).to('cpu'))
            epoch_f1_score += f1_score(labels_train.to('cpu'), preds.argmax(dim=1).to('cpu'))
            epoch_auc_score += roc_auc_score(labels_train.to('cpu'), preds.argmax(dim=1).to('cpu'))

        epoch_duration = time.time() - epoch_start_time
        run_duration = time.time() - run_start_time
        
        loss = epoch_loss / len(train_loader.dataset)
        accuracy = epoch_correct_num / len(train_loader.dataset)
        precision = epoch_precision_score / math.ceil(len(train_loader.dataset) / run.batch_size)
        recall = epoch_recall_score / math.ceil(len(train_loader.dataset) / run.batch_size)
        f1 = epoch_f1_score / math.ceil(len(train_loader.dataset) / run.batch_size)
        auc = epoch_auc_score / math.ceil(len(train_loader.dataset) / run.batch_size)
    
        tb.add_scalar('Train Loss', loss, epoch_count)
        tb.add_scalar('Train Accuracy', accuracy, epoch_count)
        tb.add_scalar('Train Precision', precision, epoch_count)
        tb.add_scalar('Train Recall', recall, epoch_count)
        tb.add_scalar('Train F1', f1, epoch_count)
        tb.add_scalar('Train AUC', auc, epoch_count)
        
        for name, param in network.named_parameters():  # 将network中的每一层参数都存入tensorboard 
            tb.add_histogram(name, param, epoch_count)
            tb.add_histogram(f'{name}.grad', param.grad, epoch_count)
        
        # 保存训练参数
        results = OrderedDict()
        results['flag'] = flag
        results['current'] = 'Train' 
        results['run'] = run_count
        results['epoch'] = epoch_count
        results['loss'] = loss
        results['accuracy'] = accuracy
        results['precision'] = precision
        results['recall'] = recall
        results['f1'] = f1
        results['auc'] = auc
        results['epoch_duration'] = epoch_duration
        results['run_duration'] = run_duration
        for k, v in run._asdict().items():
            results[k] = v
        run_data.append(results)
        
        clear_output()                   # 清除输出
        display(pd.DataFrame(run_data))  # 输出
        
        #  对测试集进行预测
        if epoch_count % 4 == 0:
            test_epoch_start_time = time.time()
            test_epoch_count += 1
            test_epoch_loss = 0
            test_epoch_correct_num = 0
            test_epoch_precision_score = 0
            test_epoch_recall_score = 0
            test_epoch_f1_score = 0
            test_epoch_auc_score = 0
            for batch in test_loader:
                
                X_batch_test = batch[0].to(device)
                labels_test = batch[1].to(device)
                X_text_test, X_digital_test = X_batch_test[:, :768], X_batch_test[:, 768:]
                preds = network(X_text_test, X_digital_test)          # 前向传播 根据权重参数进行预测 
                test_loss = F.cross_entropy(preds, labels_test)  # 计算损失 构建计算图

                test_epoch_loss += test_loss.item() * test_loader.batch_size
                test_epoch_correct_num += preds.argmax(dim=1).eq(labels_test).sum().item()
                test_epoch_precision_score += precision_score(labels_test.to('cpu'), preds.argmax(dim=1).to('cpu'))
                test_epoch_recall_score += recall_score(labels_test.to('cpu'), preds.argmax(dim=1).to('cpu'))
                test_epoch_f1_score += f1_score(labels_test.to('cpu'), preds.argmax(dim=1).to('cpu'))
                test_epoch_auc_score += roc_auc_score(labels_test.to('cpu'), preds.argmax(dim=1).to('cpu'))

            test_epoch_duration = time.time() - test_epoch_start_time
            test_run_duration = time.time() - run_start_time

            test_loss = test_epoch_loss / len(test_loader.dataset)
            test_accuracy = test_epoch_correct_num / len(test_loader.dataset)
            test_precision = test_epoch_precision_score / math.ceil(len(test_loader.dataset) / test_run.batch_size)
            test_recall = test_epoch_recall_score / math.ceil(len(test_loader.dataset) / test_run.batch_size)
            test_f1 = test_epoch_f1_score / math.ceil(len(test_loader.dataset) / test_run.batch_size)
            test_auc = test_epoch_auc_score / math.ceil(len(test_loader.dataset) / test_run.batch_size)

            tb.add_scalar('Test Loss', test_loss, test_epoch_count)
            tb.add_scalar('Test Accuracy', test_accuracy, test_epoch_count)
            tb.add_scalar('Test Precision', test_precision, test_epoch_count)
            tb.add_scalar('Test Recall', test_recall, test_epoch_count)
            tb.add_scalar('Test F1', test_f1, test_epoch_count)
            tb.add_scalar('Test AUC', test_auc, test_epoch_count)

            results = OrderedDict()
            results['flag'] = flag
            results['current'] = 'Test' 
            results['run'] = run_count
            results['epoch'] = test_epoch_count
            results['loss'] = test_loss
            results['accuracy'] = test_accuracy
            results['precision'] = test_precision
            results['recall'] = test_recall
            results['f1'] = test_f1
            results['auc'] = test_auc
            results['epoch_duration'] = test_epoch_duration
            results['run_duration'] = test_run_duration
            for k, v in test_run._asdict().items():
                if k == 'lr': v = run.lr
                if k == 'batch_size': v = str(run.batch_size) + '-' + str(v)
                results[k] = v
            run_data.append(results)

            clear_output()
            display(pd.DataFrame(run_data))
        
    if test_accuracy > hightest_accuracy:
        hightest_accuracy = test_accuracy
        model = network
    tb.close()
save_result(model, run_data)
print(model)

Unnamed: 0,flag,current,run,epoch,loss,accuracy,precision,recall,f1,auc,epoch_duration,run_duration,lr,batch_size,shuffle
0,feature_5,Train,1,1,0.757145,0.491843,0.497676,0.528847,0.502497,0.488002,8.442451,8.468382,0.010,50,False
1,feature_5,Train,1,2,0.717341,0.531811,0.539937,0.563606,0.542589,0.536895,0.711098,9.604316,0.010,50,False
2,feature_5,Train,1,3,0.699180,0.570147,0.572026,0.609985,0.582605,0.573741,1.050193,10.842008,0.010,50,False
3,feature_5,Train,1,4,0.714798,0.576672,0.578434,0.624680,0.594105,0.578917,0.850754,11.884248,0.010,50,False
4,feature_5,Test,1,1,0.724999,0.566540,0.541953,0.569294,0.551174,0.570118,0.714062,12.792791,0.010,50-50,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,feature_5,Train,4,17,0.109815,0.962480,0.954530,0.968461,0.961206,0.962539,0.607348,16.632527,0.001,100,False
96,feature_5,Train,4,18,0.041969,0.984502,0.987466,0.985556,0.986336,0.985014,0.709105,17.558052,0.001,100,False
97,feature_5,Train,4,19,0.048390,0.984502,0.987265,0.985863,0.986397,0.985297,0.602416,18.345972,0.001,100,False
98,feature_5,Train,4,20,0.056569,0.980424,0.980092,0.983322,0.981591,0.981391,0.621339,19.152788,0.001,100,False


Network(
  (bn_text): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_digital): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc_text_1): Linear(in_features=768, out_features=128, bias=True)
  (fc_digital_1): Linear(in_features=10, out_features=64, bias=True)
  (fc_connect_1): Linear(in_features=192, out_features=256, bias=True)
  (fc_connect_2): Linear(in_features=256, out_features=128, bias=True)
  (fc_connect_3): Linear(in_features=128, out_features=64, bias=True)
  (fc_connect_4): Linear(in_features=64, out_features=2, bias=True)
)


In [9]:
run_data_df = pd.DataFrame(run_data)
run_data_df_evaluate = run_data_df.iloc[
    [
        run_data_df[run_data_df['current'] == 'Test']['accuracy'].sort_values(ascending=False).index[0],
        run_data_df[run_data_df['current'] == 'Test']['precision'].sort_values(ascending=False).index[0],
        run_data_df[run_data_df['current'] == 'Test']['recall'].sort_values(ascending=False).index[0],
        run_data_df[run_data_df['current'] == 'Test']['f1'].sort_values(ascending=False).index[0],
        run_data_df[run_data_df['current'] == 'Test']['auc'].sort_values(ascending=False).index[0]
    ],
    :
]
run_data_df_evaluate

Unnamed: 0,flag,current,run,epoch,loss,accuracy,precision,recall,f1,auc,epoch_duration,run_duration,lr,batch_size,shuffle
99,feature_5,Test,4,5,0.934041,0.802281,0.732325,0.910106,0.811024,0.804067,0.561498,19.892809,0.001,100-50,False
44,feature_5,Test,2,4,0.550302,0.790875,0.757163,0.824792,0.787465,0.793196,0.5984,16.626544,0.01,100-50,False
69,feature_5,Test,3,4,0.920927,0.790875,0.720169,0.927141,0.808313,0.799335,0.630315,17.105262,0.001,50-50,False
99,feature_5,Test,4,5,0.934041,0.802281,0.732325,0.910106,0.811024,0.804067,0.561498,19.892809,0.001,100-50,False
74,feature_5,Test,3,5,0.841099,0.802281,0.736393,0.905665,0.81022,0.804953,0.563524,21.270156,0.001,50-50,False


In [10]:
results = pd.read_csv('./results.csv')
results = pd.concat([results, run_data_df_evaluate])
results.to_csv('./results.csv', index=False)