In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter
import math

from itertools import product
from collections import OrderedDict, namedtuple
from easydl import clear_output
from IPython.display import display
import torch.utils.data as Data
#
from imblearn.over_sampling import SMOTE
from collections import Counter

from sklearn.metrics import f1_score, precision_score, recall_score,roc_auc_score
#
import time
import matplotlib.pyplot as plt

[easydl] tensorflow not available!


In [64]:
#SMO = SMOTE(random_state=666)

In [2]:
data = pd.read_csv('E:/hxf_prediction/data_mimic/hxf/mimic0710/hxf/SelfData/data/total_data_xinjiao_20210125.csv')
print('data', data.shape)
X, y = np.array(data.drop(labels=['label'], axis=1)), np.array(data['label'].apply(int))
print(Counter(y))
print(X.shape, y.shape)
X_unstructure, X_structure = X[:, 0:768], X[:, 768:]
print('文本数据', X_unstructure.shape)
print('数值数据', X_structure.shape)

data (1132, 816)
Counter({0: 876, 1: 256})
(1132, 815) (1132,)
文本数据 (1132, 768)
数值数据 (1132, 47)


In [3]:
#
SMO = SMOTE(random_state=666)
X_res,y_res = SMO.fit_resample(X,y)
print(Counter(y_res))
#

Counter({0: 876, 1: 876})


data = pd.read_csv('E:/hxf_prediction/data_mimic/hxf/mimic0710/hxf/SelfData/data/total_data_xinjiao_20210125.csv')
print('data', data.shape)

X, y = np.array(data.drop(labels=['label'], axis=1)), np.array(data['label'].apply(int))
print(Counter(y))
print(X.shape, y.shape)
X_unstructure, X_structure = X[:, 0:768], X[:, 768:]
print('文本数据', X_unstructure.shape)
print('数值数据', X_structure.shape)

In [4]:
class Network(nn.Module):
    def __init__(self, text_in_features, digital_in_features):
        super(Network, self).__init__()
        self.bn_text = nn.BatchNorm1d(text_in_features)
        self.bn_digital = nn.BatchNorm1d(digital_in_features)
        self.fc_text_1 = nn.Linear(in_features=text_in_features, out_features=128)
        self.fc_digital_1 = nn.Linear(in_features=digital_in_features, out_features=64)
        self.fc_connect_1 = nn.Linear(in_features=192, out_features=256)
        self.fc_connect_2 = nn.Linear(in_features=256, out_features=128)
        self.fc_connect_3 = nn.Linear(in_features=128, out_features=64)
        self.fc_connect_4 = nn.Linear(in_features=64, out_features=2)

    def forward(self, text_input, digital_input):
        text_input_bn = self.bn_text(text_input)
        text_1 = self.fc_text_1(text_input_bn)
        text_1 = F.softsign(text_1)
        
        digital_input_bn = self.bn_digital(digital_input)
        digital_1 = self.fc_digital_1(digital_input_bn)
        digital_1 = F.tanh(digital_1)

        t_d_connect = torch.cat([text_1, digital_1], 1)
        t_d_connect = F.tanh(self.fc_connect_1(t_d_connect))
        t_d_connect = F.tanh(self.fc_connect_2(t_d_connect))
        t_d_connect = F.tanh(self.fc_connect_3(t_d_connect))
        t_d_connect = nn.Dropout(p=0.5)(t_d_connect)
        t_d_connect = self.fc_connect_4(t_d_connect)
        return t_d_connect

In [5]:
import pandas as pd
import numpy as np
import os
import json
import datetime

def save_result(model, run_data):
    """
    运行结果保存
        默认文件路径 ./run_data
        默认模型路径 ./model
    :param model: 模型
    :param run_data: 运行数据
    """
    result_dir = './run_data'
    model_dir = './model'
    name = 'result'
    
    if not os.path.exists(result_dir):
        os.mkdir(result_dir)
        
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    
    time_index = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # 保存运行文件
    run_data_path = os.path.join(result_dir, name)
    pd.DataFrame(run_data).to_csv(f'{run_data_path}_{time_index}.csv', index=False)
    with open(f'{run_data_path}_{time_index}.json', 'w', encoding='utf-8') as f:
        json.dump(run_data, f, ensure_ascii=False, indent=4)
    
    # 保存运行模型
    model_path = os.path.join(model_dir, name)
    torch.save(model.state_dict(), f'{model_path}_{time_index}.pt')

In [6]:
class RunBuilder:
    @staticmethod
    def get_run(params):  # 静态方法，不需要实例化

        Run = namedtuple('Run', params.keys())
        runs = []
        for v in product(*params.values()):
            runs.append(Run(*v))

        return runs

In [7]:
train_params = OrderedDict(
    lr = [.01, .001],
    batch_size = [50, 100],
    shuffle = [False],
    device = ['cuda'],
    num_workers = [1]  # 有多少子进程被用来加载数据 默认为0即在主进程中加载数据 可以利用多核CPU的特点指定num_workers个数 提前将数据加载到内存中
)

test_params = OrderedDict(
    lr = [np.nan],
    batch_size = [50],
    shuffle = [False],
    device = ['cuda'],
    num_workers = [1]  # 有多少子进程被用来加载数据 默认为0即在主进程中加载数据 可以利用多核CPU的特点指定num_workers个数 提前将数据加载到内存中
)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=np.random.seed())
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1226, 815) (526, 815) (1226,) (526,)


In [8]:
run_count = 0
run_data = []

model = None
hightest_accuracy = 0

test_run = next(iter(RunBuilder.get_run(test_params)))

for run in RunBuilder.get_run(train_params):
    
    device = torch.device(run.device)
    network = Network(X_unstructure.shape[1], X_structure.shape[1]).to(device)
    train_loader = Data.DataLoader(
        Data.TensorDataset(torch.tensor(X_train).to(torch.float32), torch.tensor(y_train)),
        batch_size=run.batch_size,
        num_workers=run.num_workers,
        shuffle=run.shuffle
    )
    test_loader = Data.DataLoader(
        Data.TensorDataset(torch.tensor(X_test).to(torch.float32), torch.tensor(y_test)),
        batch_size=test_run.batch_size,
        num_workers=test_run.num_workers,
        shuffle=test_run.shuffle
    )
    optimizer = optim.Adam(network.parameters(), lr=run.lr)
    
    run_start_time = time.time()
    run_count += 1
    epoch_count = 0
    test_epoch_count = 0
    tb = SummaryWriter(comment=f'-{run}')

    for epoch in range(60):
        
        epoch_start_time = time.time()
        epoch_count += 1
        epoch_loss = 0
        epoch_correct_num = 0
        epoch_precision = 0
        epoch_f1 = 0
        for batch in train_loader:
            
            X_batch_train = batch[0].to(device)
            labels_train = batch[1].to(device)
            X_text_train, X_digital_train = X_batch_train[:, :768], X_batch_train[:, 768:]
            preds = network(X_text_train, X_digital_train)     # 前向传播 根据权重参数进行预测 
            loss = F.cross_entropy(preds, labels_train)  # 计算损失 构建计算图

            optimizer.zero_grad()                  # pytorch会积累梯度 在优化每个batch的权重的梯度之前将之前权重的梯度置为0
            loss.backward()                        # 在最后一个张量上调用反向传播方法 在计算图中计算权重梯度
            optimizer.step()                       # 使用预先设置的learning_rate的梯度来更新权重参数

            epoch_loss += loss.item() * train_loader.batch_size
            epoch_correct_num += preds.argmax(dim=1).eq(labels_train).sum().item()
            epoch_f1 += f1_score(labels_train.to('cpu'), preds.argmax(dim=1).to('cpu'))
            epoch_recall = recall_score(labels_train, preds.argmax(dim=1))
            epoch_precision += precision_score(labels_train.to('cpu'), preds.argmax(dim=1).to('cpu'))
            epoch_rocauc = roc_auc_score(labels_train, preds.argmax(dim=1))
            
        epoch_duration = time.time() - epoch_start_time
        run_duration = time.time() - run_start_time
        
        loss = epoch_loss / len(train_loader.dataset)
        accuracy = epoch_correct_num / len(train_loader.dataset)
        f1 =  epoch_f1 / math.ceil(len(train_loader.dataset)/run.batch_size)
        #recall = epoch_f1 / len(train_loader.dataset)
        precision = epoch_precision / math.ceil(len(train_loader.dataset)/run.batch_size)
        #rocauc = epoch_rocauc / len(train_loader.dataset)
        
        tb.add_scalar('Train Loss', loss, epoch_count)
        tb.add_scalar('Train Accuracy', accuracy, epoch_count)
        
        for name, param in network.named_parameters():  # 将network中的每一层参数都存入tensorboard 
            tb.add_histogram(name, param, epoch_count)
            tb.add_histogram(f'{name}.grad', param.grad, epoch_count)
        
        # 保存训练参数
        results = OrderedDict()
        results['current'] = 'Train' 
        results['run'] = run_count
        results['epoch'] = epoch_count
        results['loss'] = loss
        results['f1'] = f1
        #results['recall'] = recall
        results['precision'] = precision
        #results['rocauc'] = recall
        results['accuracy'] = accuracy
        results['epoch duration'] = epoch_duration
        results['run duration'] = run_duration
        for k, v in run._asdict().items():
            results[k] = v
        run_data.append(results)
        
        clear_output()                   # 清除输出
        display(pd.DataFrame(run_data))  # 输出
        
        #  对测试集进行预测
        if epoch_count % 5 == 0:
            test_epoch_start_time = time.time()
            test_epoch_count += 1
            test_epoch_loss = 0
            test_epoch_correct_num = 0
            test_epoch_precision = 0
            test_epoch_f1 = 0
            for batch in test_loader:
                
                X_batch_test = batch[0].to(device)
                labels_test = batch[1].to(device)
                X_text_test, X_digital_test = X_batch_test[:, :768], X_batch_test[:, 768:]
                preds = network(X_text_test, X_digital_test)          # 前向传播 根据权重参数进行预测 
                test_loss = F.cross_entropy(preds, labels_test)  # 计算损失 构建计算图

                test_epoch_loss += test_loss.item() * test_loader.batch_size
                test_epoch_correct_num += preds.argmax(dim=1).eq(labels_test).sum().item()
                test_epoch_precision += precision_score(labels_test.to('cpu'), preds.argmax(dim=1).to('cpu'))
                test_epoch_f1 += f1_score(labels_test.to('cpu'), preds.argmax(dim=1).to('cpu'))
            test_epoch_duration = time.time() - test_epoch_start_time
            test_run_duration = time.time() - run_start_time

            test_loss = test_epoch_loss / len(test_loader.dataset)
            test_accuracy = test_epoch_correct_num / len(test_loader.dataset)
            precision = test_epoch_precision / math.ceil(len(train_loader.dataset)/run.batch_size)
            f1 =  test_epoch_f1 / math.ceil(len(train_loader.dataset)/run.batch_size)

            tb.add_scalar('Test Loss', test_loss, test_epoch_count)
            tb.add_scalar('Test Accuracy', test_accuracy, test_epoch_count)

            results = OrderedDict()
            results['current'] = 'Test' 
            results['run'] = run_count
            results['epoch'] = test_epoch_count
            results['precision'] = precision
            results['f1'] = f1
            results['loss'] = test_loss
            results['accuracy'] = test_accuracy
            results['epoch duration'] = test_epoch_duration
            results['run duration'] = test_run_duration
            for k, v in test_run._asdict().items():
                results[k] = v
            run_data.append(results)

            clear_output()
            display(pd.DataFrame(run_data))
        
    if test_accuracy > hightest_accuracy:
        hightest_accuracy = test_accuracy
        model = network
    tb.close()
save_result(model, run_data)
print('测试集准确率: %s'% hightest_accuracy)
print(model)

Unnamed: 0,current,run,epoch,loss,f1,precision,accuracy,epoch duration,run duration,lr,batch_size,shuffle,device,num_workers
0,Train,1,1,0.767473,0.465796,0.490947,0.519576,7.659516,7.671487,0.010,50,False,cuda,1
1,Train,1,2,0.718011,0.507659,0.511024,0.517945,0.669183,8.683779,0.010,50,False,cuda,1
2,Train,1,3,0.694284,0.576045,0.593379,0.592170,0.673200,9.545476,0.010,50,False,cuda,1
3,Train,1,4,0.674099,0.613962,0.642423,0.635400,0.656271,10.396228,0.010,50,False,cuda,1
4,Train,1,5,0.605749,0.688430,0.691426,0.696574,0.666219,11.260889,0.010,50,False,cuda,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,Train,4,57,0.000184,1.000000,1.000000,1.000000,0.578427,50.636606,0.001,100,False,cuda,1
284,Train,4,58,0.000168,1.000000,1.000000,1.000000,0.571501,51.378650,0.001,100,False,cuda,1
285,Train,4,59,0.000168,1.000000,1.000000,1.000000,0.578454,52.136595,0.001,100,False,cuda,1
286,Train,4,60,0.000171,1.000000,1.000000,1.000000,0.634327,52.951441,0.001,100,False,cuda,1


测试集准确率: 0.8231939163498099
Network(
  (bn_text): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_digital): BatchNorm1d(47, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc_text_1): Linear(in_features=768, out_features=128, bias=True)
  (fc_digital_1): Linear(in_features=47, out_features=64, bias=True)
  (fc_connect_1): Linear(in_features=192, out_features=256, bias=True)
  (fc_connect_2): Linear(in_features=256, out_features=128, bias=True)
  (fc_connect_3): Linear(in_features=128, out_features=64, bias=True)
  (fc_connect_4): Linear(in_features=64, out_features=2, bias=True)
)
