In [10]:
import pandas as pd
import numpy as np
import math
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter

from itertools import product
from collections import OrderedDict, namedtuple
from easydl import clear_output
from IPython.display import display
import torch.utils.data as Data

import time
import matplotlib.pyplot as plt

In [29]:
data = pd.read_csv('./total_data_xinjiao_20210125.csv', encoding='gbk')
print('原数据size', data.shape)
X, y = np.array(data.drop(labels=['label'], axis=1)), np.array(data['label'].apply(int))
print('原数据X_size:%s, y_size:%s' % (X.shape, y.shape))
print('原数据y分布', Counter(y))
X_text, X_digital = X[:, 0:768], X[:, 768:]
print('文本数据', X_text.shape)
print('数值数据', X_digital.shape)
X_sub = np.hstack((X_text, X_digital))
print('分割后X_size:%s, y_size:%s' % (X_sub.shape, y.shape))

原数据size (1132, 816)
原数据X_size:(1132, 815), y_size:(1132,)
原数据y分布 Counter({0: 876, 1: 256})
文本数据 (1132, 768)
数值数据 (1132, 47)
分割后X_size:(1132, 815), y_size:(1132,)


In [30]:
SMO = SMOTE(random_state=666)
X_res,y_res = SMO.fit_resample(X_sub,y)
print('插值后y分布', Counter(y_res))

插值后y分布 Counter({0: 876, 1: 876})


In [31]:
class Network(nn.Module):
    def __init__(self, text_in_features, digital_in_features):
        super(Network, self).__init__()
        self.bn_text = nn.BatchNorm1d(text_in_features)
        self.bn_digital = nn.BatchNorm1d(digital_in_features)
        self.fc_text_1 = nn.Linear(in_features=text_in_features, out_features=128)
        self.fc_digital_1 = nn.Linear(in_features=digital_in_features, out_features=64)
        self.fc_connect_1 = nn.Linear(in_features=192, out_features=256)
        self.fc_connect_2 = nn.Linear(in_features=256, out_features=128)
        self.fc_connect_3 = nn.Linear(in_features=128, out_features=64)
        self.fc_connect_4 = nn.Linear(in_features=64, out_features=2)

    def forward(self, text_input, digital_input):
        text_input_bn = self.bn_text(text_input)
        text_1 = self.fc_text_1(text_input_bn)
        text_1 = F.softsign(text_1)
        
        digital_input_bn = self.bn_digital(digital_input)
        digital_1 = self.fc_digital_1(digital_input_bn)
        digital_1 = F.tanh(digital_1)

        t_d_connect = torch.cat([text_1, digital_1], 1)
        t_d_connect = F.tanh(self.fc_connect_1(t_d_connect))
        t_d_connect = F.tanh(self.fc_connect_2(t_d_connect))
        t_d_connect = F.tanh(self.fc_connect_3(t_d_connect))
        t_d_connect = nn.Dropout(p=0.5)(t_d_connect)
        t_d_connect = self.fc_connect_4(t_d_connect)
        return t_d_connect

In [32]:
import pandas as pd
import numpy as np
import os
import json
import datetime

def save_result(model, run_data):
    """
    运行结果保存
        默认文件路径 ./run_data
        默认模型路径 ./model
    :param model: 模型
    :param run_data: 运行数据
    """
    result_dir = './run_data'
    model_dir = './model'
    name = 'result'
    
    if not os.path.exists(result_dir):
        os.mkdir(result_dir)
        
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    
    time_index = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # 保存运行文件
    run_data_path = os.path.join(result_dir, name)
    pd.DataFrame(run_data).to_csv(f'{run_data_path}_{time_index}.csv', index=False)
    with open(f'{run_data_path}_{time_index}.json', 'w', encoding='utf-8') as f:
        json.dump(run_data, f, ensure_ascii=False, indent=4)
    
    # 保存运行模型
    model_path = os.path.join(model_dir, name)
    torch.save(model.state_dict(), f'{model_path}_{time_index}.pt')

In [33]:
class RunBuilder:
    @staticmethod
    def get_run(params):  # 静态方法，不需要实例化

        Run = namedtuple('Run', params.keys())
        runs = []
        for v in product(*params.values()):
            runs.append(Run(*v))

        return runs

In [34]:
train_params = OrderedDict(
    lr = [.01, .001],
    batch_size = [50, 100],
    shuffle = [False],
    device = ['cuda'],
    num_workers = [1]  # 有多少子进程被用来加载数据 默认为0即在主进程中加载数据 可以利用多核CPU的特点指定num_workers个数 提前将数据加载到内存中
)

test_params = OrderedDict(
    lr = [np.nan],
    batch_size = [50],
    shuffle = [False],
    device = ['cuda'],
    num_workers = [1]  # 有多少子进程被用来加载数据 默认为0即在主进程中加载数据 可以利用多核CPU的特点指定num_workers个数 提前将数据加载到内存中
)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=np.random.seed())
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1226, 815) (526, 815) (1226,) (526,)


In [35]:
run_count = 0
run_data = []

model = None
hightest_accuracy = 0

test_run = next(iter(RunBuilder.get_run(test_params)))

for run in RunBuilder.get_run(train_params):
    
    device = torch.device(run.device)
    network = Network(X_text.shape[1], X_digital.shape[1]).to(device)
    train_loader = Data.DataLoader(
        Data.TensorDataset(torch.tensor(X_train).to(torch.float32), torch.tensor(y_train)),
        batch_size=run.batch_size,
        num_workers=run.num_workers,
        shuffle=run.shuffle
    )
    test_loader = Data.DataLoader(
        Data.TensorDataset(torch.tensor(X_test).to(torch.float32), torch.tensor(y_test)),
        batch_size=test_run.batch_size,
        num_workers=test_run.num_workers,
        shuffle=test_run.shuffle
    )
    optimizer = optim.Adam(network.parameters(), lr=run.lr)
    
    run_start_time = time.time()
    run_count += 1
    epoch_count = 0
    test_epoch_count = 0
    tb = SummaryWriter(comment=f'-{run}')

    for epoch in range(10):
        
        epoch_start_time = time.time()
        epoch_count += 1
        epoch_loss = 0
        epoch_correct_num = 0
        epoch_precision_score = 0
        epoch_recall_score = 0
        epoch_f1_score = 0
        epoch_auc_score = 0
        for batch in train_loader:
            
            X_batch_train = batch[0].to(device)
            labels_train = batch[1].to(device)
            X_text_train, X_digital_train = X_batch_train[:, :768], X_batch_train[:, 768:]
            preds = network(X_text_train, X_digital_train)     # 前向传播 根据权重参数进行预测 
            loss = F.cross_entropy(preds, labels_train)  # 计算损失 构建计算图

            optimizer.zero_grad()                  # pytorch会积累梯度 在优化每个batch的权重的梯度之前将之前权重的梯度置为0
            loss.backward()                        # 在最后一个张量上调用反向传播方法 在计算图中计算权重梯度
            optimizer.step()                       # 使用预先设置的learning_rate的梯度来更新权重参数

            epoch_loss += loss.item() * train_loader.batch_size
            epoch_correct_num += preds.argmax(dim=1).eq(labels_train).sum().item()
            epoch_precision_score += precision_score(labels_train.to('cpu'), preds.argmax(dim=1).to('cpu'))
            epoch_recall_score += recall_score(labels_train.to('cpu'), preds.argmax(dim=1).to('cpu'))
            epoch_f1_score += f1_score(labels_train.to('cpu'), preds.argmax(dim=1).to('cpu'))
            epoch_auc_score += roc_auc_score(labels_train.to('cpu'), preds.argmax(dim=1).to('cpu'))

        epoch_duration = time.time() - epoch_start_time
        run_duration = time.time() - run_start_time
        
        loss = epoch_loss / len(train_loader.dataset)
        accuracy = epoch_correct_num / len(train_loader.dataset)
        precision = epoch_precision_score / math.ceil(len(train_loader.dataset) / run.batch_size)
        recall = epoch_recall_score / math.ceil(len(train_loader.dataset) / run.batch_size)
        f1 = epoch_f1_score / math.ceil(len(train_loader.dataset) / run.batch_size)
        auc = epoch_auc_score / math.ceil(len(train_loader.dataset) / run.batch_size)
    
        tb.add_scalar('Train Loss', loss, epoch_count)
        tb.add_scalar('Train Accuracy', accuracy, epoch_count)
        tb.add_scalar('Train Precision', precision, epoch_count)
        tb.add_scalar('Train Recall', recall, epoch_count)
        tb.add_scalar('Train F1', f1, epoch_count)
        tb.add_scalar('Train AUC', auc, epoch_count)
        
        for name, param in network.named_parameters():  # 将network中的每一层参数都存入tensorboard 
            tb.add_histogram(name, param, epoch_count)
            tb.add_histogram(f'{name}.grad', param.grad, epoch_count)
        
        # 保存训练参数
        results = OrderedDict()
        results['current'] = 'Train' 
        results['run'] = run_count
        results['epoch'] = epoch_count
        results['loss'] = loss
        results['accuracy'] = accuracy
        results['precision'] = precision
        results['recall'] = recall
        results['f1'] = f1
        results['auc'] = auc
        results['epoch_duration'] = epoch_duration
        results['run_duration'] = run_duration
        for k, v in run._asdict().items():
            results[k] = v
        run_data.append(results)
        
        clear_output()                   # 清除输出
        display(pd.DataFrame(run_data))  # 输出
        
        #  对测试集进行预测
        if epoch_count % 5 == 0:
            test_epoch_start_time = time.time()
            test_epoch_count += 1
            test_epoch_loss = 0
            test_epoch_correct_num = 0
            test_epoch_precision_score = 0
            test_epoch_recall_score = 0
            test_epoch_f1_score = 0
            test_epoch_auc_score = 0
            for batch in test_loader:
                
                X_batch_test = batch[0].to(device)
                labels_test = batch[1].to(device)
                X_text_test, X_digital_test = X_batch_test[:, :768], X_batch_test[:, 768:]
                preds = network(X_text_test, X_digital_test)          # 前向传播 根据权重参数进行预测 
                test_loss = F.cross_entropy(preds, labels_test)  # 计算损失 构建计算图

                test_epoch_loss += test_loss.item() * test_loader.batch_size
                test_epoch_correct_num += preds.argmax(dim=1).eq(labels_test).sum().item()
                test_epoch_precision_score += precision_score(labels_test.to('cpu'), preds.argmax(dim=1).to('cpu'))
                test_epoch_recall_score += recall_score(labels_test.to('cpu'), preds.argmax(dim=1).to('cpu'))
                test_epoch_f1_score += f1_score(labels_test.to('cpu'), preds.argmax(dim=1).to('cpu'))
                test_epoch_auc_score += roc_auc_score(labels_test.to('cpu'), preds.argmax(dim=1).to('cpu'))

            test_epoch_duration = time.time() - test_epoch_start_time
            test_run_duration = time.time() - run_start_time

            test_loss = test_epoch_loss / len(test_loader.dataset)
            test_accuracy = test_epoch_correct_num / len(test_loader.dataset)
            test_precision = test_epoch_precision_score / math.ceil(len(test_loader.dataset) / test_run.batch_size)
            test_recall = test_epoch_recall_score / math.ceil(len(test_loader.dataset) / test_run.batch_size)
            test_f1 = test_epoch_f1_score / math.ceil(len(test_loader.dataset) / test_run.batch_size)
            test_auc = test_epoch_auc_score / math.ceil(len(test_loader.dataset) / test_run.batch_size)

            tb.add_scalar('Test Loss', test_loss, test_epoch_count)
            tb.add_scalar('Test Accuracy', test_accuracy, test_epoch_count)
            tb.add_scalar('Test Precision', test_precision, test_epoch_count)
            tb.add_scalar('Test Recall', test_recall, test_epoch_count)
            tb.add_scalar('Test F1', test_f1, test_epoch_count)
            tb.add_scalar('Test AUC', test_auc, test_epoch_count)

            results = OrderedDict()
            results['current'] = 'Test' 
            results['run'] = run_count
            results['epoch'] = test_epoch_count
            results['loss'] = test_loss
            results['accuracy'] = test_accuracy
            results['precision'] = test_precision
            results['recall'] = test_recall
            results['f1'] = test_f1
            results['auc'] = test_auc
            results['epoch_duration'] = test_epoch_duration
            results['run_duration'] = test_run_duration
            for k, v in test_run._asdict().items():
                if k == 'lr': v = run.lr
                results[k] = v
            run_data.append(results)

            clear_output()
            display(pd.DataFrame(run_data))
        
    if test_accuracy > hightest_accuracy:
        hightest_accuracy = test_accuracy
        model = network
    tb.close()
save_result(model, run_data)
print(model)

Unnamed: 0,current,run,epoch,loss,accuracy,precision,recall,f1,auc,epoch_duration,run_duration,lr,batch_size,shuffle,device,num_workers
0,Train,1,1,0.769638,0.508972,0.510535,0.537891,0.513002,0.512987,2.514282,2.608067,0.01,50,False,cuda,1
1,Train,1,2,0.721156,0.538336,0.535867,0.541875,0.535704,0.537411,0.731228,3.729005,0.01,50,False,cuda,1
2,Train,1,3,0.746929,0.544046,0.541017,0.537122,0.52779,0.548713,0.770299,4.686018,0.01,50,False,cuda,1
3,Train,1,4,0.679554,0.622349,0.620692,0.609339,0.607429,0.624394,0.746809,5.630275,0.01,50,False,cuda,1
4,Train,1,5,0.651983,0.673736,0.652925,0.744823,0.688383,0.674505,0.691796,6.544637,0.01,50,False,cuda,1
5,Test,1,1,0.736961,0.604563,0.607733,0.630048,0.616877,0.608744,0.575966,7.305112,0.01,50,False,cuda,1
6,Train,1,6,0.61097,0.696574,0.681147,0.743845,0.704912,0.701735,0.735094,8.107706,0.01,50,False,cuda,1
7,Train,1,7,0.624197,0.685971,0.660086,0.75394,0.697094,0.688172,0.691899,8.992023,0.01,50,False,cuda,1
8,Train,1,8,0.555547,0.757749,0.719216,0.844485,0.771006,0.760249,0.733862,9.927741,0.01,50,False,cuda,1
9,Train,1,9,0.508206,0.768352,0.731862,0.843123,0.778908,0.770899,0.716938,10.870217,0.01,50,False,cuda,1


Network(
  (bn_text): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_digital): BatchNorm1d(47, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc_text_1): Linear(in_features=768, out_features=128, bias=True)
  (fc_digital_1): Linear(in_features=47, out_features=64, bias=True)
  (fc_connect_1): Linear(in_features=192, out_features=256, bias=True)
  (fc_connect_2): Linear(in_features=256, out_features=128, bias=True)
  (fc_connect_3): Linear(in_features=128, out_features=64, bias=True)
  (fc_connect_4): Linear(in_features=64, out_features=2, bias=True)
)


In [36]:
run_data_df = pd.DataFrame(run_data)
run_data_df.iloc[
    [
        run_data_df[run_data_df['current'] == 'Test']['accuracy'].sort_values(ascending=False).index[0],
        run_data_df[run_data_df['current'] == 'Test']['precision'].sort_values(ascending=False).index[0],
        run_data_df[run_data_df['current'] == 'Test']['recall'].sort_values(ascending=False).index[0],
        run_data_df[run_data_df['current'] == 'Test']['f1'].sort_values(ascending=False).index[0],
        run_data_df[run_data_df['current'] == 'Test']['auc'].sort_values(ascending=False).index[0]
    ],
    :
]

Unnamed: 0,current,run,epoch,loss,accuracy,precision,recall,f1,auc,epoch_duration,run_duration,lr,batch_size,shuffle,device,num_workers
35,Test,3,2,0.904197,0.769962,0.754419,0.820214,0.782698,0.774173,0.570897,11.286926,0.001,50,False,cuda,1
35,Test,3,2,0.904197,0.769962,0.754419,0.820214,0.782698,0.774173,0.570897,11.286926,0.001,50,False,cuda,1
35,Test,3,2,0.904197,0.769962,0.754419,0.820214,0.782698,0.774173,0.570897,11.286926,0.001,50,False,cuda,1
35,Test,3,2,0.904197,0.769962,0.754419,0.820214,0.782698,0.774173,0.570897,11.286926,0.001,50,False,cuda,1
35,Test,3,2,0.904197,0.769962,0.754419,0.820214,0.782698,0.774173,0.570897,11.286926,0.001,50,False,cuda,1
