In [1]:
import scipy.io as sio
import pandas as pd
import numpy as np
#from gensim.models import word2vec
from sklearn import preprocessing
from sklearn import metrics
import os
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import torch
from torch.utils.data import DataLoader
from torch import nn, optim
from torch.autograd import Variable
import time
import warnings
warnings.filterwarnings("ignore")
import random
def setup_seed(seed):
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
     np.random.seed(seed)
     random.seed(seed)
     torch.backends.cudnn.deterministic = True
# 设置随机数种子
setup_seed(2021)

In [2]:
path = 'drive/MyDrive/心电图/'
df = pd.read_csv(path+'trainreference.csv')

In [3]:
from torch.utils.data import Dataset, DataLoader
setup_seed(2021)
class myDataset(Dataset):
    def __init__(self, df, idx=None, if_train=True):
        self.if_train = if_train
        if self.if_train:
            self.paths = df.loc[idx, 'name'].reset_index(drop=True)
            self.labels = df.loc[idx, 'tag'].reset_index(drop=True)
        else:
            self.paths = df['name'].reset_index(drop=True)
            self.labels = df['tag'].reset_index(drop=True)

    def __getitem__(self, index):
        if self.if_train:
            sample = sio.loadmat(path+'train/'+self.paths[index])['ecgdata']
        else:
            sample = sio.loadmat(path+'val/'+self.paths[index])['ecgdata']
        return sample, self.labels[index]

    def __len__(self):
        return len(self.paths)

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class SeqNet(nn.Module):
    def __init__(self):
        super(SeqNet, self).__init__()
        # input 
        self.conv1 = nn.Conv1d(12, 10, 50)
        self.conv2 = nn.Conv1d(12, 10, 200)
        self.conv3 = nn.Conv1d(12, 10, 500)
        self.conv4 = nn.Conv1d(12, 10, 1000)
        self.pooling = nn.MaxPool2d((1, 200))
        self.fc1 = nn.Linear(900, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        batch_size = x.size(0)
        
        out1 = self.pooling(F.relu(self.conv1(x)))
        out2 = self.pooling(F.relu(self.conv2(x)))
        out3 = self.pooling(F.relu(self.conv3(x)))
        out4 = self.pooling(F.relu(self.conv4(x)))

        out = torch.cat([out1, out2, out3, out4], 2)
        out = out.view(batch_size, -1)
        out = self.fc1(out)
        out = F.relu(out)
        # out = F.dropout(out, p=0.2)
        out = self.fc2(out)
        return out


In [5]:
import time
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
max_epoch = 20
model_save_dir = path+'model/'
def train_model(model, criterion, optimizer, lr_scheduler=None):
    total_iters=len(trainloader)
    print('--------------total_iters:{}'.format(total_iters))
    since = time.time()
    best_loss = 1e7
    best_epoch = 0
    best_f1 = 0
    #
    iters = len(trainloader)
    for epoch in range(1,max_epoch+1):
        model.train(True)
        begin_time=time.time()
        # print('learning rate:{}'.format(optimizer.param_groups[-1]['lr']))
        print('Fold{} Epoch {}/{}'.format(fold+1,epoch, max_epoch))
        running_corrects_linear = 0
        count=0
        train_loss = []
        for i, (inputs, labels) in (enumerate(trainloader)):
            # print(inputs)
            count+=1
            inputs = inputs.to(device)
            labels = labels.float().to(device)

            out_linear = model(inputs).to(device)
            loss = criterion(out_linear, labels.unsqueeze(1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # 更新cosine学习率
            if lr_scheduler!=None:
                lr_scheduler.step(epoch + count / iters)
            if print_interval>0 and (i % print_interval == 0 or out_linear.size()[0] < train_batch_size):
                spend_time = time.time() - begin_time
                print(
                    ' Fold:{} Epoch:{}({}/{}) loss:{:.3f} lr:{:.7f} epoch_Time:{}min:'.format(
                        fold+1,epoch, count, total_iters,
                        loss.item(), optimizer.param_groups[-1]['lr'],
                        spend_time / count * total_iters // 60 - spend_time // 60))
            #
            train_loss.append(loss.item())
        #lr_scheduler.step()
        val_f1, val_loss= val_model(model, criterion)
        print('valf1: {:.4f}  valLogLoss: {:.4f}'.format(val_f1, val_loss))
        model_out_path = model_save_dir+"/"+'fold_'+str(fold+1)+'_'+str(epoch) + '.pth'
        best_model_out_path = model_save_dir+"/"+'fold_'+str(fold+1)+'_best'+'.pth'
        #save the best model
        if val_f1 >= best_f1:
            best_loss = val_loss
            best_f1 = val_f1
            best_epoch=epoch
            torch.save(model.state_dict(), best_model_out_path)
            print("save best epoch: {} best f1: {:.5f} best logloss: {:.5f}".format(best_epoch,val_f1,val_loss))
  
    print('Fold{} Best f1: {:.3f} Best logloss: {:.3f} Best epoch:{}'.format(fold+1,best_f1, best_loss,best_epoch))
    time_elapsed = time.time() - since
    return best_loss, best_f1

@torch.no_grad()
def val_model(model, criterion):
    model.eval()
    running_loss = 0.0
    running_corrects = 0
    cont = 0
    outPre = []
    outLabel = []
    pres_list=[]
    labels_list=[]
    for data in val_loader:
        inputs, labels = data
        inputs, labels = inputs.cuda(), labels.cuda()
        outputs = model(inputs)
        pres_list+=outputs.sigmoid().detach().cpu().numpy().tolist()
        labels_list+=labels.detach().cpu().numpy().tolist()

    preds = np.array(pres_list)
    labels = np.array(labels_list)
    val_f1 = metrics.f1_score(labels, list(map(lambda x: 1 if x > 0.5 else 0, preds)))
    log_loss = metrics.log_loss(labels, preds)#
    return val_f1, log_loss


In [9]:
setup_seed(2021)
skf = StratifiedKFold(n_splits=5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
criterion = nn.BCEWithLogitsLoss()
print_interval=-1
kfold_best_loss = []
kfold_best_f1 = []
# print(len(df))
for fold, (train_idx, val_idx) in enumerate(skf.split(df, df['tag'].values)):
    trainloader = torch.utils.data.DataLoader(
        myDataset(df, train_idx), 
        batch_size=32, shuffle=True, pin_memory=True, num_workers=1)
    val_loader = torch.utils.data.DataLoader(
        myDataset(df, val_idx), 
        batch_size=128, shuffle=False, pin_memory=True, num_workers=1)
    model = SeqNet()
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4 ,weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max_epoch)

    best_loss, best_acc = train_model(model, criterion, optimizer, lr_scheduler=scheduler)
    kfold_best_loss.append(best_loss)
    kfold_best_f1.append(best_acc)

print(kfold_best_f1)                  
print('loss...', np.mean(kfold_best_loss), 'f1...', np.mean(kfold_best_f1))

cuda
--------------total_iters:40
Fold1 Epoch 1/20
valf1: 0.5882  valLogLoss: 0.6690
save best epoch: 1 best f1: 0.58824 best logloss: 0.66902
Fold1 Epoch 2/20
valf1: 0.7696  valLogLoss: 0.6188
save best epoch: 2 best f1: 0.76963 best logloss: 0.61883
Fold1 Epoch 3/20
valf1: 0.7797  valLogLoss: 0.5340
save best epoch: 3 best f1: 0.77966 best logloss: 0.53395
Fold1 Epoch 4/20
valf1: 0.7781  valLogLoss: 0.4845
Fold1 Epoch 5/20
valf1: 0.7988  valLogLoss: 0.4509
save best epoch: 5 best f1: 0.79878 best logloss: 0.45091
Fold1 Epoch 6/20
valf1: 0.8082  valLogLoss: 0.4231
save best epoch: 6 best f1: 0.80822 best logloss: 0.42306
Fold1 Epoch 7/20
valf1: 0.8072  valLogLoss: 0.4224
Fold1 Epoch 8/20
valf1: 0.8143  valLogLoss: 0.3949
save best epoch: 8 best f1: 0.81433 best logloss: 0.39485
Fold1 Epoch 9/20
valf1: 0.7987  valLogLoss: 0.4063
Fold1 Epoch 10/20
valf1: 0.8173  valLogLoss: 0.3972
save best epoch: 10 best f1: 0.81734 best logloss: 0.39722
Fold1 Epoch 11/20
valf1: 0.8194  valLogLoss: 0.3

In [10]:
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
def load_model(weight_path):
    print(weight_path)
    model = SeqNet()
    model.load_state_dict(torch.load(weight_path))
    model.to(device)
    model.eval()
    return model

@torch.no_grad()
def predict(test_loader):
    ret = 0
    for i, model in enumerate(model_list):
        print('----model ', i)
        pres_list = []
        for data in tqdm(test_loader):
            inputs, _a = data
            inputs = inputs.cuda()
            outputs = model(inputs)
            pres_list+=outputs.sigmoid().detach().cpu().numpy().tolist()
        ret += np.array(pres_list) / len(model_list)
    return list(map(lambda x: 1 if x > 0.5 else 0, ret))

In [11]:
device=torch.device('cuda')
model_list=[]
for i in range(5):
    model_list.append(load_model(path+'model/fold_'+str(i+1)+'_best.pth'))
import os

sub = pd.read_csv(path+'answer.csv')
test_loader = torch.utils.data.DataLoader(
        myDataset(sub, if_train=False), 
        batch_size=64, shuffle=False, num_workers=16, pin_memory=True)
sub['tag'] = predict(test_loader)
sub.to_csv(path+'ans/sub_20211118_%.5f.csv'%np.mean(kfold_best_f1), index=False)


drive/MyDrive/心电图/model/fold_1_best.pth
drive/MyDrive/心电图/model/fold_2_best.pth
drive/MyDrive/心电图/model/fold_3_best.pth
drive/MyDrive/心电图/model/fold_4_best.pth
drive/MyDrive/心电图/model/fold_5_best.pth
----model  0


100%|██████████| 7/7 [00:22<00:00,  3.24s/it]


----model  1


100%|██████████| 7/7 [00:03<00:00,  2.29it/s]


----model  2


100%|██████████| 7/7 [00:03<00:00,  2.26it/s]


----model  3


100%|██████████| 7/7 [00:03<00:00,  2.28it/s]


----model  4


100%|██████████| 7/7 [00:03<00:00,  2.27it/s]
