In [1]:
import time
import numpy as np
import torchmetrics
from torchmetrics import Accuracy

import torch
import torch.utils.data as Data
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
from torch.autograd import Variable

import transformers
from transformers import GPT2Tokenizer
from transformers import GPT2LMHeadModel
 

In [2]:
# choose train mode
global mode
# mode = 'pretrain'
mode = 'finetune'
# mode = 'else'


In [4]:
# device and tokenizer

device = torch.device('cuda:0')


tokenizer = GPT2Tokenizer('./vocab_file/vocab.json', './vocab_file/merges.txt')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
if mode == 'pretrain':
    tokenizer.save_pretrained('./save_model/pretrain_model/pretrained-gpt-10-64raw-50epochs') # when pretrain model
print(tokenizer)

PreTrainedTokenizer(name_or_path='', vocab_size=23, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '[PAD]'})


In [5]:
#定义数据集
class Dataset(torch.utils.data.Dataset):

    def __init__(self):
        if mode == 'finetune':
#             with open('./data/ADP3_amp.txt') as f:  # when finetune
            with open('./data/ADP3_amp.txt') as f:  # when finetune
                lines = f.readlines()
        elif mode == 'pretrain':
            with open('./data/pretrain_data/uniprot10-63.txt') as f:  # when pretrain model 
                lines = f.readlines()
        else:
            print('train mode error')
            with open('./data/ADP3_amp.txt') as f:
                lines = f.readlines()
        lines = [i.strip() for i in lines]

        self.lines = lines

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, i):
        return self.lines[i]


global val_split
if mode == 'pretrain':
    val_split = 0.01
elif mode =='finetune':
    val_split = 0.1
else:
    val_split = 0.4

shuffle_dataset = True
random_seed = 42

dataset = Dataset()
dataset_size = len(dataset)

indices = list(range(dataset_size))
split = int(np.floor(val_split * dataset_size))
if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
train_sampler = Data.SubsetRandomSampler(train_indices)
val_sampler = Data.SubsetRandomSampler(val_indices)

def collate_fn(data):
    data = tokenizer.batch_encode_plus(data,
                                       padding=True,
                                       truncation=True,
                                       max_length=48,
                                       return_tensors='pt')

    data['labels'] = data['input_ids'].clone()

    return data



train_loader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=32, 
    sampler=train_sampler,
    collate_fn=collate_fn,
    drop_last=True,)


val_loader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=32, 
    sampler=val_sampler,
    collate_fn=collate_fn,
    drop_last=True,)

# for i, data in enumerate(val_loader):
    

#     for k, v in data.items():
#         print(k, v.shape, v)

len(train_loader)

178

In [6]:
# define GPT model

from transformers import GPT2Model, GPT2Config

# Initializing a GPT2 configuration
configuration = GPT2Config(n_layer=12, 
                           n_head=12,
                           n_embd=768)

# print(configuration)

# Initializing a model from the configuration
if mode == 'pretrain':
    model = GPT2LMHeadModel(configuration)  # when pretrain model
elif mode == 'finetune':
    model = GPT2LMHeadModel.from_pretrained('./save_model/pretrain_model/pretrained-GPT-10-64washed-30epochs')
else:
    pass
#     model = GPT2LMHeadModel.from_pretrained('./save_model/pretrain_model/pretrained-GPT-10-64raw-20epochs/')
# model = torch.load('./save_model/pretrained-gpt-10-48-30epochs/pytorch_model.bin')  # pretrain model use


In [7]:
from torch.optim import AdamW
from transformers.optimization import get_scheduler

accuracy = Accuracy(task="multiclass",num_classes=23,ignore_index=23)
accuracy = accuracy.to(device)

epochs = 50

train_loss_list = []
val_loss_list = []
train_acc_list =[]
val_acc_list = []
blue_score_list = []
    
#训练
def train():
    global model
#     device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    
    optimizer = AdamW(model.parameters(), lr=1e-6)
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=400,
                              num_training_steps=len(train_loader)*epochs,
                              optimizer=optimizer)

    model.train()
    print('开始训练')
    start_time = time.time()

    for epoch in range(epochs):
        train_loss = []
        train_accuracy = []
        val_loss = []
        val_accuracy = []
        for batch_idx, batch_data in enumerate(train_loader):
            batch_data = batch_data.to(device)
            out = model(**batch_data)
            loss = out['loss']
            
            train_loss.append(loss.item())
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            optimizer.zero_grad()
            model.zero_grad()
            
            labels = batch_data['labels'][:, 1:]
            out = out['logits'].argmax(dim=2)[:, :-1]
            train_acc = accuracy(out, labels)
            train_accuracy.append(train_acc.tolist())
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            
            if batch_idx % 50 == 0:
                print('train_batch: {:3d}  loss:{:.4f}  accuracy:{:.4f}'
                      .format(batch_idx, loss.item(), train_acc.item()))
            
        for batch_idx, batch_data in enumerate(val_loader):
            batch_data = batch_data.to(device)
            out = model(**batch_data)
            loss = out['loss']
            labels = batch_data['labels'][:, 1:]
            out = out['logits'].argmax(dim=2)[:, :-1]
            val_acc = accuracy(out, labels)
            val_loss.append(loss.item())
            val_accuracy.append(val_acc.tolist())
            
            if batch_idx % 50 == 0:
                print('val_batch:   {:3d}  loss:{:.4f}  accuracy:{:.4f}'
                      .format(batch_idx, loss.item(), val_acc.item()))
            
        train_loss_list.append(np.mean(train_loss))
        train_acc_list.append(np.mean(train_accuracy))
        val_loss_list.append(np.mean(val_loss))
        val_acc_list.append(np.mean(val_accuracy))
        
        train_time = time.time()
        print('第{}代训练完成,历时{}秒'.format(epoch+1,train_time-start_time))
        print('epoch {} mean training loss:{:.4f}'.format(epoch+1, np.mean(train_loss)))
        print('epoch {} mean training accuracy:{:.4f}'.format(epoch+1, np.mean(train_accuracy)))
        print('epoch {} mean val loss:{:.4f}'.format(epoch+1, np.mean(val_loss)))
        print('epoch {} mean val accuracy:{:.4f} '.format(epoch+1, np.mean(val_accuracy)))
        print(' ')
        
    
    end_time = time.time()
    print('训练结束,训练时长：',end_time-start_time, '秒')   
    
    

In [8]:
# 训练模型

train()


开始训练
train_batch:   0  loss:1.5708  accuracy:0.1773
train_batch:  50  loss:1.9412  accuracy:0.1529
train_batch: 100  loss:1.7510  accuracy:0.1672
train_batch: 150  loss:1.7228  accuracy:0.1403
val_batch:     0  loss:1.7831  accuracy:0.1637
第1代训练完成,历时17.8853702545166秒
epoch 1 mean training loss:1.7813
epoch 1 mean training accuracy:0.1601
epoch 1 mean val loss:1.7604
epoch 1 mean val accuracy:0.1836 
 
train_batch:   0  loss:1.6970  accuracy:0.2013
train_batch:  50  loss:1.6088  accuracy:0.1804
train_batch: 100  loss:1.7907  accuracy:0.1988
train_batch: 150  loss:1.6859  accuracy:0.1907
val_batch:     0  loss:1.6240  accuracy:0.2530
第2代训练完成,历时35.07102298736572秒
epoch 2 mean training loss:1.6514
epoch 2 mean training accuracy:0.1997
epoch 2 mean val loss:1.6729
epoch 2 mean val accuracy:0.2225 
 
train_batch:   0  loss:1.3336  accuracy:0.2311
train_batch:  50  loss:1.4629  accuracy:0.2220
train_batch: 100  loss:1.6727  accuracy:0.2095
train_batch: 150  loss:1.4565  accuracy:0.2503
val_ba

train_batch: 150  loss:1.2547  accuracy:0.4440
val_batch:     0  loss:1.3799  accuracy:0.3662
第21代训练完成,历时364.2272231578827秒
epoch 21 mean training loss:1.2097
epoch 21 mean training accuracy:0.4169
epoch 21 mean val loss:1.3955
epoch 21 mean val accuracy:0.3590 
 
train_batch:   0  loss:1.3915  accuracy:0.3780
train_batch:  50  loss:1.1482  accuracy:0.4672
train_batch: 100  loss:1.1568  accuracy:0.4468
train_batch: 150  loss:1.1807  accuracy:0.4524
val_batch:     0  loss:1.2567  accuracy:0.3902
第22代训练完成,历时381.5475811958313秒
epoch 22 mean training loss:1.2005
epoch 22 mean training accuracy:0.4211
epoch 22 mean val loss:1.4063
epoch 22 mean val accuracy:0.3627 
 
train_batch:   0  loss:1.1230  accuracy:0.4377
train_batch:  50  loss:1.1583  accuracy:0.4691
train_batch: 100  loss:1.2165  accuracy:0.4002
train_batch: 150  loss:1.2302  accuracy:0.4061
val_batch:     0  loss:1.4588  accuracy:0.3344
第23代训练完成,历时398.86718821525574秒
epoch 23 mean training loss:1.1917
epoch 23 mean training accur

第41代训练完成,历时710.753725528717秒
epoch 41 mean training loss:1.1005
epoch 41 mean training accuracy:0.4683
epoch 41 mean val loss:1.3711
epoch 41 mean val accuracy:0.3761 
 
train_batch:   0  loss:0.9857  accuracy:0.4854
train_batch:  50  loss:1.0387  accuracy:0.4860
train_batch: 100  loss:0.9914  accuracy:0.4983
train_batch: 150  loss:1.0535  accuracy:0.4375
val_batch:     0  loss:1.4588  accuracy:0.3651
第42代训练完成,历时728.0653424263秒
epoch 42 mean training loss:1.0985
epoch 42 mean training accuracy:0.4696
epoch 42 mean val loss:1.3841
epoch 42 mean val accuracy:0.3764 
 
train_batch:   0  loss:1.3116  accuracy:0.4347
train_batch:  50  loss:1.1381  accuracy:0.4703
train_batch: 100  loss:1.1224  accuracy:0.4175
train_batch: 150  loss:1.1214  accuracy:0.4575
val_batch:     0  loss:1.4697  accuracy:0.3285
第43代训练完成,历时745.3863489627838秒
epoch 43 mean training loss:1.0955
epoch 43 mean training accuracy:0.4704
epoch 43 mean val loss:1.3752
epoch 43 mean val accuracy:0.3779 
 
train_batch:   0  los

In [12]:
# 计算Bleu分数

from torchmetrics import BLEUScore
from statistics import mean

def turn_numlist_into_strlist(num_list):
    return [' '.join(list(map(str,num_list)))]
"""将数字列表转化为bleu计算的字符串列表"""

bleu = BLEUScore(5)

bleu_list = []
for batch_idx, batch_data in enumerate(val_loader):
            batch_data = batch_data.to(device)
            out = model(**batch_data)
            loss = out['loss']
            labels = batch_data['labels'][:, 1:].tolist()
            outputs = out['logits'].argmax(dim=2)[:, :-1].tolist()
#             print(labels)
            fix_outputs = []
            fix_labels = []
            fix_idx = []

            for seq_num in labels:
                fix_labels.append(seq_num[:len(seq_num)-seq_num.count(23)])
                fix_idx.append(len(seq_num)-seq_num.count(23))
            for idx, seq_num in enumerate(outputs):
                fix_outputs.append(seq_num[:fix_idx[idx]])
           
            str_labels = []
            str_outputs = []
            
            for o in fix_outputs:
                str_outputs.append(turn_numlist_into_strlist(o))
            
            for l in fix_labels:
                str_labels.append(turn_numlist_into_strlist(l))
#             print(str_labels)
            
            for i, c in enumerate(str_outputs):
#                 print(i ,c)
                bleu_list.append(bleu(c, [str_labels[i]]).tolist())
        
# print(bleu_list)
print(mean(bleu_list))

0.14114237015478706


In [48]:
print(sum(i.numel() for i in model.parameters()) / 10000)


12443.9808


In [None]:
# 保存模型

model = model.to('cpu')

if mode == 'pretrain':
    model.save_pretrained('./save_model/pretrain_model/pretrained-GPT-10-64raw-20epochs/')  # when pretrain model
elif mode == 'finetune':
    torch.save(model, './save_model/finetune-model/finetune_with_AMPbert_data/finetune-10-48-GPT-'+str(epochs)+'epochs')  # finetune model
else:
    pass
    

In [None]:
# 结果作图

import matplotlib.pyplot as plt

x1 = [(x+1) for x in range(len(train_loss_list))]
x2 = [(x+1) for x in range(len(val_acc_list))]
y1 = train_loss_list
y2 = train_acc_list
y3 = val_loss_list
y4 = val_acc_list

plt.plot(x1, y1, label="AMP training loss")
plt.plot(x1, y3, label="AMP val_loss")
plt.xlabel('step')
plt.ylabel('loss')
plt.title('AMP train losses show')
plt.legend()
if mode == 'pretrain':
    plt.savefig('/xms/AMP-master/generate_file/train_graphical_result-2023/pretrain_result/'+
                mode+'10-48washed-'+str(epochs)+'epochs_loss.jpg')
elif mode == 'finetune':
    plt.savefig('/xms/AMP-master/generate_file/train_graphical_result-2023/finetune_result/'+
                mode+'10-48washed-'+str(epochs)+'epochs_loss.jpg')
else:
    pass
plt.show()


plt.plot(x2, y2, label="AMP train_acc curse")
plt.plot(x2, y4, label="AMP val_acc curse")
plt.xlabel('step')
plt.ylabel('acc')
plt.title('AMP val_acc show')
plt.legend()
if mode == 'pretrain':
    plt.savefig('/xms/AMP-master/generate_file/train_graphical_result-2023/pretrain_result/'+
                mode+'10-48washed-'+str(epochs)+'epochs_accuracy.jpg')
elif mode == 'finetune':
    plt.savefig('/xms/AMP-master/generate_file/train_graphical_result-2023/finetune_result/'+
                mode+'10-48washed-'+str(epochs)+'epochs_accuracy.jpg')
else:
    pass

plt.show()