# 模型验证

## 划分训练集和验证集

In [21]:
import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

def split_train_valid(csv_path,train_prec=0.75):
    csv_data = pd.read_csv(csv_path,sep='\t') 
    text_data = csv_data.text
    label_data = csv_data.label
    
    mask = np.random.rand(len(text_data))<train_prec
    train_text_data = text_data[mask].reset_index(drop=True)
    valid_text_data = text_data[~mask].reset_index(drop=True)
    train_label_data = label_data[mask].reset_index(drop=True)
    valid_label_data = label_data[~mask].reset_index(drop=True)
    
    return train_text_data,valid_text_data,train_label_data,valid_label_data

class MyDataset(Dataset):
    def __init__(self,text_data,label_data):
        self.text_data = text_data
        self.label_data = label_data
        
    def __getitem__(self,index):
        #所有text内的token索引增加1，0空出来代表空格，将每个text控制在1000长
        text_str = self.text_data[index]
        text_list = [int(x)+1 for x in text_str.split()]
        if len(text_list)>=1000:
            text_list = text_list[:1000]
        else:
            text_list.extend([0]*(1000-len(text_list)))
        text_array = np.array(text_list)
        label_array = np.array(self.label_data[index])
        return text_array,label_array
        
    def __len__(self):
        return len(self.text_data)

class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(7551,100)
        self.rnn = nn.GRU(100,50,batch_first=True)
        self.fc = nn.Linear(50,14)
        
    def forward(self,X):
#         print('X:',X.shape)
        X = self.embedding(X)
#         print('embedding:',X.shape)
        _,X = self.rnn(X)
#         print('rnn:',X.shape)
        X = X.squeeze(dim=0)
#         print('squeeze:',X.shape)
        y_hat = self.fc(X)
#         print('y_hat:',y_hat.shape)
        return y_hat

    
class MyTrain():
    def __init__(self,max_epoch=1,random_seed=1,lr=0.001,out_dir='./'):
        self.max_epoch = max_epoch
        self.random_seed = random_seed
        self.lr = lr
        self.out_dir = out_dir
        self.iter = 0
        
    def fix_random(self):
        import random
        import numpy as np
        import torch
        random.seed(self.random_seed)
        np.random.seed(self.random_seed)
        torch.random.manual_seed(self.random_seed)
        torch.cuda.random.manual_seed_all(self.random_seed)
        torch.backends.cudnn.deterministic = True
        print(f'random seed:{self.random_seed}')
        
    def my_train(self):
        
        max_epoch,lr = self.max_epoch,self.lr
        
        if self.random_seed is not None:
            self.fix_random()
        
        train_text_data,valid_text_data,train_label_data,valid_label_data = split_train_valid('./train_set.csv',train_prec=0.75)
        train_dataset = MyDataset(train_text_data,train_label_data)
        valid_dataset = MyDataset(valid_text_data,valid_label_data)
        
        my_model = MyModel()
        my_optim = torch.optim.Adam(my_model.parameters(),lr=lr)
        my_loss = nn.CrossEntropyLoss()
        
        if torch.cuda.is_available():
            my_model.cuda()
            my_loss.cuda()
        print(f'train device:{next(iter(my_model.parameters())).device}')  #显示训练设备
        
        best_f1_score = 0
        epoch_index = 0

        for epoch_index in range(max_epoch):
            
            loss_list = []
            train_f1_score_list = []
            valid_f1_score_list = []
            train_dataloader = DataLoader(train_dataset,batch_size=120,shuffle=True)
            valid_dataloader = DataLoader(valid_dataset,batch_size=40,shuffle=True)
            my_dataloader = tqdm(train_dataloader)
            
            batch_index = 0
                        
            for train_data,valid_data in zip(my_dataloader,valid_dataloader):
                                
                my_model.train()  #将模型设置为训练模式
                train_text,train_label = train_data
                valid_text,valid_label = valid_data
                
                if torch.cuda.is_available():
                    train_text = train_text.cuda()
                    train_label = train_label.cuda()
                    valid_text = valid_text.cuda()
                    valid_label = valid_label.cuda()
                
                train_y_hat = my_model(train_text)
                batch_train_loss = my_loss(train_y_hat,train_label)
                
                my_optim.zero_grad()
                batch_train_loss.backward()
                my_optim.step()
                my_optim.param_groups[0]['lr'] = lr*(0.8**(epoch_index%10))
        
                my_model.eval()  #将模型设置为验证模式
                with torch.no_grad():
                    valid_y_hat = my_model(valid_text)
                    batch_valid_f1_score = self.f1_score(valid_y_hat.data,valid_label.data)
                    valid_f1_score_list.append(batch_valid_f1_score)
                    mean_valid_f1 = round(sum(valid_f1_score_list)/len(valid_f1_score_list),3)

                    #显示batch结果
                    batch_lr = round(my_optim.param_groups[0]['lr'],5)
                    batch_loss = round(batch_train_loss.item(),4)
                    loss_list.append(batch_loss)
                    mean_loss = round((sum(loss_list)/len(loss_list)),3)

                    batch_train_f1_score = self.f1_score(train_y_hat.data,train_label.data)
                    train_f1_score_list.append(batch_train_f1_score)
                    mean_train_f1 = round(sum(train_f1_score_list)/len(train_f1_score_list),3)

                    my_dataloader.set_description(f'epoch:{epoch_index},batch:{batch_index},lr:{batch_lr},loss:{batch_loss},mean_loss:{mean_loss},train_f1:{mean_train_f1},valid_f1:{mean_valid_f1}')

                    #存储模型
                    if batch_train_f1_score>best_f1_score:
                        torch.save(my_model.state_dict(),os.path.join(self.out_dir,'embedding_gru_best'))
                        best_f1_score = batch_train_f1_score

                    batch_index+=1
            
                
    def f1_score(self,y_hat,label,eps=1e-8):
        #y_hat(N,C),label(1)
        y_hat = y_hat.cpu()
        label = label.cpu()
        preds_list = list(torch.argmax(y_hat,dim=1).numpy())
        label_list = list(label.numpy())
#         print(f'preds:{preds_list},label:{label_list}')
        class_index_list = []
        for class_index in label_list:
            if class_index not in class_index_list:
                class_index_list.append(class_index)

        f1_score_list = []
        for index in class_index_list:
            if index not in preds_list:
                sub_f1_score = 0
            else:
                tp = 0
                fp = 0
                fn = 0
                for i in range(len(preds_list)):
                    if preds_list[i] == index and label_list[i] == index:
                        tp+=1
                    if preds_list[i] == index and label_list[i] != index: 
                        fp+=1
                    if preds_list[i] != index and label_list[i] == index:
                        fn+=1
                prec_val = tp/(tp+fp) 
                recall_val = tp/(tp+fn)
                sub_f1_score = 2*(prec_val*recall_val)/(prec_val+recall_val+eps)
            f1_score_list.append(sub_f1_score)

        batch_f1_score = sum(f1_score_list)/len(f1_score_list)

        return batch_f1_score
        

In [22]:
if __name__ == '__main__':
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    MyTrain(max_epoch=100,random_seed=1).my_train()

random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.7694,mean_loss:1.442,train_f1:0.264,valid_f1:0.304: 100%|▉| 1249/1251 [01:39<00:00, 
epoch:1,batch:1248,lr:0.0008,loss:0.4283,mean_loss:0.509,train_f1:0.691,valid_f1:0.73: 100%|▉| 1249/1251 [01:40<00:00, 
epoch:2,batch:1248,lr:0.00064,loss:0.2753,mean_loss:0.319,train_f1:0.839,valid_f1:0.847: 100%|▉| 1249/1251 [01:39<00:00
epoch:3,batch:1248,lr:0.00051,loss:0.2345,mean_loss:0.256,train_f1:0.88,valid_f1:0.876: 100%|▉| 1249/1251 [01:40<00:00,
epoch:4,batch:1248,lr:0.00041,loss:0.2978,mean_loss:0.223,train_f1:0.9,valid_f1:0.89: 100%|▉| 1249/1251 [01:40<00:00, 1
epoch:5,batch:1248,lr:0.00033,loss:0.1748,mean_loss:0.201,train_f1:0.911,valid_f1:0.895: 100%|▉| 1249/1251 [01:39<00:00
epoch:6,batch:1248,lr:0.00026,loss:0.183,mean_loss:0.185,train_f1:0.921,valid_f1:0.897: 100%|▉| 1249/1251 [01:40<00:00,
epoch:7,batch:1248,lr:0.00021,loss:0.2157,mean_loss:0.174,train_f1:0.926,valid_f1:0.9: 100%|▉| 1249/1251 [01:40<00:00, 
epoch:8,batch:1248,lr:0.00017,loss:0.198

KeyboardInterrupt: 

## 预测

In [30]:
def predict(model_path,test_data_path):
    p_model = MyModel()
    best_stat_dict = torch.load(model_path)
    p_model.load_state_dict(best_stat_dict)
    if torch.cuda.is_available():
        p_model.cuda()
    test_data = pd.read_csv(test_data_path,sep='\t')
    test_text = test_data.text
    test_label = np.ones_like(test_text) #虚拟一个方便生成dataset和DataLoader
    test_dataset = MyDataset(test_text,test_label)
    test_dataloader = DataLoader(test_dataset,batch_size=1)
    test_dataloader = tqdm(test_dataloader)
    pred_list = []
    for test_index,batch_data in enumerate(test_dataloader):
        batch_text,batch_label = batch_data
        if torch.cuda.is_available():
            batch_text = batch_text.cuda()
        p_model.eval()
        preds = p_model(batch_text)
        pred_class_index = preds.argmax()
        pred_list.append(pred_class_index.item())
    print(len(pred_list))
    return pred_list

In [33]:
if __name__ == '__main__':
    pred_list = predict('./embedding_gru_best_test_data_0.918','./test.csv')
    pred_data = pd.DataFrame(pred_list)
    pred_data.head()
    pred_data.to_csv('test_pred.csv',sep='\t',index=False)
    print('down')

100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [01:53<00:00, 439.91it/s]

50000





AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'

## 调参

使用上面的模型在测试集上获得的得分是0.918，下面进行调参，看是否能获得更好的分数，首先，把只取前1000个字，改成测试取前不同个数的字，对比结果

In [45]:
import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

def split_train_valid(csv_path,train_prec=0.75):
    csv_data = pd.read_csv(csv_path,sep='\t') 
    text_data = csv_data.text
    label_data = csv_data.label
    
    mask = np.random.rand(len(text_data))<train_prec
    train_text_data = text_data[mask].reset_index(drop=True)
    valid_text_data = text_data[~mask].reset_index(drop=True)
    train_label_data = label_data[mask].reset_index(drop=True)
    valid_label_data = label_data[~mask].reset_index(drop=True)
    
    return train_text_data,valid_text_data,train_label_data,valid_label_data

class MyDataset(Dataset):
    def __init__(self,text_data,label_data,word_num):
        self.text_data = text_data
        self.label_data = label_data
        self.word_num = word_num
        
    def __getitem__(self,index):
        #所有text内的token索引增加1，0空出来代表空格，将每个text控制在一定长度内
        text_str = self.text_data[index]
        text_list = [int(x)+1 for x in text_str.split()]
        if len(text_list)>=self.word_num:
            text_list = text_list[:self.word_num]
        else:
            text_list.extend([0]*(self.word_num-len(text_list)))
        text_array = np.array(text_list)
        label_array = np.array(self.label_data[index])
        return text_array,label_array
        
    def __len__(self):
        return len(self.text_data)

class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(7551,100)
        self.rnn = nn.GRU(100,50,batch_first=True)
        self.fc = nn.Linear(50,14)
        
    def forward(self,X):
#         print('X:',X.shape)
        X = self.embedding(X)
#         print('embedding:',X.shape)
        _,X = self.rnn(X)
#         print('rnn:',X.shape)
        X = X.squeeze(dim=0)
#         print('squeeze:',X.shape)
        y_hat = self.fc(X)
#         print('y_hat:',y_hat.shape)
        return y_hat

    
class MyTrain():
    def __init__(self,max_epoch=1,random_seed=1,lr=0.001,out_dir='./',word_num= 1000):
        self.max_epoch = max_epoch
        self.random_seed = random_seed
        self.lr = lr
        self.out_dir = out_dir
        self.iter = 0
        self.word_num = word_num
        
    def fix_random(self):
        import random
        import numpy as np
        import torch
        random.seed(self.random_seed)
        np.random.seed(self.random_seed)
        torch.random.manual_seed(self.random_seed)
        torch.cuda.random.manual_seed_all(self.random_seed)
        torch.backends.cudnn.deterministic = True
        print(f'random seed:{self.random_seed}')
        
    def my_train(self):
        
        max_epoch,lr = self.max_epoch,self.lr
        
        if self.random_seed is not None:
            self.fix_random()
        
        train_text_data,valid_text_data,train_label_data,valid_label_data = split_train_valid('./train_set.csv',train_prec=0.75)
        train_dataset = MyDataset(train_text_data,train_label_data,self.word_num)
        valid_dataset = MyDataset(valid_text_data,valid_label_data,self.word_num)
        
        my_model = MyModel()
        my_optim = torch.optim.Adam(my_model.parameters(),lr=lr)
        my_loss = nn.CrossEntropyLoss()
        
        if torch.cuda.is_available():
            my_model.cuda()
            my_loss.cuda()
        print(f'train device:{next(iter(my_model.parameters())).device}')  #显示训练设备
        
        best_f1_score = 0
        epoch_index = 0

        for epoch_index in range(max_epoch):
            
            loss_list = []
            train_f1_score_list = []
            valid_f1_score_list = []
            train_dataloader = DataLoader(train_dataset,batch_size=120,shuffle=True)
            valid_dataloader = DataLoader(valid_dataset,batch_size=40,shuffle=True)
            my_dataloader = tqdm(train_dataloader)
            
            batch_index = 0
                        
            for train_data,valid_data in zip(my_dataloader,valid_dataloader):
                                
                my_model.train()  #将模型设置为训练模式
                train_text,train_label = train_data
                valid_text,valid_label = valid_data
                
                if torch.cuda.is_available():
                    train_text = train_text.cuda()
                    train_label = train_label.cuda()
                    valid_text = valid_text.cuda()
                    valid_label = valid_label.cuda()
                
                train_y_hat = my_model(train_text)
                batch_train_loss = my_loss(train_y_hat,train_label)
                
                my_optim.zero_grad()
                batch_train_loss.backward()
                my_optim.step()
                my_optim.param_groups[0]['lr'] = lr*(0.8**(epoch_index%10))
        
                my_model.eval()  #将模型设置为验证模式
                with torch.no_grad():
                    valid_y_hat = my_model(valid_text)
                    batch_valid_f1_score = self.f1_score(valid_y_hat.data,valid_label.data)
                    valid_f1_score_list.append(batch_valid_f1_score)
                    mean_valid_f1 = round(sum(valid_f1_score_list)/len(valid_f1_score_list),3)

                    #显示batch结果
                    batch_lr = round(my_optim.param_groups[0]['lr'],5)
                    batch_loss = round(batch_train_loss.item(),4)
                    loss_list.append(batch_loss)
                    mean_loss = round((sum(loss_list)/len(loss_list)),3)

                    batch_train_f1_score = self.f1_score(train_y_hat.data,train_label.data)
                    train_f1_score_list.append(batch_train_f1_score)
                    mean_train_f1 = round(sum(train_f1_score_list)/len(train_f1_score_list),3)

                    my_dataloader.set_description(f'epoch:{epoch_index},batch:{batch_index},lr:{batch_lr},loss:{batch_loss},mean_loss:{mean_loss},train_f1:{mean_train_f1},valid_f1:{mean_valid_f1}')

                    #存储模型
                    if batch_valid_f1_score>0.96:
                        torch.save(my_model.state_dict(),os.path.join(self.out_dir,f'embedding_gru_best_{self.word_num}word_valid_f1_score_{round(batch_valid_f1_score,4)}'))
                        best_f1_score = batch_train_f1_score

                    batch_index+=1
            
                
    def f1_score(self,y_hat,label,eps=1e-8):
        #y_hat(N,C),label(1)
        y_hat = y_hat.cpu()
        label = label.cpu()
        preds_list = list(torch.argmax(y_hat,dim=1).numpy())
        label_list = list(label.numpy())
#         print(f'preds:{preds_list},label:{label_list}')
        class_index_list = []
        for class_index in label_list:
            if class_index not in class_index_list:
                class_index_list.append(class_index)

        f1_score_list = []
        for index in class_index_list:
            if index not in preds_list:
                sub_f1_score = 0
            else:
                tp = 0
                fp = 0
                fn = 0
                for i in range(len(preds_list)):
                    if preds_list[i] == index and label_list[i] == index:
                        tp+=1
                    if preds_list[i] == index and label_list[i] != index: 
                        fp+=1
                    if preds_list[i] != index and label_list[i] == index:
                        fn+=1
                prec_val = tp/(tp+fp) 
                recall_val = tp/(tp+fn)
                sub_f1_score = 2*(prec_val*recall_val)/(prec_val+recall_val+eps)
            f1_score_list.append(sub_f1_score)

        batch_f1_score = sum(f1_score_list)/len(f1_score_list)

        return batch_f1_score
        

In [48]:
if __name__ == '__main__':
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    word_num_list = [50,100,300,500,800,1000,2000]
    for word_num in word_num_list:
        print('word num:',word_num)
        MyTrain(max_epoch=1,random_seed=1,word_num=word_num).my_train()

word num: 50
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.5032,mean_loss:0.988,train_f1:0.49,valid_f1:0.534: 100%|▉| 1249/1251 [01:04<00:00, 1


word num: 100
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.4708,mean_loss:1.019,train_f1:0.47,valid_f1:0.506: 100%|▉| 1249/1251 [01:01<00:00, 2


word num: 300
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.4394,mean_loss:1.103,train_f1:0.432,valid_f1:0.476: 100%|▉| 1249/1251 [01:09<00:00, 


word num: 500
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.4475,mean_loss:1.196,train_f1:0.386,valid_f1:0.43: 100%|▉| 1249/1251 [01:20<00:00, 1


word num: 800
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.5433,mean_loss:1.261,train_f1:0.351,valid_f1:0.395: 100%|▉| 1249/1251 [01:28<00:00, 


word num: 1000
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.7694,mean_loss:1.442,train_f1:0.264,valid_f1:0.304: 100%|▉| 1249/1251 [01:38<00:00, 


word num: 2000
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.8041,mean_loss:1.791,train_f1:0.161,valid_f1:0.19: 100%|▉| 1249/1251 [05:02<00:00,  


In [49]:
if __name__ == '__main__':
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    word_num_list = [10,20,30,40]
    for word_num in word_num_list:
        print('word num:',word_num)
        MyTrain(max_epoch=1,random_seed=1,word_num=word_num).my_train()

word num: 10
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.6856,mean_loss:1.24,train_f1:0.434,valid_f1:0.466: 100%|▉| 1249/1251 [00:56<00:00, 2


word num: 20
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.5959,mean_loss:1.061,train_f1:0.496,valid_f1:0.523: 100%|▉| 1249/1251 [00:59<00:00, 


word num: 30
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.5128,mean_loss:0.916,train_f1:0.528,valid_f1:0.566: 100%|▉| 1249/1251 [01:19<00:00, 


word num: 40
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.4599,mean_loss:0.957,train_f1:0.513,valid_f1:0.555: 100%|▉| 1249/1251 [01:04<00:00, 


每段text，前后各取word_num个字拼成一个新的text，下面是计算结果

In [1]:
import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

def split_train_valid(csv_path,train_prec=0.75):
    csv_data = pd.read_csv(csv_path,sep='\t') 
    text_data = csv_data.text
    label_data = csv_data.label
    
    mask = np.random.rand(len(text_data))<train_prec
    train_text_data = text_data[mask].reset_index(drop=True)
    valid_text_data = text_data[~mask].reset_index(drop=True)
    train_label_data = label_data[mask].reset_index(drop=True)
    valid_label_data = label_data[~mask].reset_index(drop=True)
    
    return train_text_data,valid_text_data,train_label_data,valid_label_data

class MyDataset(Dataset):
    def __init__(self,text_data,label_data,word_num):
        self.text_data = text_data
        self.label_data = label_data
        self.word_num = word_num
        
    def __getitem__(self,index):
        #所有text内的token索引增加1，0空出来代表空格，将每个text控制在一定长度内
        text_str = self.text_data[index]
        text_list = [int(x)+1 for x in text_str.split()]
        if len(text_list)>=2*self.word_num:
            text_list = text_list[:self.word_num]+text_list[-1*self.word_num:]
        else:
            text_list.extend([0]*(2*self.word_num-len(text_list)))
        text_array = np.array(text_list)
        label_array = np.array(self.label_data[index])
        return text_array,label_array
        
    def __len__(self):
        return len(self.text_data)

class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(7551,100)
        self.rnn = nn.GRU(100,50,batch_first=True)
        self.fc = nn.Linear(50,14)
        
    def forward(self,X):
#         print('X:',X.shape)
        X = self.embedding(X)
#         print('embedding:',X.shape)
        _,X = self.rnn(X)
#         print('rnn:',X.shape)
        X = X.squeeze(dim=0)
#         print('squeeze:',X.shape)
        y_hat = self.fc(X)
#         print('y_hat:',y_hat.shape)
        return y_hat

    
class MyTrain():
    def __init__(self,max_epoch=1,random_seed=1,lr=0.001,out_dir='./',word_num= 1000):
        self.max_epoch = max_epoch
        self.random_seed = random_seed
        self.lr = lr
        self.out_dir = out_dir
        self.iter = 0
        self.word_num = word_num
        
    def fix_random(self):
        import random
        import numpy as np
        import torch
        random.seed(self.random_seed)
        np.random.seed(self.random_seed)
        torch.random.manual_seed(self.random_seed)
        torch.cuda.random.manual_seed_all(self.random_seed)
        torch.backends.cudnn.deterministic = True
        print(f'random seed:{self.random_seed}')
        
    def my_train(self):
        
        max_epoch,lr = self.max_epoch,self.lr
        
        if self.random_seed is not None:
            self.fix_random()
        
        train_text_data,valid_text_data,train_label_data,valid_label_data = split_train_valid('./train_set.csv',train_prec=0.75)
        train_dataset = MyDataset(train_text_data,train_label_data,self.word_num)
        valid_dataset = MyDataset(valid_text_data,valid_label_data,self.word_num)
        
        my_model = MyModel()
        my_optim = torch.optim.Adam(my_model.parameters(),lr=lr)
        my_loss = nn.CrossEntropyLoss()
        
        if torch.cuda.is_available():
            my_model.cuda()
            my_loss.cuda()
        print(f'train device:{next(iter(my_model.parameters())).device}')  #显示训练设备
        
        best_f1_score = 0
        epoch_index = 0

        for epoch_index in range(max_epoch):
            
            loss_list = []
            train_f1_score_list = []
            valid_f1_score_list = []
            train_dataloader = DataLoader(train_dataset,batch_size=120,shuffle=True)
            valid_dataloader = DataLoader(valid_dataset,batch_size=40,shuffle=True)
            my_dataloader = tqdm(train_dataloader)
            
            batch_index = 0
                        
            for train_data,valid_data in zip(my_dataloader,valid_dataloader):
                                
                my_model.train()  #将模型设置为训练模式
                train_text,train_label = train_data
                valid_text,valid_label = valid_data
                
                if torch.cuda.is_available():
                    train_text = train_text.cuda()
                    train_label = train_label.cuda()
                    valid_text = valid_text.cuda()
                    valid_label = valid_label.cuda()
                
                train_y_hat = my_model(train_text)
                batch_train_loss = my_loss(train_y_hat,train_label)
                
                my_optim.zero_grad()
                batch_train_loss.backward()
                my_optim.step()
                my_optim.param_groups[0]['lr'] = lr*(0.8**(epoch_index%10))
        
                my_model.eval()  #将模型设置为验证模式
                with torch.no_grad():
                    valid_y_hat = my_model(valid_text)
                    batch_valid_f1_score = self.f1_score(valid_y_hat.data,valid_label.data)
                    valid_f1_score_list.append(batch_valid_f1_score)
                    mean_valid_f1 = round(sum(valid_f1_score_list)/len(valid_f1_score_list),3)

                    #显示batch结果
                    batch_lr = round(my_optim.param_groups[0]['lr'],5)
                    batch_loss = round(batch_train_loss.item(),4)
                    loss_list.append(batch_loss)
                    mean_loss = round((sum(loss_list)/len(loss_list)),3)

                    batch_train_f1_score = self.f1_score(train_y_hat.data,train_label.data)
                    train_f1_score_list.append(batch_train_f1_score)
                    mean_train_f1 = round(sum(train_f1_score_list)/len(train_f1_score_list),3)

                    my_dataloader.set_description(f'epoch:{epoch_index},batch:{batch_index},lr:{batch_lr},loss:{batch_loss},mean_loss:{mean_loss},train_f1:{mean_train_f1},valid_f1:{mean_valid_f1}')

                    #存储模型
                    if mean_valid_f1>0.96:
                        torch.save(my_model.state_dict(),os.path.join(self.out_dir,f'embedding_gru_best_{self.word_num}word_valid_f1_score_{mean_valid_f1}'))

                    batch_index+=1
            
                
    def f1_score(self,y_hat,label,eps=1e-8):
        #y_hat(N,C),label(1)
        y_hat = y_hat.cpu()
        label = label.cpu()
        preds_list = list(torch.argmax(y_hat,dim=1).numpy())
        label_list = list(label.numpy())
#         print(f'preds:{preds_list},label:{label_list}')
        class_index_list = []
        for class_index in label_list:
            if class_index not in class_index_list:
                class_index_list.append(class_index)

        f1_score_list = []
        for index in class_index_list:
            if index not in preds_list:
                sub_f1_score = 0
            else:
                tp = 0
                fp = 0
                fn = 0
                for i in range(len(preds_list)):
                    if preds_list[i] == index and label_list[i] == index:
                        tp+=1
                    if preds_list[i] == index and label_list[i] != index: 
                        fp+=1
                    if preds_list[i] != index and label_list[i] == index:
                        fn+=1
                prec_val = tp/(tp+fp) 
                recall_val = tp/(tp+fn)
                sub_f1_score = 2*(prec_val*recall_val)/(prec_val+recall_val+eps)
            f1_score_list.append(sub_f1_score)

        batch_f1_score = sum(f1_score_list)/len(f1_score_list)

        return batch_f1_score
        

In [2]:
if __name__ == '__main__':
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    word_num_list = [10,20,30,40]
    for word_num in word_num_list:
        print('word num:',word_num)
        MyTrain(max_epoch=1,random_seed=1,word_num=word_num).my_train()

word num: 10
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.6808,mean_loss:1.243,train_f1:0.424,valid_f1:0.447: 100%|▉| 1249/1251 [00:57<00:00, 


word num: 20
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.6044,mean_loss:1.189,train_f1:0.434,valid_f1:0.454: 100%|▉| 1249/1251 [01:03<00:00, 


word num: 30
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.5977,mean_loss:1.112,train_f1:0.455,valid_f1:0.48: 100%|▉| 1249/1251 [00:53<00:00, 2


word num: 40
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.5908,mean_loss:1.125,train_f1:0.449,valid_f1:0.473: 100%|▉| 1249/1251 [01:13<00:00, 


可以看到，效果比只用前面word_num个字效果要差

In [6]:
import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

def split_train_valid(csv_path,train_prec=0.75):
    csv_data = pd.read_csv(csv_path,sep='\t') 
    text_data = csv_data.text
    label_data = csv_data.label
    
    mask = np.random.rand(len(text_data))<train_prec
    train_text_data = text_data[mask].reset_index(drop=True)
    valid_text_data = text_data[~mask].reset_index(drop=True)
    train_label_data = label_data[mask].reset_index(drop=True)
    valid_label_data = label_data[~mask].reset_index(drop=True)
    
    return train_text_data,valid_text_data,train_label_data,valid_label_data

class MyDataset(Dataset):
    def __init__(self,text_data,label_data,word_num):
        self.text_data = text_data
        self.label_data = label_data
        self.word_num = word_num
        
    def __getitem__(self,index):
        #所有text内的token索引增加1，0空出来代表空格，将每个text控制在一定长度内
        text_str = self.text_data[index]
        text_list = [int(x)+1 for x in text_str.split()]
        if len(text_list)>=self.word_num:
            text_list = text_list[:self.word_num]
        else:
            text_list.extend([0]*(self.word_num-len(text_list)))
        text_array = np.array(text_list)
        label_array = np.array(self.label_data[index])
        return text_array,label_array
        
    def __len__(self):
        return len(self.text_data)

class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(7551,100)
        self.rnn = nn.GRU(100,50,batch_first=True)
        self.fc = nn.Linear(50,14)
        
    def forward(self,X):
#         print('X:',X.shape)
        X = self.embedding(X)
#         print('embedding:',X.shape)
        _,X = self.rnn(X)
#         print('rnn:',X.shape)
        X = X.squeeze(dim=0)
#         print('squeeze:',X.shape)
        y_hat = self.fc(X)
#         print('y_hat:',y_hat.shape)
        return y_hat

    
class MyTrain():
    def __init__(self,max_epoch=1,random_seed=1,lr=0.001,out_dir='./',word_num= 1000):
        self.max_epoch = max_epoch
        self.random_seed = random_seed
        self.lr = lr
        self.out_dir = out_dir
        self.iter = 0
        self.word_num = word_num
        
    def fix_random(self):
        import random
        import numpy as np
        import torch
        random.seed(self.random_seed)
        np.random.seed(self.random_seed)
        torch.random.manual_seed(self.random_seed)
        torch.cuda.random.manual_seed_all(self.random_seed)
        torch.backends.cudnn.deterministic = True
        print(f'random seed:{self.random_seed}')
        
    def my_train(self):
        
        max_epoch,lr = self.max_epoch,self.lr
        
        if self.random_seed is not None:
            self.fix_random()
        
        train_text_data,valid_text_data,train_label_data,valid_label_data = split_train_valid('./train_set.csv',train_prec=0.75)
        train_dataset = MyDataset(train_text_data,train_label_data,self.word_num)
        valid_dataset = MyDataset(valid_text_data,valid_label_data,self.word_num)
        
        my_model = MyModel()
        my_optim = torch.optim.Adam(my_model.parameters(),lr=lr)
        my_loss = nn.CrossEntropyLoss()
        
        if torch.cuda.is_available():
            my_model.cuda()
            my_loss.cuda()
        print(f'train device:{next(iter(my_model.parameters())).device}')  #显示训练设备
        
        best_f1_score = 0
        epoch_index = 0

        for epoch_index in range(max_epoch):
            
            loss_list = []
            train_f1_score_list = []
            valid_f1_score_list = []
            train_dataloader = DataLoader(train_dataset,batch_size=120,shuffle=True)
            valid_dataloader = DataLoader(valid_dataset,batch_size=40,shuffle=True)
            my_dataloader = tqdm(train_dataloader)
            
            batch_index = 0
                        
            for train_data,valid_data in zip(my_dataloader,valid_dataloader):
                                
                my_model.train()  #将模型设置为训练模式
                train_text,train_label = train_data
                valid_text,valid_label = valid_data
                
                if torch.cuda.is_available():
                    train_text = train_text.cuda()
                    train_label = train_label.cuda()
                    valid_text = valid_text.cuda()
                    valid_label = valid_label.cuda()
                
                train_y_hat = my_model(train_text)
                batch_train_loss = my_loss(train_y_hat,train_label)
                
                my_optim.zero_grad()
                batch_train_loss.backward()
                my_optim.step()
                my_optim.param_groups[0]['lr'] = lr*(0.8**(epoch_index%10))
        
                my_model.eval()  #将模型设置为验证模式
                with torch.no_grad():
                    valid_y_hat = my_model(valid_text)
                    batch_valid_f1_score = self.f1_score(valid_y_hat.data,valid_label.data)
                    valid_f1_score_list.append(batch_valid_f1_score)
                    mean_valid_f1 = round(sum(valid_f1_score_list)/len(valid_f1_score_list),3)

                    #显示batch结果
                    batch_lr = round(my_optim.param_groups[0]['lr'],5)
                    batch_loss = round(batch_train_loss.item(),4)
                    loss_list.append(batch_loss)
                    mean_loss = round((sum(loss_list)/len(loss_list)),3)

                    batch_train_f1_score = self.f1_score(train_y_hat.data,train_label.data)
                    train_f1_score_list.append(batch_train_f1_score)
                    mean_train_f1 = round(sum(train_f1_score_list)/len(train_f1_score_list),3)

                    my_dataloader.set_description(f'epoch:{epoch_index},batch:{batch_index},lr:{batch_lr},loss:{batch_loss},mean_loss:{mean_loss},train_f1:{mean_train_f1},valid_f1:{mean_valid_f1}')
                    
                    batch_index+=1
            #存储模型
            if mean_valid_f1>0.96:
                torch.save(my_model.state_dict(),os.path.join(self.out_dir,f'embedding_gru_best_{self.word_num}word_valid_f1_score_{mean_valid_f1}'))

            
                
    def f1_score(self,y_hat,label,eps=1e-8):
        #y_hat(N,C),label(1)
        y_hat = y_hat.cpu()
        label = label.cpu()
        preds_list = list(torch.argmax(y_hat,dim=1).numpy())
        label_list = list(label.numpy())
#         print(f'preds:{preds_list},label:{label_list}')
        class_index_list = []
        for class_index in label_list:
            if class_index not in class_index_list:
                class_index_list.append(class_index)

        f1_score_list = []
        for index in class_index_list:
            if index not in preds_list:
                sub_f1_score = 0
            else:
                tp = 0
                fp = 0
                fn = 0
                for i in range(len(preds_list)):
                    if preds_list[i] == index and label_list[i] == index:
                        tp+=1
                    if preds_list[i] == index and label_list[i] != index: 
                        fp+=1
                    if preds_list[i] != index and label_list[i] == index:
                        fn+=1
                prec_val = tp/(tp+fp) 
                recall_val = tp/(tp+fn)
                sub_f1_score = 2*(prec_val*recall_val)/(prec_val+recall_val+eps)
            f1_score_list.append(sub_f1_score)

        batch_f1_score = sum(f1_score_list)/len(f1_score_list)

        return batch_f1_score
        

In [4]:
if __name__ == '__main__':
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    word_num_list = [10,20,30,40,50,100,300,500]
    for word_num in word_num_list:
        print('word num:',word_num)
        MyTrain(max_epoch=50,random_seed=1,word_num=word_num).my_train()

word num: 10
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.6856,mean_loss:1.24,train_f1:0.434,valid_f1:0.466: 100%|▉| 1249/1251 [00:56<00:00, 2
epoch:1,batch:1248,lr:0.0008,loss:0.8455,mean_loss:0.758,train_f1:0.678,valid_f1:0.674: 100%|▉| 1249/1251 [00:55<00:00,
epoch:2,batch:1248,lr:0.00064,loss:0.8682,mean_loss:0.653,train_f1:0.729,valid_f1:0.708: 100%|▉| 1249/1251 [00:55<00:00
epoch:3,batch:1248,lr:0.00051,loss:0.5282,mean_loss:0.594,train_f1:0.755,valid_f1:0.723: 100%|▉| 1249/1251 [00:55<00:00
epoch:4,batch:1248,lr:0.00041,loss:0.6104,mean_loss:0.554,train_f1:0.772,valid_f1:0.733: 100%|▉| 1249/1251 [00:56<00:00
epoch:5,batch:1248,lr:0.00033,loss:0.4169,mean_loss:0.525,train_f1:0.784,valid_f1:0.737: 100%|▉| 1249/1251 [00:56<00:00
epoch:6,batch:1248,lr:0.00026,loss:0.5275,mean_loss:0.503,train_f1:0.794,valid_f1:0.738: 100%|▉| 1249/1251 [00:57<00:00
epoch:7,batch:1248,lr:0.00021,loss:0.5297,mean_loss:0.486,train_f1:0.799,valid_f1:0.742: 100%|▉| 1249/1251 [00:55<00:00
epoch:8,batch:1248,lr:0.00017,loss:0.395

word num: 20
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.5959,mean_loss:1.061,train_f1:0.496,valid_f1:0.523: 100%|▉| 1249/1251 [00:56<00:00, 
epoch:1,batch:1248,lr:0.0008,loss:0.6406,mean_loss:0.547,train_f1:0.745,valid_f1:0.748: 100%|▉| 1249/1251 [00:56<00:00,
epoch:2,batch:1248,lr:0.00064,loss:0.4429,mean_loss:0.449,train_f1:0.798,valid_f1:0.786: 100%|▉| 1249/1251 [00:56<00:00
epoch:3,batch:1248,lr:0.00051,loss:0.309,mean_loss:0.395,train_f1:0.827,valid_f1:0.8: 100%|▉| 1249/1251 [00:56<00:00, 2
epoch:4,batch:1248,lr:0.00041,loss:0.4139,mean_loss:0.361,train_f1:0.845,valid_f1:0.81: 100%|▉| 1249/1251 [00:56<00:00,
epoch:5,batch:1248,lr:0.00033,loss:0.2833,mean_loss:0.335,train_f1:0.856,valid_f1:0.816: 100%|▉| 1249/1251 [00:57<00:00
epoch:6,batch:1248,lr:0.00026,loss:0.3046,mean_loss:0.316,train_f1:0.866,valid_f1:0.816: 100%|▉| 1249/1251 [00:56<00:00
epoch:7,batch:1248,lr:0.00021,loss:0.2363,mean_loss:0.301,train_f1:0.873,valid_f1:0.819: 100%|▉| 1249/1251 [00:56<00:00
epoch:8,batch:1248,lr:0.00017,loss:0.276

word num: 30
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.5128,mean_loss:0.916,train_f1:0.528,valid_f1:0.566: 100%|▉| 1249/1251 [01:00<00:00, 
epoch:1,batch:1248,lr:0.0008,loss:0.4261,mean_loss:0.437,train_f1:0.78,valid_f1:0.792: 100%|▉| 1249/1251 [01:00<00:00, 
epoch:2,batch:1248,lr:0.00064,loss:0.3267,mean_loss:0.346,train_f1:0.835,valid_f1:0.827: 100%|▉| 1249/1251 [00:59<00:00
epoch:3,batch:1248,lr:0.00051,loss:0.1965,mean_loss:0.3,train_f1:0.862,valid_f1:0.843: 100%|▉| 1249/1251 [01:00<00:00, 
epoch:4,batch:1248,lr:0.00041,loss:0.3427,mean_loss:0.27,train_f1:0.878,valid_f1:0.85: 100%|▉| 1249/1251 [01:00<00:00, 
epoch:5,batch:1248,lr:0.00033,loss:0.1555,mean_loss:0.247,train_f1:0.888,valid_f1:0.853: 100%|▉| 1249/1251 [01:00<00:00
epoch:6,batch:1248,lr:0.00026,loss:0.2605,mean_loss:0.23,train_f1:0.897,valid_f1:0.853: 100%|▉| 1249/1251 [01:00<00:00,
epoch:7,batch:1248,lr:0.00021,loss:0.143,mean_loss:0.218,train_f1:0.904,valid_f1:0.854: 100%|▉| 1249/1251 [01:00<00:00,
epoch:8,batch:1248,lr:0.00017,loss:0.172

word num: 40
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.4599,mean_loss:0.957,train_f1:0.513,valid_f1:0.555: 100%|▉| 1249/1251 [01:02<00:00, 
epoch:1,batch:1248,lr:0.0008,loss:0.3588,mean_loss:0.423,train_f1:0.779,valid_f1:0.797: 100%|▉| 1249/1251 [01:02<00:00,
epoch:2,batch:1248,lr:0.00064,loss:0.3769,mean_loss:0.34,train_f1:0.833,valid_f1:0.832: 100%|▉| 1249/1251 [01:03<00:00,
epoch:3,batch:1248,lr:0.00051,loss:0.2402,mean_loss:0.295,train_f1:0.86,valid_f1:0.844: 100%|▉| 1249/1251 [01:02<00:00,
epoch:4,batch:1248,lr:0.00041,loss:0.3819,mean_loss:0.266,train_f1:0.876,valid_f1:0.852: 100%|▉| 1249/1251 [01:02<00:00
epoch:5,batch:1248,lr:0.00033,loss:0.1234,mean_loss:0.244,train_f1:0.889,valid_f1:0.856: 100%|▉| 1249/1251 [01:02<00:00
epoch:6,batch:1248,lr:0.00026,loss:0.257,mean_loss:0.227,train_f1:0.899,valid_f1:0.856: 100%|▉| 1249/1251 [01:02<00:00,
epoch:7,batch:1248,lr:0.00021,loss:0.1587,mean_loss:0.215,train_f1:0.906,valid_f1:0.859: 100%|▉| 1249/1251 [01:03<00:00
epoch:8,batch:1248,lr:0.00017,loss:0.200

word num: 50
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.5032,mean_loss:0.988,train_f1:0.49,valid_f1:0.534: 100%|▉| 1249/1251 [01:06<00:00, 1
epoch:1,batch:1248,lr:0.0008,loss:0.2788,mean_loss:0.417,train_f1:0.777,valid_f1:0.798: 100%|▉| 1249/1251 [01:07<00:00,
epoch:2,batch:1248,lr:0.00064,loss:0.2759,mean_loss:0.328,train_f1:0.84,valid_f1:0.839: 100%|▉| 1249/1251 [01:06<00:00,
epoch:3,batch:1248,lr:0.00051,loss:0.1784,mean_loss:0.284,train_f1:0.867,valid_f1:0.853: 100%|▉| 1249/1251 [01:07<00:00
epoch:4,batch:1248,lr:0.00041,loss:0.3331,mean_loss:0.255,train_f1:0.885,valid_f1:0.862: 100%|▉| 1249/1251 [01:06<00:00
epoch:5,batch:1248,lr:0.00033,loss:0.1801,mean_loss:0.234,train_f1:0.894,valid_f1:0.866: 100%|▉| 1249/1251 [01:06<00:00
epoch:6,batch:1248,lr:0.00026,loss:0.2821,mean_loss:0.219,train_f1:0.904,valid_f1:0.866: 100%|▉| 1249/1251 [01:06<00:00
epoch:7,batch:1248,lr:0.00021,loss:0.1501,mean_loss:0.206,train_f1:0.91,valid_f1:0.869: 100%|▉| 1249/1251 [01:05<00:00,
epoch:8,batch:1248,lr:0.00017,loss:0.269

word num: 100
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.4708,mean_loss:1.019,train_f1:0.47,valid_f1:0.506: 100%|▉| 1249/1251 [00:54<00:00, 2
epoch:1,batch:1248,lr:0.0008,loss:0.2799,mean_loss:0.379,train_f1:0.802,valid_f1:0.819: 100%|▉| 1249/1251 [01:06<00:00,
epoch:2,batch:1248,lr:0.00064,loss:0.2497,mean_loss:0.296,train_f1:0.86,valid_f1:0.858: 100%|▉| 1249/1251 [01:07<00:00,
epoch:3,batch:1248,lr:0.00051,loss:0.2341,mean_loss:0.256,train_f1:0.884,valid_f1:0.869: 100%|▉| 1249/1251 [01:06<00:00
epoch:4,batch:1248,lr:0.00041,loss:0.2924,mean_loss:0.23,train_f1:0.898,valid_f1:0.877: 100%|▉| 1249/1251 [01:06<00:00,
epoch:5,batch:1248,lr:0.00033,loss:0.1201,mean_loss:0.211,train_f1:0.908,valid_f1:0.88: 100%|▉| 1249/1251 [01:07<00:00,
epoch:6,batch:1248,lr:0.00026,loss:0.2448,mean_loss:0.197,train_f1:0.915,valid_f1:0.88: 100%|▉| 1249/1251 [01:07<00:00,
epoch:7,batch:1248,lr:0.00021,loss:0.1354,mean_loss:0.186,train_f1:0.92,valid_f1:0.883: 100%|▉| 1249/1251 [01:06<00:00,
epoch:8,batch:1248,lr:0.00017,loss:0.239

word num: 300
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.4394,mean_loss:1.103,train_f1:0.432,valid_f1:0.476: 100%|▉| 1249/1251 [01:05<00:00, 
epoch:1,batch:1248,lr:0.0008,loss:0.3127,mean_loss:0.36,train_f1:0.816,valid_f1:0.831: 100%|▉| 1249/1251 [01:09<00:00, 
epoch:2,batch:1248,lr:0.00064,loss:0.2718,mean_loss:0.277,train_f1:0.864,valid_f1:0.869: 100%|▉| 1249/1251 [01:16<00:00
epoch:3,batch:1248,lr:0.00051,loss:0.2046,mean_loss:0.237,train_f1:0.889,valid_f1:0.881: 100%|▉| 1249/1251 [01:16<00:00
epoch:4,batch:1248,lr:0.00041,loss:0.3445,mean_loss:0.213,train_f1:0.904,valid_f1:0.889: 100%|▉| 1249/1251 [01:15<00:00
epoch:5,batch:1248,lr:0.00033,loss:0.1453,mean_loss:0.195,train_f1:0.913,valid_f1:0.893: 100%|▉| 1249/1251 [01:15<00:00
epoch:6,batch:1248,lr:0.00026,loss:0.2266,mean_loss:0.182,train_f1:0.921,valid_f1:0.893: 100%|▉| 1249/1251 [01:15<00:00
epoch:7,batch:1248,lr:0.00021,loss:0.1746,mean_loss:0.172,train_f1:0.927,valid_f1:0.896: 100%|▉| 1249/1251 [01:15<00:00
epoch:8,batch:1248,lr:0.00017,loss:0.149

word num: 500
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.4475,mean_loss:1.196,train_f1:0.386,valid_f1:0.43: 100%|▉| 1249/1251 [01:16<00:00, 1
epoch:1,batch:1248,lr:0.0008,loss:0.3271,mean_loss:0.376,train_f1:0.802,valid_f1:0.823: 100%|▉| 1249/1251 [01:17<00:00,
epoch:2,batch:1248,lr:0.00064,loss:0.2234,mean_loss:0.278,train_f1:0.866,valid_f1:0.873: 100%|▉| 1249/1251 [01:17<00:00
epoch:3,batch:1248,lr:0.00051,loss:0.1362,mean_loss:0.236,train_f1:0.892,valid_f1:0.886: 100%|▉| 1249/1251 [01:17<00:00
epoch:4,batch:1248,lr:0.00041,loss:0.2527,mean_loss:0.211,train_f1:0.907,valid_f1:0.895: 100%|▉| 1249/1251 [01:17<00:00
epoch:5,batch:1248,lr:0.00033,loss:0.1419,mean_loss:0.193,train_f1:0.916,valid_f1:0.898: 100%|▉| 1249/1251 [01:17<00:00
epoch:6,batch:1248,lr:0.00026,loss:0.2463,mean_loss:0.18,train_f1:0.925,valid_f1:0.899: 100%|▉| 1249/1251 [01:17<00:00,
epoch:7,batch:1248,lr:0.00021,loss:0.1936,mean_loss:0.17,train_f1:0.93,valid_f1:0.902: 100%|▉| 1249/1251 [01:17<00:00, 
epoch:8,batch:1248,lr:0.00017,loss:0.159

In [7]:
if __name__ == '__main__':
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    word_num_list = [600,750,900,1100,1300,1500,1800,2000]
    for word_num in word_num_list:
        print('word num:',word_num)
        MyTrain(max_epoch=15,random_seed=1,word_num=word_num).my_train()

word num: 600
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.4247,mean_loss:1.175,train_f1:0.391,valid_f1:0.435: 100%|▉| 1249/1251 [01:21<00:00, 
epoch:1,batch:1248,lr:0.0008,loss:0.3177,mean_loss:0.379,train_f1:0.792,valid_f1:0.819: 100%|▉| 1249/1251 [01:21<00:00,
epoch:2,batch:1248,lr:0.00064,loss:0.1959,mean_loss:0.279,train_f1:0.863,valid_f1:0.873: 100%|▉| 1249/1251 [01:20<00:00
epoch:3,batch:1248,lr:0.00051,loss:0.1696,mean_loss:0.236,train_f1:0.887,valid_f1:0.888: 100%|▉| 1249/1251 [01:21<00:00
epoch:4,batch:1248,lr:0.00041,loss:0.23,mean_loss:0.21,train_f1:0.905,valid_f1:0.896: 100%|▉| 1249/1251 [01:20<00:00, 1
epoch:5,batch:1248,lr:0.00033,loss:0.1209,mean_loss:0.192,train_f1:0.916,valid_f1:0.899: 100%|▉| 1249/1251 [01:20<00:00
epoch:6,batch:1248,lr:0.00026,loss:0.2119,mean_loss:0.178,train_f1:0.924,valid_f1:0.898: 100%|▉| 1249/1251 [01:20<00:00
epoch:7,batch:1248,lr:0.00021,loss:0.1694,mean_loss:0.168,train_f1:0.93,valid_f1:0.902: 100%|▉| 1249/1251 [01:20<00:00,
epoch:8,batch:1248,lr:0.00017,loss:0.167

word num: 750
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.3909,mean_loss:1.24,train_f1:0.371,valid_f1:0.412: 100%|▉| 1249/1251 [01:25<00:00, 1
epoch:1,batch:1248,lr:0.0008,loss:0.2937,mean_loss:0.388,train_f1:0.784,valid_f1:0.812: 100%|▉| 1249/1251 [01:25<00:00,
epoch:2,batch:1248,lr:0.00064,loss:0.2253,mean_loss:0.28,train_f1:0.861,valid_f1:0.872: 100%|▉| 1249/1251 [01:25<00:00,
epoch:3,batch:1248,lr:0.00051,loss:0.1515,mean_loss:0.235,train_f1:0.886,valid_f1:0.886: 100%|▉| 1249/1251 [01:23<00:00
epoch:4,batch:1248,lr:0.00041,loss:0.2829,mean_loss:0.208,train_f1:0.902,valid_f1:0.895: 100%|▉| 1249/1251 [01:23<00:00
epoch:5,batch:1248,lr:0.00033,loss:0.121,mean_loss:0.189,train_f1:0.915,valid_f1:0.899: 100%|▉| 1249/1251 [01:23<00:00,
epoch:6,batch:1248,lr:0.00026,loss:0.2304,mean_loss:0.175,train_f1:0.923,valid_f1:0.9: 100%|▉| 1249/1251 [01:23<00:00, 
epoch:7,batch:1248,lr:0.00021,loss:0.1875,mean_loss:0.164,train_f1:0.928,valid_f1:0.905: 100%|▉| 1249/1251 [01:23<00:00
epoch:8,batch:1248,lr:0.00017,loss:0.166

word num: 900
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.6478,mean_loss:1.36,train_f1:0.313,valid_f1:0.352: 100%|▉| 1249/1251 [01:30<00:00, 1
epoch:1,batch:1248,lr:0.0008,loss:0.3365,mean_loss:0.439,train_f1:0.74,valid_f1:0.781: 100%|▉| 1249/1251 [01:30<00:00, 
epoch:2,batch:1248,lr:0.00064,loss:0.3101,mean_loss:0.3,train_f1:0.845,valid_f1:0.86: 100%|▉| 1249/1251 [01:30<00:00, 1
epoch:3,batch:1248,lr:0.00051,loss:0.18,mean_loss:0.245,train_f1:0.884,valid_f1:0.884: 100%|▉| 1249/1251 [01:30<00:00, 
epoch:4,batch:1248,lr:0.00041,loss:0.2665,mean_loss:0.214,train_f1:0.901,valid_f1:0.895: 100%|▉| 1249/1251 [01:30<00:00
epoch:5,batch:1248,lr:0.00033,loss:0.157,mean_loss:0.193,train_f1:0.912,valid_f1:0.9: 100%|▉| 1249/1251 [01:31<00:00, 1
epoch:6,batch:1248,lr:0.00026,loss:0.1694,mean_loss:0.179,train_f1:0.921,valid_f1:0.9: 100%|▉| 1249/1251 [01:30<00:00, 
epoch:7,batch:1248,lr:0.00021,loss:0.1491,mean_loss:0.168,train_f1:0.928,valid_f1:0.903: 100%|▉| 1249/1251 [01:30<00:00
epoch:8,batch:1248,lr:0.00017,loss:0.182

word num: 1100
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.7379,mean_loss:1.395,train_f1:0.302,valid_f1:0.343: 100%|▉| 1249/1251 [01:41<00:00, 
epoch:1,batch:1248,lr:0.0008,loss:0.4003,mean_loss:0.46,train_f1:0.734,valid_f1:0.769: 100%|▉| 1249/1251 [01:40<00:00, 
epoch:2,batch:1248,lr:0.00064,loss:0.3051,mean_loss:0.31,train_f1:0.836,valid_f1:0.852: 100%|▉| 1249/1251 [01:41<00:00,
epoch:3,batch:1248,lr:0.00051,loss:0.1967,mean_loss:0.255,train_f1:0.877,valid_f1:0.874: 100%|▉| 1249/1251 [01:40<00:00
epoch:4,batch:1248,lr:0.00041,loss:0.2434,mean_loss:0.224,train_f1:0.899,valid_f1:0.89: 100%|▉| 1249/1251 [01:41<00:00,
epoch:5,batch:1248,lr:0.00033,loss:0.1136,mean_loss:0.202,train_f1:0.912,valid_f1:0.898: 100%|▉| 1249/1251 [01:41<00:00
epoch:6,batch:1248,lr:0.00026,loss:0.1932,mean_loss:0.187,train_f1:0.921,valid_f1:0.899: 100%|▉| 1249/1251 [01:44<00:00
epoch:7,batch:1248,lr:0.00021,loss:0.2131,mean_loss:0.176,train_f1:0.927,valid_f1:0.902: 100%|▉| 1249/1251 [01:44<00:00
epoch:8,batch:1248,lr:0.00017,loss:0.161

word num: 1300
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.8377,mean_loss:1.485,train_f1:0.245,valid_f1:0.284: 100%|▉| 1249/1251 [01:55<00:00, 
epoch:1,batch:1248,lr:0.0008,loss:0.466,mean_loss:0.546,train_f1:0.671,valid_f1:0.719: 100%|▉| 1249/1251 [01:54<00:00, 
epoch:2,batch:1248,lr:0.00064,loss:0.2931,mean_loss:0.349,train_f1:0.805,valid_f1:0.832: 100%|▉| 1249/1251 [01:54<00:00
epoch:3,batch:1248,lr:0.00051,loss:0.1915,mean_loss:0.273,train_f1:0.861,valid_f1:0.867: 100%|▉| 1249/1251 [01:55<00:00
epoch:4,batch:1248,lr:0.00041,loss:0.264,mean_loss:0.235,train_f1:0.888,valid_f1:0.885: 100%|▉| 1249/1251 [01:55<00:00,
epoch:5,batch:1248,lr:0.00033,loss:0.1971,mean_loss:0.21,train_f1:0.903,valid_f1:0.892: 100%|▉| 1249/1251 [01:55<00:00,
epoch:6,batch:1248,lr:0.00026,loss:0.2224,mean_loss:0.193,train_f1:0.916,valid_f1:0.896: 100%|▉| 1249/1251 [01:54<00:00
epoch:7,batch:1248,lr:0.00021,loss:0.182,mean_loss:0.182,train_f1:0.922,valid_f1:0.9: 100%|▉| 1249/1251 [01:55<00:00, 1
epoch:8,batch:1248,lr:0.00017,loss:0.167

word num: 1500
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.546,mean_loss:1.539,train_f1:0.267,valid_f1:0.299: 100%|▉| 1249/1251 [02:17<00:00,  
epoch:1,batch:1248,lr:0.0008,loss:0.3225,mean_loss:0.421,train_f1:0.772,valid_f1:0.8: 100%|▉| 1249/1251 [02:04<00:00, 1
epoch:2,batch:1248,lr:0.00064,loss:0.2327,mean_loss:0.294,train_f1:0.851,valid_f1:0.861: 100%|▉| 1249/1251 [02:04<00:00
epoch:3,batch:1248,lr:0.00051,loss:0.1732,mean_loss:0.245,train_f1:0.883,valid_f1:0.878: 100%|▉| 1249/1251 [02:04<00:00
epoch:4,batch:1248,lr:0.00041,loss:0.271,mean_loss:0.215,train_f1:0.9,valid_f1:0.89: 100%|▉| 1249/1251 [02:04<00:00, 10
epoch:5,batch:1248,lr:0.00033,loss:0.1477,mean_loss:0.195,train_f1:0.911,valid_f1:0.895: 100%|▉| 1249/1251 [02:05<00:00
epoch:6,batch:1248,lr:0.00026,loss:0.2166,mean_loss:0.181,train_f1:0.921,valid_f1:0.897: 100%|▉| 1249/1251 [02:04<00:00
epoch:7,batch:1248,lr:0.00021,loss:0.1615,mean_loss:0.17,train_f1:0.927,valid_f1:0.899: 100%|▉| 1249/1251 [02:04<00:00,
epoch:8,batch:1248,lr:0.00017,loss:0.132

word num: 1800
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.7737,mean_loss:1.716,train_f1:0.193,valid_f1:0.223: 100%|▉| 1249/1251 [02:19<00:00, 
epoch:1,batch:1248,lr:0.0008,loss:0.363,mean_loss:0.51,train_f1:0.701,valid_f1:0.736: 100%|▉| 1249/1251 [02:19<00:00,  
epoch:2,batch:1248,lr:0.00064,loss:0.3439,mean_loss:0.311,train_f1:0.851,valid_f1:0.858: 100%|▉| 1249/1251 [02:20<00:00
epoch:3,batch:1248,lr:0.00051,loss:0.2392,mean_loss:0.249,train_f1:0.887,valid_f1:0.882: 100%|▉| 1249/1251 [02:19<00:00
epoch:4,batch:1248,lr:0.00041,loss:0.2992,mean_loss:0.217,train_f1:0.907,valid_f1:0.895: 100%|▉| 1249/1251 [02:19<00:00
epoch:5,batch:1248,lr:0.00033,loss:0.1671,mean_loss:0.196,train_f1:0.917,valid_f1:0.901: 100%|▉| 1249/1251 [02:19<00:00
epoch:6,batch:1248,lr:0.00026,loss:0.2245,mean_loss:0.181,train_f1:0.927,valid_f1:0.902: 100%|▉| 1249/1251 [02:19<00:00
epoch:7,batch:1248,lr:0.00021,loss:0.2048,mean_loss:0.17,train_f1:0.933,valid_f1:0.906: 100%|▉| 1249/1251 [02:20<00:00,
epoch:8,batch:1248,lr:0.00017,loss:0.144

word num: 2000
random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:0.8041,mean_loss:1.791,train_f1:0.161,valid_f1:0.19: 100%|▉| 1249/1251 [02:35<00:00,  
epoch:1,batch:1248,lr:0.0008,loss:0.4571,mean_loss:0.582,train_f1:0.633,valid_f1:0.674: 100%|▉| 1249/1251 [02:32<00:00,
epoch:2,batch:1248,lr:0.00064,loss:0.3514,mean_loss:0.347,train_f1:0.826,valid_f1:0.84: 100%|▉| 1249/1251 [02:32<00:00,
epoch:3,batch:1248,lr:0.00051,loss:0.2151,mean_loss:0.269,train_f1:0.875,valid_f1:0.873: 100%|▉| 1249/1251 [02:33<00:00
epoch:4,batch:1248,lr:0.00041,loss:0.333,mean_loss:0.231,train_f1:0.897,valid_f1:0.887: 100%|▉| 1249/1251 [02:33<00:00,
epoch:5,batch:1248,lr:0.00033,loss:0.1418,mean_loss:0.206,train_f1:0.911,valid_f1:0.894: 100%|▉| 1249/1251 [02:32<00:00
epoch:6,batch:1248,lr:0.00026,loss:0.1934,mean_loss:0.19,train_f1:0.921,valid_f1:0.897: 100%|▉| 1249/1251 [02:37<00:00,
epoch:7,batch:1248,lr:0.00021,loss:0.1734,mean_loss:0.178,train_f1:0.928,valid_f1:0.9: 100%|▉| 1249/1251 [02:31<00:00, 
epoch:8,batch:1248,lr:0.00017,loss:0.154

上面可以看到，如果取前2000个字为最大范围的话，取到前1000个字的时候，验证集的f1成绩就已经到极限了，保持在0.91基本上不会再高了，可以理解为数据集里有一些比较长的文章，只看前2000个字不能看出到底是哪个分类，还需要充分利用后面的内容，下面采用另外一种思路来测试，每个text统一处理成2000个字，长度小于2000的text直接取用，长度大于2000的字符串，按照前500后500中间1000，总共2000来抽取，其中中间1000个字按照10*100的形式来抽取。  

从下面的结果来看，没有什么作用

In [10]:
import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

def split_train_valid(csv_path,train_prec=0.75):
    csv_data = pd.read_csv(csv_path,sep='\t') 
    text_data = csv_data.text
    label_data = csv_data.label
    
    mask = np.random.rand(len(text_data))<train_prec
    train_text_data = text_data[mask].reset_index(drop=True)
    valid_text_data = text_data[~mask].reset_index(drop=True)
    train_label_data = label_data[mask].reset_index(drop=True)
    valid_label_data = label_data[~mask].reset_index(drop=True)
    
    return train_text_data,valid_text_data,train_label_data,valid_label_data

class MyDataset(Dataset):
    def __init__(self,text_data,label_data,word_num):
        self.text_data = text_data
        self.label_data = label_data
        self.word_num = word_num
        
    def __getitem__(self,index):
        #所有text内的token索引增加1，0空出来代表空格，将每个text控制在一定长度内
        text_str = self.text_data[index]
        text_list = [int(x)+1 for x in text_str.split()]
        out_text_list = []
        if len(text_list)>=2000:
            start_text_list = text_list[:500]
            out_text_list.extend(start_text_list)
            for i in range(10):
                index_gap = (len(text_list)-1000)//10
                middle_text_list = text_list[500+(i*index_gap):500+(i*index_gap)+100]
                out_text_list.extend(middle_text_list)
            end_text_list = text_list[-500:]
            out_text_list.extend(end_text_list)
        else:
            out_text_list = [x for x in text_list]
            out_text_list.extend([0]*(2000-len(text_list)))
        text_array = np.array(out_text_list)
        label_array = np.array(self.label_data[index])
        return text_array,label_array
        
    def __len__(self):
        return len(self.text_data)

class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(7551,100)
        self.rnn = nn.GRU(100,50,batch_first=True)
        self.fc = nn.Linear(50,14)
        
    def forward(self,X):
#         print('X:',X.shape)
        X = self.embedding(X)
#         print('embedding:',X.shape)
        _,X = self.rnn(X)
#         print('rnn:',X.shape)
        X = X.squeeze(dim=0)
#         print('squeeze:',X.shape)
        y_hat = self.fc(X)
#         print('y_hat:',y_hat.shape)
        return y_hat

    
class MyTrain():
    def __init__(self,max_epoch=1,random_seed=1,lr=0.001,out_dir='./',word_num= 1000):
        self.max_epoch = max_epoch
        self.random_seed = random_seed
        self.lr = lr
        self.out_dir = out_dir
        self.iter = 0
        self.word_num = word_num
        
    def fix_random(self):
        import random
        import numpy as np
        import torch
        random.seed(self.random_seed)
        np.random.seed(self.random_seed)
        torch.random.manual_seed(self.random_seed)
        torch.cuda.random.manual_seed_all(self.random_seed)
        torch.backends.cudnn.deterministic = True
        print(f'random seed:{self.random_seed}')
        
    def my_train(self):
        
        max_epoch,lr = self.max_epoch,self.lr
        
        if self.random_seed is not None:
            self.fix_random()
        
        train_text_data,valid_text_data,train_label_data,valid_label_data = split_train_valid('./train_set.csv',train_prec=0.75)
        train_dataset = MyDataset(train_text_data,train_label_data,self.word_num)
        valid_dataset = MyDataset(valid_text_data,valid_label_data,self.word_num)
        
        my_model = MyModel()
        my_optim = torch.optim.Adam(my_model.parameters(),lr=lr)
        my_loss = nn.CrossEntropyLoss()
        
        if torch.cuda.is_available():
            my_model.cuda()
            my_loss.cuda()
        print(f'train device:{next(iter(my_model.parameters())).device}')  #显示训练设备
        
        best_f1_score = 0
        epoch_index = 0

        for epoch_index in range(max_epoch):
            
            loss_list = []
            train_f1_score_list = []
            valid_f1_score_list = []
            train_dataloader = DataLoader(train_dataset,batch_size=120,shuffle=True)
            valid_dataloader = DataLoader(valid_dataset,batch_size=40,shuffle=True)
            my_dataloader = tqdm(train_dataloader)
            
            batch_index = 0
                        
            for train_data,valid_data in zip(my_dataloader,valid_dataloader):
                                
                my_model.train()  #将模型设置为训练模式
                train_text,train_label = train_data
                valid_text,valid_label = valid_data
                
                if torch.cuda.is_available():
                    train_text = train_text.cuda()
                    train_label = train_label.cuda()
                    valid_text = valid_text.cuda()
                    valid_label = valid_label.cuda()
                
                train_y_hat = my_model(train_text)
                batch_train_loss = my_loss(train_y_hat,train_label)
                
                my_optim.zero_grad()
                batch_train_loss.backward()
                my_optim.step()
                my_optim.param_groups[0]['lr'] = lr*(0.8**(epoch_index%10))
        
                my_model.eval()  #将模型设置为验证模式
                with torch.no_grad():
                    valid_y_hat = my_model(valid_text)
                    batch_valid_f1_score = self.f1_score(valid_y_hat.data,valid_label.data)
                    valid_f1_score_list.append(batch_valid_f1_score)
                    mean_valid_f1 = round(sum(valid_f1_score_list)/len(valid_f1_score_list),3)

                    #显示batch结果
                    batch_lr = round(my_optim.param_groups[0]['lr'],5)
                    batch_loss = round(batch_train_loss.item(),4)
                    loss_list.append(batch_loss)
                    mean_loss = round((sum(loss_list)/len(loss_list)),3)

                    batch_train_f1_score = self.f1_score(train_y_hat.data,train_label.data)
                    train_f1_score_list.append(batch_train_f1_score)
                    mean_train_f1 = round(sum(train_f1_score_list)/len(train_f1_score_list),3)

                    my_dataloader.set_description(f'epoch:{epoch_index},batch:{batch_index},lr:{batch_lr},loss:{batch_loss},mean_loss:{mean_loss},train_f1:{mean_train_f1},valid_f1:{mean_valid_f1}')
                    
                    batch_index+=1
            #存储模型
            if mean_valid_f1>0.96:
                torch.save(my_model.state_dict(),os.path.join(self.out_dir,f'embedding_gru_best_{self.word_num}word_valid_f1_score_{mean_valid_f1}'))

            
                
    def f1_score(self,y_hat,label,eps=1e-8):
        #y_hat(N,C),label(1)
        y_hat = y_hat.cpu()
        label = label.cpu()
        preds_list = list(torch.argmax(y_hat,dim=1).numpy())
        label_list = list(label.numpy())
#         print(f'preds:{preds_list},label:{label_list}')
        class_index_list = []
        for class_index in label_list:
            if class_index not in class_index_list:
                class_index_list.append(class_index)

        f1_score_list = []
        for index in class_index_list:
            if index not in preds_list:
                sub_f1_score = 0
            else:
                tp = 0
                fp = 0
                fn = 0
                for i in range(len(preds_list)):
                    if preds_list[i] == index and label_list[i] == index:
                        tp+=1
                    if preds_list[i] == index and label_list[i] != index: 
                        fp+=1
                    if preds_list[i] != index and label_list[i] == index:
                        fn+=1
                prec_val = tp/(tp+fp) 
                recall_val = tp/(tp+fn)
                sub_f1_score = 2*(prec_val*recall_val)/(prec_val+recall_val+eps)
            f1_score_list.append(sub_f1_score)

        batch_f1_score = sum(f1_score_list)/len(f1_score_list)

        return batch_f1_score
        

In [11]:
if __name__ == '__main__':
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        MyTrain(max_epoch=30,random_seed=1).my_train()

random seed:1
train device:cuda:0


epoch:0,batch:1248,lr:0.001,loss:1.0405,mean_loss:1.875,train_f1:0.157,valid_f1:0.177: 100%|▉| 1249/1251 [02:38<00:00, 
epoch:1,batch:1248,lr:0.0008,loss:0.6333,mean_loss:0.726,train_f1:0.555,valid_f1:0.599: 100%|▉| 1249/1251 [02:39<00:00,
epoch:2,batch:1248,lr:0.00064,loss:0.5164,mean_loss:0.482,train_f1:0.734,valid_f1:0.754: 100%|▉| 1249/1251 [02:40<00:00
epoch:3,batch:1248,lr:0.00051,loss:0.3069,mean_loss:0.369,train_f1:0.819,valid_f1:0.828: 100%|▉| 1249/1251 [02:39<00:00
epoch:4,batch:1248,lr:0.00041,loss:0.4163,mean_loss:0.313,train_f1:0.845,valid_f1:0.848: 100%|▉| 1249/1251 [02:39<00:00
epoch:5,batch:1248,lr:0.00033,loss:0.1698,mean_loss:0.278,train_f1:0.861,valid_f1:0.861: 100%|▉| 1249/1251 [02:39<00:00
epoch:6,batch:1248,lr:0.00026,loss:0.2978,mean_loss:0.255,train_f1:0.878,valid_f1:0.869: 100%|▉| 1249/1251 [02:38<00:00
epoch:7,batch:1248,lr:0.00021,loss:0.2523,mean_loss:0.238,train_f1:0.889,valid_f1:0.874: 100%|▉| 1249/1251 [02:38<00:00
epoch:8,batch:1248,lr:0.00017,loss:0.179

尝试使用双向GRU,看对结果是否会有提升

In [4]:
import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

def split_train_valid(csv_path,train_prec=0.75):
    csv_data = pd.read_csv(csv_path,sep='\t') 
    text_data = csv_data.text
    label_data = csv_data.label
    
    mask = np.random.rand(len(text_data))<train_prec
    train_text_data = text_data[mask].reset_index(drop=True)
    valid_text_data = text_data[~mask].reset_index(drop=True)
    train_label_data = label_data[mask].reset_index(drop=True)
    valid_label_data = label_data[~mask].reset_index(drop=True)
    
    return train_text_data,valid_text_data,train_label_data,valid_label_data

class MyDataset(Dataset):
    def __init__(self,text_data,label_data,word_num):
        self.text_data = text_data
        self.label_data = label_data
        self.word_num = word_num
        
    def __getitem__(self,index):
        #所有text内的token索引增加1，0空出来代表空格，将每个text控制在一定长度内
        text_str = self.text_data[index]
        text_list = [int(x)+1 for x in text_str.split()]
        if len(text_list)>=self.word_num:
            text_list = text_list[:self.word_num]
        else:
            text_list.extend([0]*(self.word_num-len(text_list)))
        text_array = np.array(text_list)
        label_array = np.array(self.label_data[index])
        return text_array,label_array
        
    def __len__(self):
        return len(self.text_data)

class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(7551,101)
        self.rnn = nn.GRU(101,50,batch_first=True,bidirectional=True)
        self.fc = nn.Linear(100,14)
        
    def forward(self,X):
#         print('X:',X.shape)
        X = self.embedding(X)
#         print('embedding:',X.shape)
        _,X = self.rnn(X)
#         print('rnn:',X.shape)
        X = X.permute(1,0,2)
        X = X.reshape((X.shape[0],-1))
#         print('trans:',X.shape)
        y_hat = self.fc(X)
#         print('y_hat:',y_hat.shape)
        return y_hat

    
class MyTrain():
    def __init__(self,max_epoch=1,random_seed=1,lr=0.001,out_dir='./',word_num= 2674):
        self.max_epoch = max_epoch
        self.random_seed = random_seed
        self.lr = lr
        self.out_dir = out_dir
        self.iter = 0
        self.word_num = word_num
        
    def fix_random(self):
        import random
        import numpy as np
        import torch
        random.seed(self.random_seed)
        np.random.seed(self.random_seed)
        torch.random.manual_seed(self.random_seed)
        torch.cuda.random.manual_seed_all(self.random_seed)
        torch.backends.cudnn.deterministic = True
        print(f'random seed:{self.random_seed}')
        
    def my_train(self):
        
        max_epoch,lr = self.max_epoch,self.lr
        
        if self.random_seed is not None:
            self.fix_random()
        
        train_text_data,valid_text_data,train_label_data,valid_label_data = split_train_valid('./train_set.csv',train_prec=0.75)
        train_dataset = MyDataset(train_text_data,train_label_data,self.word_num)
        valid_dataset = MyDataset(valid_text_data,valid_label_data,self.word_num)
        
        my_model = MyModel()
        my_optim = torch.optim.Adam(my_model.parameters(),lr=lr)
        my_loss = nn.CrossEntropyLoss()
        
        if torch.cuda.is_available():
            my_model.cuda()
            my_loss.cuda()
        print(f'train device:{next(iter(my_model.parameters())).device}')  #显示训练设备
        
        best_f1_score = 0
        epoch_index = 0

        for epoch_index in range(max_epoch):
            
            loss_list = []
            train_f1_score_list = []
            valid_f1_score_list = []
            train_dataloader = DataLoader(train_dataset,batch_size=60,shuffle=True)
            valid_dataloader = DataLoader(valid_dataset,batch_size=20,shuffle=True)
            my_dataloader = tqdm(train_dataloader)
            
            batch_index = 0
                        
            for train_data,valid_data in zip(my_dataloader,valid_dataloader):
                                
                my_model.train()  #将模型设置为训练模式
                train_text,train_label = train_data
                valid_text,valid_label = valid_data
                
                if torch.cuda.is_available():
                    train_text = train_text.cuda()
                    train_label = train_label.cuda()
                    valid_text = valid_text.cuda()
                    valid_label = valid_label.cuda()
                
                train_y_hat = my_model(train_text)
                batch_train_loss = my_loss(train_y_hat,train_label)
                
                my_optim.zero_grad()
                batch_train_loss.backward()
                my_optim.step()
                my_optim.param_groups[0]['lr'] = lr*(0.8**(epoch_index%10))
        
                my_model.eval()  #将模型设置为验证模式
                with torch.no_grad():
                    valid_y_hat = my_model(valid_text)
                    batch_valid_f1_score = self.f1_score(valid_y_hat.data,valid_label.data)
                    valid_f1_score_list.append(batch_valid_f1_score)
                    mean_valid_f1 = round(sum(valid_f1_score_list)/len(valid_f1_score_list),3)

                    #显示batch结果
                    batch_lr = round(my_optim.param_groups[0]['lr'],5)
                    batch_loss = round(batch_train_loss.item(),4)
                    loss_list.append(batch_loss)
                    mean_loss = round((sum(loss_list)/len(loss_list)),3)

                    batch_train_f1_score = self.f1_score(train_y_hat.data,train_label.data)
                    train_f1_score_list.append(batch_train_f1_score)
                    mean_train_f1 = round(sum(train_f1_score_list)/len(train_f1_score_list),3)

                    my_dataloader.set_description(f'epoch:{epoch_index},batch:{batch_index},lr:{batch_lr},loss:{batch_loss},mean_loss:{mean_loss},train_f1:{mean_train_f1},valid_f1:{mean_valid_f1}')

                    #存储模型
#                     if batch_valid_f1_score>0.96:
#                         torch.save(my_model.state_dict(),os.path.join(self.out_dir,f'embedding_gru_best_{self.word_num}word_valid_f1_score_{round(batch_valid_f1_score,4)}'))
#                         best_f1_score = batch_train_f1_score

                    batch_index+=1
            torch.save(my_model.state_dict(),os.path.join(self.out_dir,f'embedding_gru_best_{self.word_num}word_valid_f1_score_{round(batch_valid_f1_score,4)}'))

            
                
    def f1_score(self,y_hat,label,eps=1e-8):
        #y_hat(N,C),label(1)
        y_hat = y_hat.cpu()
        label = label.cpu()
        preds_list = list(torch.argmax(y_hat,dim=1).numpy())
        label_list = list(label.numpy())
#         print(f'preds:{preds_list},label:{label_list}')
        class_index_list = []
        for class_index in label_list:
            if class_index not in class_index_list:
                class_index_list.append(class_index)

        f1_score_list = []
        for index in class_index_list:
            if index not in preds_list:
                sub_f1_score = 0
            else:
                tp = 0
                fp = 0
                fn = 0
                for i in range(len(preds_list)):
                    if preds_list[i] == index and label_list[i] == index:
                        tp+=1
                    if preds_list[i] == index and label_list[i] != index: 
                        fp+=1
                    if preds_list[i] != index and label_list[i] == index:
                        fn+=1
                prec_val = tp/(tp+fp) 
                recall_val = tp/(tp+fn)
                sub_f1_score = 2*(prec_val*recall_val)/(prec_val+recall_val+eps)
            f1_score_list.append(sub_f1_score)

        batch_f1_score = sum(f1_score_list)/len(f1_score_list)

        return batch_f1_score
        

In [5]:
   if __name__ == '__main__':
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        MyTrain(max_epoch=10,random_seed=1,word_num=2674).my_train()

random seed:1
train device:cuda:0


epoch:0,batch:2496,lr:0.001,loss:0.2769,mean_loss:0.729,train_f1:0.635,valid_f1:0.685: 100%|▉| 2497/2502 [04:43<00:00, 
epoch:1,batch:2496,lr:0.0008,loss:0.2035,mean_loss:0.249,train_f1:0.889,valid_f1:0.898: 100%|▉| 2497/2502 [04:43<00:00,
epoch:2,batch:2496,lr:0.00064,loss:0.3138,mean_loss:0.184,train_f1:0.922,valid_f1:0.918: 100%|▉| 2497/2502 [04:43<00:00
epoch:3,batch:2496,lr:0.00051,loss:0.065,mean_loss:0.149,train_f1:0.937,valid_f1:0.922: 100%|▉| 2497/2502 [04:43<00:00,
epoch:4,batch:2496,lr:0.00041,loss:0.0498,mean_loss:0.126,train_f1:0.947,valid_f1:0.927: 100%|▉| 2497/2502 [04:43<00:00
epoch:5,batch:2496,lr:0.00033,loss:0.0228,mean_loss:0.108,train_f1:0.956,valid_f1:0.929: 100%|▉| 2497/2502 [04:37<00:00
epoch:6,batch:2496,lr:0.00026,loss:0.1217,mean_loss:0.095,train_f1:0.963,valid_f1:0.928: 100%|▉| 2497/2502 [04:35<00:00
epoch:7,batch:2496,lr:0.00021,loss:0.0343,mean_loss:0.084,train_f1:0.967,valid_f1:0.928: 100%|▉| 2497/2502 [04:34<00:00
epoch:8,batch:2496,lr:0.00017,loss:0.103