# 模型验证

## 划分训练集和验证集

In [9]:
import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

def split_train_valid(csv_path,train_prec=0.75):
    """Randomly split a tab-separated dataset into training and validation parts.

    The file is read with ``sep='\\t'`` and must expose ``text`` and ``label``
    columns. Each row is assigned independently: it goes to the training part
    when a uniform draw from ``np.random.rand`` is below ``train_prec``, and to
    the validation part otherwise, so the split sizes are only approximately
    ``train_prec`` / ``1 - train_prec``.

    Args:
        csv_path: Path of the tab-separated file to read.
        train_prec: Probability that a row lands in the training part.

    Returns:
        A 4-tuple ``(train_text, valid_text, train_label, valid_label)`` of
        pandas Series, each re-indexed from 0.
    """
    frame = pd.read_csv(csv_path,sep='\t')
    texts,labels = frame.text,frame.label

    # One uniform draw per row decides which side of the split it belongs to.
    in_train = np.random.rand(len(texts))<train_prec
    in_valid = ~in_train

    def _select(series,selector):
        # Drop the original row labels so positional indexing starts at 0.
        return series[selector].reset_index(drop=True)

    return (
        _select(texts,in_train),
        _select(texts,in_valid),
        _select(labels,in_train),
        _select(labels,in_valid),
    )

class MyDataset(Dataset):
    """Dataset turning whitespace-separated token-id strings into fixed-length arrays.

    Every token id is shifted up by one so that id 0 is free to serve as the
    padding value. Each sample is truncated or right-padded with 0 to exactly
    ``max_len`` entries.

    Args:
        text_data: Indexable collection of strings; each string holds integer
            token ids separated by whitespace.
        label_data: Indexable collection of labels aligned with ``text_data``.
        max_len: Fixed length of every returned token array (default 1000).
    """

    def __init__(self,text_data,label_data,max_len=1000):
        self.text_data = text_data
        self.label_data = label_data
        self.max_len = max_len

    def __getitem__(self,index):
        """Return ``(text_array, label_array)`` for the sample at ``index``.

        ``text_array`` is an int64 array of length ``max_len``; ``label_array``
        is ``np.array`` applied to the stored label.
        """
        max_len = self.max_len
        # Truncate before converting so tokens past max_len are never parsed.
        tokens = self.text_data[index].split()[:max_len]
        # Shift ids by one: 0 is reserved for padding.
        text_list = [int(tok)+1 for tok in tokens]
        text_list.extend([0]*(max_len-len(text_list)))
        # Explicit dtype: an empty list would otherwise become float64, and the
        # platform default integer is not guaranteed to be 64-bit.
        text_array = np.array(text_list,dtype=np.int64)
        label_array = np.array(self.label_data[index])
        return text_array,label_array

    def __len__(self):
        """Return the number of samples."""
        return len(self.text_data)

class MyModel(nn.Module):
    """Embedding -> bidirectional GRU -> linear classifier.

    Token ids are embedded into 101-dimensional vectors (vocabulary of 7551
    entries), fed through a single-layer bidirectional GRU with 50 hidden units
    per direction, and the two final hidden states are concatenated into a
    100-dimensional vector that a linear layer maps to 14 class scores.
    """

    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=7551,embedding_dim=101)
        self.rnn = nn.GRU(input_size=101,hidden_size=50,batch_first=True,bidirectional=True)
        self.fc = nn.Linear(in_features=100,out_features=14)

    def forward(self,X):
        """Compute class scores for a batch of token-id sequences.

        Args:
            X: Integer tensor of token ids; the GRU is batch-first, so the
                batch dimension is expected to come first.

        Returns:
            Tensor with one row of 14 unnormalised class scores per sample.
        """
        embedded = self.embedding(X)
        # Only the final hidden state is used; the per-step outputs are dropped.
        final_hidden = self.rnn(embedded)[1]
        # (directions, batch, hidden) -> (batch, directions, hidden)
        batch_major = final_hidden.transpose(0,1)
        # Concatenate both directions into one feature vector per sample.
        features = batch_major.reshape(batch_major.shape[0],-1)
        return self.fc(features)

    
class MyTrain():
    """Training driver for ``MyModel`` on ``./train_set.csv``.

    Args:
        max_epoch: Number of passes over the training loader.
        random_seed: Seed applied to ``random``, NumPy and PyTorch before
            training; pass ``None`` to skip seeding.
        lr: Base learning rate for Adam. The effective rate in epoch ``e`` is
            ``lr * 0.8 ** (e % 10)``.
        out_dir: Directory where the best checkpoint ``embedding_gru_best`` is
            written.
    """

    def __init__(self,max_epoch=1,random_seed=1,lr=0.001,out_dir='./'):
        self.max_epoch = max_epoch
        self.random_seed = random_seed
        self.lr = lr
        self.out_dir = out_dir
        self.iter = 0

    def fix_random(self):
        """Seed every random number generator used during training."""
        import random
        random.seed(self.random_seed)
        np.random.seed(self.random_seed)
        torch.random.manual_seed(self.random_seed)
        torch.cuda.random.manual_seed_all(self.random_seed)
        torch.backends.cudnn.deterministic = True
        # cuDNN autotuning may pick different kernels from run to run, so it is
        # switched off together with enabling deterministic kernels.
        torch.backends.cudnn.benchmark = False
        print(f'random seed:{self.random_seed}')

    def my_train(self):
        """Train the model, reporting running loss/F1 and saving the best weights.

        The data from ``./train_set.csv`` is split 75/25 into training and
        validation parts. Training batches (size 120) and validation batches
        (size 40) are consumed in lockstep, so an epoch ends as soon as either
        loader is exhausted. After every optimisation step one validation batch
        is scored and the progress bar is updated with running means.
        """
        max_epoch,lr = self.max_epoch,self.lr

        if self.random_seed is not None:
            self.fix_random()

        train_text_data,valid_text_data,train_label_data,valid_label_data = split_train_valid('./train_set.csv',train_prec=0.75)
        train_dataset = MyDataset(train_text_data,train_label_data)
        valid_dataset = MyDataset(valid_text_data,valid_label_data)

        my_model = MyModel()
        my_optim = torch.optim.Adam(my_model.parameters(),lr=lr)
        my_loss = nn.CrossEntropyLoss()

        use_cuda = torch.cuda.is_available()
        if use_cuda:
            my_model.cuda()
            my_loss.cuda()
        print(f'train device:{next(iter(my_model.parameters())).device}')  # show the training device

        # Make sure the checkpoint directory exists before the first save.
        if self.out_dir:
            os.makedirs(self.out_dir,exist_ok=True)
        best_model_path = os.path.join(self.out_dir,'embedding_gru_best')
        best_f1_score = 0

        for epoch_index in range(max_epoch):

            # Apply this epoch's learning rate before the first step, so every
            # batch of the epoch trains with the rate shown in the progress bar.
            my_optim.param_groups[0]['lr'] = lr*(0.8**(epoch_index%10))

            loss_list = []
            train_f1_score_list = []
            valid_f1_score_list = []
            train_dataloader = DataLoader(train_dataset,batch_size=120,shuffle=True)
            valid_dataloader = DataLoader(valid_dataset,batch_size=40,shuffle=True)
            my_dataloader = tqdm(train_dataloader)

            # zip stops at the shorter loader; leftover batches are skipped.
            for batch_index,(train_data,valid_data) in enumerate(zip(my_dataloader,valid_dataloader)):

                my_model.train()  # switch to training mode
                train_text,train_label = train_data
                valid_text,valid_label = valid_data

                if use_cuda:
                    train_text = train_text.cuda()
                    train_label = train_label.cuda()
                    valid_text = valid_text.cuda()
                    valid_label = valid_label.cuda()

                train_y_hat = my_model(train_text)
                batch_train_loss = my_loss(train_y_hat,train_label)

                my_optim.zero_grad()
                batch_train_loss.backward()
                my_optim.step()

                my_model.eval()  # switch to evaluation mode
                with torch.no_grad():
                    valid_y_hat = my_model(valid_text)
                    batch_valid_f1_score = self.f1_score(valid_y_hat.detach(),valid_label.detach())
                    valid_f1_score_list.append(batch_valid_f1_score)
                    mean_valid_f1 = round(sum(valid_f1_score_list)/len(valid_f1_score_list),3)

                    # per-batch figures for the progress bar
                    batch_lr = round(my_optim.param_groups[0]['lr'],5)
                    batch_loss = round(batch_train_loss.item(),4)
                    loss_list.append(batch_loss)
                    mean_loss = round((sum(loss_list)/len(loss_list)),3)

                    # Scored on the logits computed before this step's update.
                    batch_train_f1_score = self.f1_score(train_y_hat.detach(),train_label.detach())
                    train_f1_score_list.append(batch_train_f1_score)
                    mean_train_f1 = round(sum(train_f1_score_list)/len(train_f1_score_list),3)

                    my_dataloader.set_description(f'epoch:{epoch_index},batch:{batch_index},lr:{batch_lr},loss:{batch_loss},mean_loss:{mean_loss},train_f1:{mean_train_f1},valid_f1:{mean_valid_f1}')

                    # Save the weights whenever the batch score improves.
                    # NOTE(review): the checkpoint is selected on the F1 of a
                    # single *training* batch, not on validation F1 - confirm
                    # this is the intended selection criterion.
                    if batch_train_f1_score>best_f1_score:
                        torch.save(my_model.state_dict(),best_model_path)
                        best_f1_score = batch_train_f1_score

    def f1_score(self,y_hat,label,eps=1e-8):
        """Return the macro-averaged F1 over the classes present in ``label``.

        Args:
            y_hat: Tensor of per-class scores with samples along dim 0 and
                classes along dim 1; the prediction is the argmax over dim 1.
            label: 1-D tensor of true class indices, one per sample.
            eps: Small constant added to the F1 denominator so that a class
                with zero precision and recall scores 0 instead of dividing
                by zero.

        Returns:
            The mean of the per-class F1 scores. A class that occurs in
            ``label`` but is never predicted contributes 0. Classes that are
            predicted but absent from ``label`` are not averaged in. Returns
            0.0 for an empty batch.
        """
        preds = torch.argmax(y_hat.cpu(),dim=1)
        labels = label.cpu()

        # Distinct true classes, in order of first appearance.
        class_index_list = list(dict.fromkeys(labels.tolist()))
        if not class_index_list:
            return 0.0

        f1_score_list = []
        for class_index in class_index_list:
            pred_mask = preds == class_index
            label_mask = labels == class_index
            pred_count = int(pred_mask.sum())        # tp + fp
            if pred_count == 0:
                f1_score_list.append(0)
                continue
            tp = int((pred_mask & label_mask).sum())
            prec_val = tp/pred_count
            # label_mask is never empty: class_index was taken from labels.
            recall_val = tp/int(label_mask.sum())    # tp / (tp + fn)
            f1_score_list.append(2*(prec_val*recall_val)/(prec_val+recall_val+eps))

        return sum(f1_score_list)/len(f1_score_list)
        

## 预测

In [10]:
def predict(model_path,test_data_path,model='train',batch_size=1):
    """Load saved ``MyModel`` weights and predict a class for every row of a file.

    Args:
        model_path: Path of a ``state_dict`` saved with ``torch.save``.
        test_data_path: Tab-separated file with a ``text`` column, plus a
            ``label`` column when ``model == 'train'``.
        model: ``'train'`` when the file carries labels (the macro F1 over the
            whole file is then printed), ``'test'`` when it does not.
        batch_size: Number of rows scored per forward pass.

    Returns:
        List with the predicted class index of every row, in file order.

    Raises:
        ValueError: If ``model`` is neither ``'train'`` nor ``'test'``.
    """
    if model not in ('train','test'):
        raise ValueError(f"model must be 'train' or 'test', got {model!r}")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    p_model = MyModel()
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
    best_stat_dict = torch.load(model_path,map_location=device)
    p_model.load_state_dict(best_stat_dict)
    p_model.to(device)
    p_model.eval()

    test_data = pd.read_csv(test_data_path,sep='\t')
    test_text = test_data.text
    if model == 'test':
        # Unlabelled data: dummy labels so the Dataset/DataLoader still work.
        test_label = np.ones(len(test_text),dtype=np.int64)
    else:
        test_label = test_data.label
    test_dataset = MyDataset(test_text,test_label)
    test_dataloader = tqdm(DataLoader(test_dataset,batch_size=batch_size))

    pred_list = []
    logits_list = []
    label_list = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for batch_text,batch_label in test_dataloader:
            preds = p_model(batch_text.to(device))
            pred_list.extend(preds.argmax(dim=1).tolist())
            if model == 'train':
                # Keep every batch so F1 is computed over the whole file.
                logits_list.append(preds.cpu())
                label_list.append(batch_label)

    print(len(pred_list))
    if model == 'train' and logits_list:
        # f1_score is defined on MyTrain, not on the model.
        train_f1_score = MyTrain().f1_score(torch.cat(logits_list),torch.cat(label_list))
        print('train_set_f1:',train_f1_score)

    return pred_list


In [12]:
if __name__ == '__main__':
    # Score the unlabelled test set with a saved checkpoint and write the
    # predicted class indices to a tab-separated file.
    model_path = 'embedding_gru_best_2674word_valid_f1_score_0.929'
    checkpoint_file = os.path.join('./',model_path)

    pred_list = predict(checkpoint_file,'./test.csv',model='test')

    pred_data = pd.DataFrame(pred_list)
    pred_data.head()  # notebook preview; the result is discarded when run as a script
    pred_data.to_csv('test_pred.csv',sep='\t',index=False)

    print('down')

100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [02:27<00:00, 339.95it/s]


50000
down
