In [1]:
# Show the GPU, driver and CUDA versions visible to this kernel.
!nvidia-smi

Tue Aug 24 08:03:44 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 457.49       Driver Version: 457.49       CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 2060   WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   39C    P8     5W /  N/A |   1095MiB /  6144MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import json
import random
import os
import pickle
import time
import gc
import copy

from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch import optim
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

In [3]:
# PyTorch version used for this run.
torch.__version__

'1.8.0'

In [4]:
# Check that PyTorch can see a CUDA device; the training code below calls .cuda().
torch.cuda.is_available()

True

In [5]:
# Turn on cuDNN and its benchmark mode (PyTorch backend flags).
# NOTE(review): benchmark mode targets convolution algorithm selection; its benefit for a BERT model is unverified here.
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

In [6]:
# JSON-lines data files (one JSON object per line).
processed_expose_test_path = "data/processed_test_expose.json"
processed_expose_train_labeled_path = "data/processed_train_expose_labeled.json"
processed_expose_train_unlabel_predict_other_path = "data/processed_train_expose_unlabel_predict_other_0.5.json"
processed_expose_train_valid_path = "data/processed_train_valid.json"
processed_expose_train_train_path = "data/processed_train_train.json"

# Directory passed to AutoTokenizer / AutoModel.from_pretrained.
pretrained_bert_path = "bert-base-chinese/"

# Pickle caches for the fitted feature encoders and the doctype label list.
category_encoder_path = 'model/category_encoder_second.pickle'
paragraphs_num_encoder_path = 'model/paragraphs_num_encoder_second.pickle'
source_encoder_path = 'model/source_encoder_second.pickle'
doctype_encoder_path = 'model/doctype_encoder_second.pickle'
words_len_encoder_path = 'model/words_len_encoder_second.pickle'
# Model files: the first one is loaded by MutilModelModel, the second one is where this notebook saves its model.
# NOTE(review): "mutil" is presumably a typo for "multi"; it is part of the file names, so it is left unchanged.
model_train_first_path = 'model/model_train_first.pt'
model_train_mutil_model_path = 'model/model_train_mutil_model.pt'

# Output CSV for the test-set predictions.
submission_path = "submission_train_mutil_model_predict.csv"

# Passed to every DataLoader as num_workers.
num_workers = 0

In [7]:
def get_feature_list(input_path, feature_name):
    """Collect one field from every record of a JSON-lines file.

    Args:
        input_path: Path of a UTF-8 text file holding one JSON object per line.
        feature_name: Key to read from each decoded object.

    Returns:
        list: The value stored under ``feature_name`` for every record, in file order.

    Raises:
        KeyError: If a record has no ``feature_name`` key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    feature_list = []
    with open(input_path, 'r', encoding="utf-8") as input_file:
        for line in tqdm(input_file):
            # Blank lines (e.g. a stray newline at the end of the file) hold no record.
            if not line.strip():
                continue
            feature_list.append(json.loads(line)[feature_name])
    return feature_list

In [8]:
def get_category_encoder():
    """Return the one-hot encoder for the ``category`` feature.

    A pickled encoder found at ``category_encoder_path`` is loaded and returned.
    Otherwise a ``OneHotEncoder`` (unknown values ignored) is fitted on the
    ``category`` values of the labeled-train, test and
    ``processed_expose_train_unlabel_predict_other_path`` files, pickled to
    ``category_encoder_path`` and returned.
    """
    if not os.path.exists(category_encoder_path):
        values = []
        for data_path in (processed_expose_train_labeled_path,
                          processed_expose_test_path,
                          processed_expose_train_unlabel_predict_other_path):
            values.extend(get_feature_list(data_path, 'category'))
        # scikit-learn expects a 2-D array: one row per sample, a single column.
        column = np.array(values).reshape(-1, 1)
        encoder = OneHotEncoder(categories='auto', handle_unknown='ignore').fit(column)

        with open(category_encoder_path, 'wb') as cache_file:
            pickle.dump(encoder, cache_file)
        return encoder

    with open(category_encoder_path, 'rb') as cache_file:
        return pickle.load(cache_file)

In [9]:
category_encoder = get_category_encoder()
# Sanity check: one-hot vector produced for category value 1.
category_encoder.transform(np.array([[1]])).toarray()[0]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [10]:
# Number of category values the one-hot encoder knows (width of the one-hot vector).
len(category_encoder.categories_[0])

30

In [11]:
def get_paragraphs_num_encoder():
    """Return the scaler for the ``paragraphs_num`` feature.

    A pickled scaler found at ``paragraphs_num_encoder_path`` is loaded and
    returned. Otherwise a ``StandardScaler`` is fitted on the ``paragraphs_num``
    values of the labeled-train, test and
    ``processed_expose_train_unlabel_predict_other_path`` files, pickled to
    ``paragraphs_num_encoder_path`` and returned.
    """
    if not os.path.exists(paragraphs_num_encoder_path):
        values = []
        for data_path in (processed_expose_train_labeled_path,
                          processed_expose_test_path,
                          processed_expose_train_unlabel_predict_other_path):
            values.extend(get_feature_list(data_path, 'paragraphs_num'))
        # One row per sample, a single column.
        column = np.array(values).reshape(-1, 1)
        encoder = StandardScaler().fit(column)

        with open(paragraphs_num_encoder_path, 'wb') as cache_file:
            pickle.dump(encoder, cache_file)
        return encoder

    with open(paragraphs_num_encoder_path, 'rb') as cache_file:
        return pickle.load(cache_file)

In [12]:
paragraphs_num_encoder = get_paragraphs_num_encoder()
# Sanity check: standardized value for a paragraph count of 100.
paragraphs_num_encoder.transform(np.array([[100]]))[0, 0]

5.698883781221176

In [13]:
# Sanity check: the scaler does not clip, so an extreme count maps to an extreme value.
paragraphs_num_encoder.transform(np.array([[99999999999]]))[0, 0]

6058788676.516701

In [14]:
def get_words_len_encoder():
    """Return the scaler for the ``words_len`` feature.

    A pickled scaler found at ``words_len_encoder_path`` is loaded and returned.
    Otherwise a ``StandardScaler`` is fitted on the ``words_len`` values of the
    labeled-train, test and ``processed_expose_train_unlabel_predict_other_path``
    files, pickled to ``words_len_encoder_path`` and returned.
    """
    if not os.path.exists(words_len_encoder_path):
        values = []
        for data_path in (processed_expose_train_labeled_path,
                          processed_expose_test_path,
                          processed_expose_train_unlabel_predict_other_path):
            values.extend(get_feature_list(data_path, 'words_len'))
        # One row per sample, a single column.
        column = np.array(values).reshape(-1, 1)
        encoder = StandardScaler().fit(column)

        with open(words_len_encoder_path, 'wb') as cache_file:
            pickle.dump(encoder, cache_file)
        return encoder

    with open(words_len_encoder_path, 'rb') as cache_file:
        return pickle.load(cache_file)

In [15]:
words_len_encoder = get_words_len_encoder()
# Sanity check: standardized value for a length of 1000.
words_len_encoder.transform(np.array([[1000]]))[0, 0]

-0.39012417029706364

In [16]:
# Sanity check: standardized value for a length of 2000.
words_len_encoder.transform(np.array([[2000]]))[0, 0]

0.4138137636721993

In [17]:
def get_source_encoder():
    """Return the one-hot encoder for the ``source`` feature.

    A pickled encoder found at ``source_encoder_path`` is loaded and returned.
    Otherwise a ``OneHotEncoder`` (unknown values ignored) is fitted on the
    ``source`` values of the labeled-train, test and
    ``processed_expose_train_unlabel_predict_other_path`` files, pickled to
    ``source_encoder_path`` and returned.
    """
    if not os.path.exists(source_encoder_path):
        values = []
        for data_path in (processed_expose_train_labeled_path,
                          processed_expose_test_path,
                          processed_expose_train_unlabel_predict_other_path):
            values.extend(get_feature_list(data_path, 'source'))
        # One row per sample, a single column.
        column = np.array(values).reshape(-1, 1)
        encoder = OneHotEncoder(categories='auto', handle_unknown='ignore').fit(column)

        with open(source_encoder_path, 'wb') as cache_file:
            pickle.dump(encoder, cache_file)
        return encoder

    with open(source_encoder_path, 'rb') as cache_file:
        return pickle.load(cache_file)

In [18]:
source_encoder = get_source_encoder()
# Sanity check: one-hot vector for a source value seen during fitting.
source_encoder.transform(np.array([['中国经济周刊']])).toarray()[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [19]:
# Sanity check: an unseen source value encodes to an all-zero vector (handle_unknown='ignore').
source_encoder.transform(np.array([['hg']])).toarray()[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [20]:
# Number of source values the one-hot encoder knows (width of the one-hot vector).
len(source_encoder.categories_[0])

281

In [21]:
def get_doctype_encoder():
    """Return the list of doctype labels; a label's position is its class index.

    Despite the name, the returned object is a plain ``list`` of label strings,
    not a scikit-learn encoder. A pickled list found at ``doctype_encoder_path``
    is loaded and returned unchanged. Otherwise the distinct ``doctype`` values
    of the labeled-train and ``processed_expose_train_unlabel_predict_other_path``
    files are collected, sorted, pickled to ``doctype_encoder_path`` and returned.

    Returns:
        list: The doctype labels.
    """
    if os.path.exists(doctype_encoder_path):
        # NOTE: pickle can execute code on load; only read cache files written by this notebook.
        with open(doctype_encoder_path, 'rb') as doctype_encoder_file:
            return pickle.load(doctype_encoder_file)

    doctype_list = get_feature_list(processed_expose_train_labeled_path, 'doctype') + \
                   get_feature_list(processed_expose_train_unlabel_predict_other_path, 'doctype')
    # Sort the distinct labels: the iteration order of a set of strings changes
    # between interpreter runs, which would silently remap class indices
    # whenever the cache file is rebuilt.
    doctype_list = sorted(set(doctype_list))

    with open(doctype_encoder_path, 'wb') as doctype_encoder_file:
        pickle.dump(doctype_list, doctype_encoder_file)
    return doctype_list

In [22]:
# Class labels; a label's position in this list is the class index used for training and prediction.
doctype_list = get_doctype_encoder()
doctype_list, len(doctype_list)

(['情感解读',
  '深度事件',
  '其他',
  '人物专栏',
  '攻略文',
  '推荐文',
  '治愈系文章',
  '物品评测',
  '行业解读',
  '科普知识文',
  '作品分析'],
 11)

In [23]:
# with open(processed_expose_train_labeled_path, 'r', encoding="utf-8") as input_file, \
#      open(processed_expose_train_unlabel_predict_other_path, 'r', encoding="utf-8") as input1_file, \
#      open(processed_expose_train_train_path, 'w', encoding="utf-8") as train_file, \
#      open(processed_expose_train_valid_path, 'w', encoding="utf-8") as valid_file:
#     for line in tqdm(input_file):
#         json_data = json.loads(line)
#         if random.random() < 0.1:
#             valid_file.write(f"{json.dumps(json_data, ensure_ascii=False)}\n")
#         else:
#             train_file.write(f"{json.dumps(json_data, ensure_ascii=False)}\n")
#     for line in tqdm(input1_file):
#         json_data = json.loads(line)
#         if random.random() < 0.1:
#             valid_file.write(f"{json.dumps(json_data, ensure_ascii=False)}\n")
#         else:
#             train_file.write(f"{json.dumps(json_data, ensure_ascii=False)}\n")

In [24]:
class MyDataset(Dataset):
    """Dataset of JSON-lines records that are tokenized and encoded on access.

    Every record is a dict read from a JSON-lines file. ``get_data_label`` reads
    the keys ``text``, ``category``, ``paragraphs_num``, ``words_len``,
    ``source`` and (except for the test set) ``doctype``; ``pic_num`` is
    discarded. Any other key is passed through unchanged.

    Args:
        input_paths: List of JSON-lines file paths (a single path string is
            also accepted).
        dataset_type: ``'test'`` disables shuffling and label lookup; any other
            value is treated as labeled data.
    """

    def __init__(self, input_paths, dataset_type):
        self.dataset_type = dataset_type
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_bert_path)
        self.data_list = self.load_data(input_paths)

    @classmethod
    def get_data_label(cls, tokenizer, item, dataset_type):
        """Turn one raw record into model inputs and a class index.

        Args:
            tokenizer: Callable tokenizer returning ``input_ids``,
                ``token_type_ids`` and ``attention_mask`` with a leading batch axis.
            item: Raw record dict. It is NOT modified.
            dataset_type: ``'test'`` yields the label ``-1``; otherwise the label
                is the position of ``item['doctype']`` in ``doctype_list``.

        Returns:
            tuple: ``(features, label)`` where ``features`` is a new dict.
        """
        # Work on a shallow copy: the stored record must stay intact so the same
        # index can be read again (e.g. in a later epoch).
        item = dict(item)

        token = tokenizer(item.pop('text'),
                          add_special_tokens=True,
                          max_length=512,
                          truncation=True,
                          padding='max_length',
                          return_tensors="pt")
        # Drop the leading batch axis the tokenizer adds for a single text.
        item['input_ids'] = token['input_ids'][0]
        item['token_type_ids'] = token['token_type_ids'][0]
        item['attention_mask'] = token['attention_mask'][0]

        item['category'] = category_encoder.transform(np.array([item['category']]).reshape(-1, 1)).toarray()[0]
        item['paragraphs_num'] = paragraphs_num_encoder.transform(np.array([item['paragraphs_num']]).reshape(-1, 1))[0]
        item['words_len'] = words_len_encoder.transform(np.array([item['words_len']]).reshape(-1, 1))[0]
        item.pop('pic_num', None)  # not used as a feature
        item['source'] = source_encoder.transform(np.array([item['source']]).reshape(-1, 1)).toarray()[0]

        if dataset_type == 'test':
            label = -1  # test records carry no usable label
        else:
            label = doctype_list.index(item['doctype'])
        item.pop('doctype', None)

        return item, label

    def __getitem__(self, index):
        """Return ``(features, label)`` for the record at ``index``."""
        return MyDataset.get_data_label(self.tokenizer, self.data_list[index], self.dataset_type)

    def __len__(self):
        return len(self.data_list)

    def load_data(self, input_paths):
        """Read all records from the given JSON-lines files.

        Args:
            input_paths: List (or tuple) of file paths, or a single path string.

        Returns:
            list: The decoded records, shuffled unless ``dataset_type`` is ``'test'``.

        Raises:
            TypeError: If ``input_paths`` is neither a string nor a list/tuple.
        """
        if isinstance(input_paths, str):
            input_paths = [input_paths]
        elif not isinstance(input_paths, (list, tuple)):
            # Returning None here would only defer the failure to the first len() call.
            raise TypeError(f"input_paths must be a list of paths, got {type(input_paths).__name__}")

        data_list = []
        for input_path in input_paths:
            with open(input_path, 'r', encoding='utf-8') as input_file:
                for line in tqdm(input_file):
                    if line.strip():  # skip blank lines
                        data_list.append(json.loads(line))
        if self.dataset_type != 'test':
            random.shuffle(data_list)
        return data_list

In [25]:
class BertClassificationModel(nn.Module):
    """BERT classifier that joins the first-token hidden state with side features.

    The hidden state of the first token of the BERT output is concatenated with
    the ``category`` one-hot vector, the scaled ``paragraphs_num``, the
    ``source`` one-hot vector and the scaled ``words_len``, then passed through
    a small MLP that ends in one logit per entry of ``doctype_list``.

    Args:
        hidden_size: Width of the BERT hidden state (default 768).
    """

    def __init__(self, hidden_size=768):
        super(BertClassificationModel, self).__init__()
        self.bert = AutoModel.from_pretrained(pretrained_bert_path)

        category_size = len(category_encoder.categories_[0])
        paragraphs_num_size = 1
        words_len_size = 1
        source_size = len(source_encoder.categories_[0])
        linear_size = hidden_size + category_size + paragraphs_num_size + source_size + words_len_size
        self.net = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(linear_size, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 64),
            nn.ReLU(inplace=True),
        )
        self.out = nn.Linear(64, len(doctype_list))

    def forward(self, batch_data):
        """Compute class logits for one batch.

        Args:
            batch_data: Mapping with the tensors ``input_ids``,
                ``token_type_ids``, ``attention_mask``, ``category``,
                ``paragraphs_num``, ``source`` and ``words_len``; each has the
                batch on its first axis.

        Returns:
            torch.Tensor: Logits with one row per sample.
        """
        # Follow whatever device the module lives on instead of assuming CUDA.
        device = next(self.parameters()).device

        bert_output = self.bert(
            input_ids=batch_data['input_ids'].to(device),
            token_type_ids=batch_data['token_type_ids'].to(device),
            attention_mask=batch_data['attention_mask'].to(device),
        )
        # Hidden state at the first token position of the sequence output.
        bert_cls_hidden_state = bert_output[0][:, 0, :]

        features = [bert_cls_hidden_state]
        for key in ('category', 'paragraphs_num', 'source', 'words_len'):
            features.append(batch_data[key])
        # The side features arrive as float64; everything is fed to the MLP as float32.
        cat_layer = torch.cat([part.to(device=device, dtype=torch.float32) for part in features], dim=1)

        output = self.net(cat_layer)
        return self.out(output)

In [26]:
class MutilModelModel(nn.Module):
    """Stacked classifier: a trainable BERT plus the logits of a frozen first model.

    The first-token hidden state of a BERT encoder is concatenated with the
    ``category``, ``paragraphs_num``, ``source`` and ``words_len`` features and
    with the output of ``first_model`` (loaded from ``model_train_first_path``
    and frozen), then passed through a small MLP that ends in one logit per
    entry of ``doctype_list``.

    Args:
        hidden_size: Width of the BERT hidden state (default 768).
    """

    def __init__(self, hidden_size=768):
        super(MutilModelModel, self).__init__()
        self.first_model = torch.load(model_train_first_path)
        self.freeze(self.first_model)
        self.first_model.eval()

        self.bert = AutoModel.from_pretrained(pretrained_bert_path)

        category_size = len(category_encoder.categories_[0])
        paragraphs_num_size = 1
        words_len_size = 1
        source_size = len(source_encoder.categories_[0])
        # Width of the first model's output; read from its final layer when it
        # has one, otherwise fall back to the previously hard-coded 10.
        first_model_size = getattr(getattr(self.first_model, 'out', None), 'out_features', 10)
        linear_size = hidden_size + category_size + paragraphs_num_size + source_size + words_len_size + first_model_size
        self.net = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(linear_size, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 64),
            nn.ReLU(inplace=True),
        )
        self.out = nn.Linear(64, len(doctype_list))

    def train(self, mode=True):
        """Switch train/eval mode but keep the frozen first model in eval mode.

        Without this, ``model.train()`` would re-enable dropout inside the
        frozen model, so its output would be noisy during training and
        deterministic during evaluation.
        """
        super(MutilModelModel, self).train(mode)
        first_model = getattr(self, 'first_model', None)
        if first_model is not None:
            first_model.eval()
        return self

    def forward(self, batch_data):
        """Compute class logits for one batch.

        Args:
            batch_data: Mapping with the tensors ``input_ids``,
                ``token_type_ids``, ``attention_mask``, ``category``,
                ``paragraphs_num``, ``source`` and ``words_len``; it is also
                passed unchanged to ``first_model``.

        Returns:
            torch.Tensor: Logits with one row per sample.
        """
        # Follow whatever device the module lives on instead of assuming CUDA.
        device = next(self.parameters()).device

        bert_output = self.bert(
            input_ids=batch_data['input_ids'].to(device),
            token_type_ids=batch_data['token_type_ids'].to(device),
            attention_mask=batch_data['attention_mask'].to(device),
        )
        # Hidden state at the first token position of the sequence output.
        bert_cls_hidden_state = bert_output[0][:, 0, :]

        first_model_output = self.first_model(batch_data)

        features = [
            bert_cls_hidden_state,
            batch_data['category'],
            batch_data['paragraphs_num'],
            batch_data['source'],
            batch_data['words_len'],
            first_model_output,
        ]
        # The side features arrive as float64; everything is fed to the MLP as float32.
        cat_layer = torch.cat([part.to(device=device, dtype=torch.float32) for part in features], dim=1)

        output = self.net(cat_layer)
        output = self.out(output)
        return output

    def freeze(self, layer):
        """Disable gradients for every parameter of ``layer`` (including nested modules)."""
        for param in layer.parameters():
            param.requires_grad = False

In [27]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""

    def __init__(self, patience=7, verbose=False, delta=0):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.inf  # np.Inf was removed in NumPy 2.0
        self.delta = delta

    def __call__(self, val_loss, model):
        """Record one validation result.

        Saves a checkpoint when the loss improved; otherwise increases the
        counter and sets ``early_stop`` once it reaches ``patience``.

        Args:
            val_loss: Validation loss of the finished round (lower is better).
            model: Model handed to ``save_checkpoint`` on improvement.
        """
        score = -val_loss  # higher score == lower loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            message = f'EarlyStopping counter: {self.counter} out of {self.patience}. score:{-score}'
            print(message)
            self._log(message)
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            message = f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...'
            print(message)
            self._log(message)
        # The complete model object (not only its state_dict) is stored; this is the best model so far.
        torch.save(model, model_train_mutil_model_path)
        self.val_loss_min = val_loss

    def _log(self, message):
        """Append ``message`` to early_stop.log, prefixed with the current local time."""
        # The timestamp is taken here; it must not depend on a global set by the training loop.
        localtime = time.asctime(time.localtime(time.time()))
        with open("early_stop.log", 'a+', encoding="utf-8") as log_file:
            log_file.write(f'{localtime} {message}\n')

In [28]:
batch_size = 2

# Build the stacked model and move it to the GPU.
model = MutilModelModel().cuda()

# Use different learning rates for different sub-networks: trainable parameters
# whose name contains "bert" form one group, all other trainable parameters the
# second group. Parameters with requires_grad=False are not given to the optimizer.
Bert_model_param = []
Bert_downstream_param = []
for items, param in model.named_parameters():
    if param.requires_grad:
        (Bert_model_param if "bert" in items else Bert_downstream_param).append(param)
param_groups = [
    dict(params=Bert_model_param, lr=1e-5),
    dict(params=Bert_downstream_param, lr=1e-4),
]
optimizer = optim.Adam(param_groups, eps=1e-7, weight_decay=0.001)

# Early-stopping helper: training should stop once the validation loss has not
# improved for `patience` consecutive rounds, to limit over-fitting.
patience = 2
early_stopping = EarlyStopping(patience, verbose=True)
criterion = nn.CrossEntropyLoss()

Some weights of the model checkpoint at bert-base-chinese/ were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
# epoch_num = 5
# batch_loss_num = 100
# for epoch in range(epoch_num):
#     model.train()	# 设置模型为训练模式
#     train_dataset = MyDataset([processed_expose_train_train_path], 'train')
#     train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
#     with tqdm(total=len(train_loader)) as t:
#         loss_sum = 0.0
#         batch_loss = 1.0
#         for batch_idx,data_ in enumerate(train_loader, 0):
#             data, label = data_
#             # 清空梯度
#             optimizer.zero_grad()
#             output = model(data)
#             loss = criterion(output, label.cuda())
#             loss.backward()

#             # 更新模型参数
#             optimizer.step()
            
#             loss_sum += loss.item()
#             if batch_idx % batch_loss_num == batch_loss_num - 1:
#                 batch_loss = loss_sum / batch_loss_num
#                 localtime = time.asctime( time.localtime(time.time()) )
#                 with open("early_stop.log", 'a+', encoding="utf-8") as log_file:
#                     log_file.write(f'{localtime} train_loss={loss.item():.6f} batch_loss={batch_loss:.6f}\n')
#                 loss_sum = 0.0
          
#             t.set_postfix_str(f'train_loss={loss.item():.6f} batch_loss={batch_loss:.6f}')
#             t.update()
            
#             del data, label, output 
#             gc.collect()
#             torch.cuda.empty_cache()
#     #----------------------------------------------------
#     model.eval() # 设置模型为评估/测试模式
#     valid_dataset = MyDataset([processed_expose_train_valid_path], 'valid')
#     valid_loader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
#     valid_loss_sum = 0.0
#     with tqdm(total=len(valid_loader)) as t:
#         with torch.no_grad():
#             for data, label in valid_loader:
#                 # 一般如果验证集不是很大的话，模型验证就不需要按批量进行了，但要注意输入参数的维度不能错
#                 output = model(data)
#                 loss = criterion(output, label.cuda())
#                 valid_loss_sum += loss.item()
#                 t.set_postfix_str(f'valid loss={loss.item()}')
#                 t.update()
#     early_stopping(valid_loss_sum, model)
#     # 若满足 early stopping 要求
#     if early_stopping.early_stop:
#         print("Early stopping")
#         # 结束模型训练
#         break

In [30]:
# 保存完整的 BERT 分类器模型
# torch.save(model, model_train_mutil_model_path)

In [31]:
# 获得 early stopping 时的模型参数
# model.load_state_dict(torch.load(model_train_mutil_model_path))

In [None]:
epoch_num = 5
batch_loss_num = 100  # number of batches averaged into the logged running loss
for epoch in range(epoch_num):
    model.train()  # training mode
    # The dataset and its loader are rebuilt from the two training files at the start of every epoch.
    train_dataset = MyDataset(
        [processed_expose_train_labeled_path, processed_expose_train_unlabel_predict_other_path],
        'train',
    )
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    with tqdm(total=len(train_loader)) as t:
        loss_sum = 0.0
        batch_loss = 1.0  # placeholder shown until the first full window has been averaged
        for batch_idx, (data, label) in enumerate(train_loader):
            # Reset gradients, run forward and backward, then update the parameters.
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, label.cuda())
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            if (batch_idx + 1) % batch_loss_num == 0:
                # A full window is complete: log its mean loss and start a new window.
                batch_loss = loss_sum / batch_loss_num
                localtime = time.asctime()
                with open("early_stop.log", 'a+', encoding="utf-8") as log_file:
                    log_file.write(f'{localtime} train_loss={loss.item():.6f} batch_loss={batch_loss:.6f}\n')
                loss_sum = 0.0

            t.set_postfix_str(f'train_loss={loss.item():.6f} batch_loss={batch_loss:.6f}')
            t.update()

            # Drop the references to this batch and release cached GPU memory before the next one.
            del data, label, output
            gc.collect()
            torch.cuda.empty_cache()

    # Save the complete model object after every epoch.
    torch.save(model, model_train_mutil_model_path)
    localtime = time.asctime()
    with open("early_stop.log", 'a+', encoding="utf-8") as log_file:
        log_file.write(f'{localtime} Saving model ...\n')

76454it [00:00, 85525.69it/s]
39347it [00:00, 95713.14it/s]
  0%|                                    | 30/57901 [00:16<8:19:58,  1.93it/s, train_loss=2.152438 batch_loss=1.000000]

In [None]:
# model = torch.load(model_train_mutil_model_path)
# model = model.cuda()

In [None]:
model.eval()  # evaluation mode
valid_dataset = MyDataset([processed_expose_train_valid_path], 'valid')
valid_loader = DataLoader(dataset=valid_dataset, batch_size=1, shuffle=True, num_workers=num_workers)
with torch.no_grad():
    # Spot check: only the first batch drawn from the loader is inspected.
    for data, label in tqdm(valid_loader):
        print(data, label)
        print(data['id'][0])  # assumes every record carries an 'id' field
        output = model(data)
        # Class probabilities of the single sample in this batch.
        predict_list = F.softmax(output, dim=1)[0].tolist()
        predict_index = np.argmax(predict_list)
        predict_label = doctype_list[predict_index]
        print(predict_label)
        break

In [None]:
# with open("data/processed_train_expose_labeled.json", "r", encoding='utf-8') as input_file:
#     for line in tqdm(input_file):
#         json_data = json.loads(line)
#         if json_data['id'] == 'ee137ac3-c2a2-4aba-a517-36840ffd2f1a':
#             print(line)
#             break

In [None]:
# model.eval() # 设置模型为评估/测试模式
# valid_dataset = MyDataset("data/test.txt", 'test')
# valid_loader = DataLoader(dataset=valid_dataset, batch_size=1, shuffle=True, num_workers=num_workers)
# with torch.no_grad():
#     for data, label in tqdm(valid_loader):
# #         print(data, label)
#         output = model(data)
# #         print(output)
# #         print(torch.sigmoid(output))
#         print(F.softmax(output, dim=1).tolist()[0])
#         break

In [None]:
model.eval()  # evaluation mode
test_dataset = MyDataset([processed_expose_test_path], 'test')
# shuffle=False keeps the order of the test file, so the submission rows are reproducible.
test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False, num_workers=num_workers)
predict_doctype = {}
with torch.no_grad():
    for data, label in tqdm(test_loader):
        output = model(data)
        # Index of the most probable class for every sample of the batch.
        predict_indices = F.softmax(output, dim=1).argmax(dim=1).tolist()
        # Pair each record id with its prediction; this works for any batch size
        # and avoids shadowing the builtin `id`.
        for sample_id, predict_index in zip(data['id'], predict_indices):
            predict_doctype[sample_id] = doctype_list[predict_index]
predict_data = {'predict_doctype': predict_doctype}
df = pd.DataFrame(predict_data)
df.head()

In [None]:
# The index holds the record ids; name it so it is written out under the "id" header.
df.index.name = 'id'
df.head()

In [None]:
# Write the predictions to the path defined in the configuration cell.
df.to_csv(submission_path)

In [None]:
# 看起来，复杂、大模型是有效的(在验证数据集上，第二个epoch的loss还在降低)，所以下一步准备试一下cnn和albert(或roberta)

In [None]:
# 0.451470