In [1]:
!nvidia-smi

Thu Aug 19 22:53:50 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 457.49       Driver Version: 457.49       CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 2060   WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   39C    P8     6W /  N/A |   1311MiB /  6144MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import json
import random
import os
import pickle
import time
import gc
import copy

from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch import optim
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

In [3]:
torch.__version__

'1.8.0'

In [4]:
torch.cuda.is_available()

True

In [5]:
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

In [6]:
processed_expose_test_path = "data/processed_test_expose.json"
processed_expose_train_labeled_path = "data/processed_train_expose_labeled.json"
processed_expose_train_labeled_train_path = "data/processed_train_expose_labeled_train.json"
processed_expose_train_labeled_valid_path = "data/processed_train_expose_labeled_valid.json"
processed_expose_train_unlabel_path = "data/processed_train_expose_unlabel.json"
processed_expose_train_unlabel_predict_list_path = "data/processed_train_expose_unlabel_predict_list.json"

pretrained_bert_path = "bert-base-chinese/"

category_encoder_path = 'model/category_encoder.pickle'
paragraphs_num_encoder_path = 'model/paragraphs_num_encoder.pickle'
source_encoder_path = 'model/source_encoder.pickle'
doctype_encoder_path = 'model/doctype_encoder.pickle'
words_len_encoder_path = 'model/words_len_encoder.pickle'
model_train_first_path = 'model/model_train_first.pt'

num_workers = 0

In [7]:
# with open(processed_expose_train_path, 'r', encoding="utf-8") as input_file, \
#      open(processed_expose_train_labeled_path, 'w', encoding="utf-8") as labeled_file, \
#      open(processed_expose_train_unlabel_path, 'w', encoding="utf-8") as unlabel_file:
#     for line in tqdm(input_file):
#         json_data = json.loads(line)
#         if json_data['doctype'] != '':
#             labeled_file.write(f"{json.dumps(json_data, ensure_ascii=False)}\n")
#         else:
#             unlabel_file.write(f"{json.dumps(json_data, ensure_ascii=False)}\n")

In [8]:
def get_feature_list(input_path, feature_name):
    feature_list = []
    with open(input_path, 'r', encoding="utf-8") as input_file:
        for line in tqdm(input_file):
            json_data = json.loads(line)
            feature_list.append(json_data[feature_name])
    return feature_list

In [9]:
def get_category_encoder():
    if os.path.exists(category_encoder_path):
        with open (category_encoder_path, 'rb') as category_encoder_file: 
            return pickle.load(category_encoder_file)
    
    category_list = get_feature_list(processed_expose_train_labeled_path, 'category') + get_feature_list(processed_expose_test_path, 'category')
    category_list = np.array(category_list).reshape(-1, 1)
    encoder = OneHotEncoder(categories='auto', handle_unknown='ignore').fit(category_list)
    
    with open (category_encoder_path, 'wb') as category_encoder_file:
        pickle.dump(encoder, category_encoder_file)
    return encoder

In [10]:
category_encoder = get_category_encoder()
category_encoder.transform(np.array([1]).reshape(-1, 1)).toarray()[0]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [11]:
len(category_encoder.categories_[0])

30

In [12]:
def get_paragraphs_num_encoder():
    if os.path.exists(paragraphs_num_encoder_path):
        with open (paragraphs_num_encoder_path, 'rb') as paragraphs_num_encoder_file: 
            return pickle.load(paragraphs_num_encoder_file)
    
    paragraphs_num_list = get_feature_list(processed_expose_train_labeled_path, 'paragraphs_num') + get_feature_list(processed_expose_test_path, 'paragraphs_num')
    paragraphs_num_list = np.array(paragraphs_num_list).reshape(-1, 1)
    encoder = StandardScaler().fit(paragraphs_num_list)
    
    with open (paragraphs_num_encoder_path, 'wb') as paragraphs_num_encoder_file:
        pickle.dump(encoder, paragraphs_num_encoder_file)
    return encoder

In [13]:
paragraphs_num_encoder = get_paragraphs_num_encoder()
paragraphs_num_encoder.transform(np.array([100]).reshape(-1, 1))[0][0]

5.3295779947518716

In [14]:
paragraphs_num_encoder.transform(np.array([99999999999]).reshape(-1, 1))[0][0]

5684543631.472906

In [15]:
def get_words_len_encoder():
    if os.path.exists(words_len_encoder_path):
        with open (words_len_encoder_path, 'rb') as words_len_encoder_file: 
            return pickle.load(words_len_encoder_file)
    
    words_len_list = get_feature_list(processed_expose_train_labeled_path, 'words_len') + get_feature_list(processed_expose_test_path, 'words_len')
    words_len_list = np.array(words_len_list).reshape(-1, 1)
    encoder = StandardScaler().fit(words_len_list)
    
    with open (words_len_encoder_path, 'wb') as words_len_encoder_file:
        pickle.dump(encoder, words_len_encoder_file)
    return encoder

In [16]:
words_len_encoder = get_words_len_encoder()
words_len_encoder.transform(np.array([1000]).reshape(-1, 1))[0][0]

-0.5472610685490511

In [17]:
words_len_encoder.transform(np.array([2000]).reshape(-1, 1))[0][0]

0.2779654221391926

In [18]:
def get_source_encoder():
    if os.path.exists(source_encoder_path):
        with open (source_encoder_path, 'rb') as source_encoder_file: 
            return pickle.load(source_encoder_file)
    
    source_list = get_feature_list(processed_expose_train_labeled_path, 'source') + get_feature_list(processed_expose_test_path, 'source')
    source_list = np.array(source_list).reshape(-1, 1)
    encoder = OneHotEncoder(categories='auto', handle_unknown='ignore').fit(source_list)
    
    with open (source_encoder_path, 'wb') as source_encoder_file:
        pickle.dump(encoder, source_encoder_file)
    return encoder

In [19]:
source_encoder = get_source_encoder()
source_encoder.transform(np.array(['中国经济周刊']).reshape(-1, 1)).toarray()[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [20]:
source_encoder.transform(np.array(['hg']).reshape(-1, 1)).toarray()[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [21]:
len(source_encoder.categories_[0])

281

In [22]:
def get_doctype_encoder():
    if os.path.exists(doctype_encoder_path):
        with open (doctype_encoder_path, 'rb') as doctype_encoder_file: 
            return pickle.load(doctype_encoder_file)
    
    doctype_list = get_feature_list(processed_expose_train_labeled_path, 'doctype')
    doctype_set = set(doctype_list)
    doctype_list = list(doctype_set)
    
    with open (doctype_encoder_path, 'wb') as doctype_encoder_file:
        pickle.dump(doctype_list, doctype_encoder_file)
    return doctype_list

In [23]:
doctype_list = get_doctype_encoder()
doctype_list, len(doctype_list)

(['作品分析',
  '治愈系文章',
  '情感解读',
  '行业解读',
  '科普知识文',
  '深度事件',
  '推荐文',
  '攻略文',
  '人物专栏',
  '物品评测'],
 10)

In [24]:
# with open(processed_expose_train_labeled_path, 'r', encoding="utf-8") as input_file, \
#      open(processed_expose_train_labeled_train_path, 'w', encoding="utf-8") as train_file, \
#      open(processed_expose_train_labeled_valid_path, 'w', encoding="utf-8") as valid_file:
#     for line in tqdm(input_file):
#         json_data = json.loads(line)
#         if random.random() < 0.1:
#             valid_file.write(f"{json.dumps(json_data, ensure_ascii=False)}\n")
#         else:
#             train_file.write(f"{json.dumps(json_data, ensure_ascii=False)}\n")

In [37]:
class MyDataset(Dataset):
    def __init__(self, input_path, dataset_type):
        self.dataset_type = dataset_type
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_bert_path)
        self.data_list = self.load_data(input_path)
        
    @classmethod
    def get_data_label(cls, tokenizer, item, dataset_type):
        token = tokenizer(item['text'], add_special_tokens=True,
                                              max_length=512,
                                              truncation=True,
                                              padding='max_length',
                                              return_tensors="pt")
        del item['text']
#         item = dict(item, **token)
        item['input_ids'] = token['input_ids'][0]
        item['token_type_ids'] = token['token_type_ids'][0]
        item['attention_mask'] = token['attention_mask'][0]
        
        item['category'] = category_encoder.transform(np.array([item['category']]).reshape(-1, 1)).toarray()[0]
        item['paragraphs_num'] = paragraphs_num_encoder.transform(np.array([item['paragraphs_num']]).reshape(-1, 1))[0]
        item['words_len'] = words_len_encoder.transform(np.array([item['words_len']]).reshape(-1, 1))[0]
        del item['pic_num']
        item['source'] = source_encoder.transform(np.array([item['source']]).reshape(-1, 1)).toarray()[0]
        
        del item['id']

        if dataset_type == 'test':
            item['doctype'] = -1
        else:
            item['doctype'] = doctype_list.index(item['doctype'])
        label = item['doctype']
        
        del item['doctype']
        
        #         print(item)
        
        return item, label

    def __getitem__(self, index):
        item = self.data_list[index]
        item, label = MyDataset.get_data_label(self.tokenizer, item, self.dataset_type)
        return item, label
        

    def __len__(self):
        return len(self.data_list)

    def load_data(self, input_path):
        data_list = []
        with open(input_path, 'r', encoding='utf-8') as input_file:
            for line in tqdm(input_file):
                json_data = json.loads(line)
                data_list.append(json_data)
        if self.dataset_type != 'test':
            random.shuffle(data_list)
        return data_list

In [26]:
# train_dataset = MyDataset(processed_expose_train_labeled_path, 'train')

In [27]:
# train_loader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True, num_workers=num_workers)
# for data, label in tqdm(train_loader):
#     print(data)
#     print(label)

# #     bert_output = bert(input_ids=data['input_ids'], token_type_ids=data['token_type_ids'], attention_mask=data['attention_mask'])
# #     print(bert_output)
# #     bert_cls_hidden_state = bert_output[0][:, 0, :]
# #     print(bert_cls_hidden_state)
# #     print(bert_cls_hidden_state.shape)
#     break

In [29]:
class BertClassificationModel(nn.Module):
    """Bert分类器模型"""
    def __init__(self, hidden_size=768):
        super(BertClassificationModel, self).__init__()
        self.bert = AutoModel.from_pretrained(pretrained_bert_path)
        
        category_size = len(category_encoder.categories_[0])
        paragraphs_num_size = 1
        words_len_size = 1
        source_size = len(source_encoder.categories_[0])
        linear_size = hidden_size + category_size + paragraphs_num_size + source_size + words_len_size
        self.net = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(linear_size, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 64),
            nn.ReLU(inplace=True),
        )
        self.out = nn.Linear(64, len(doctype_list))
#         self.out = nn.Linear(linear_size, len(doctype_list))

    def forward(self, batch_data):
        input_ids = batch_data['input_ids'].clone().detach().cuda()
        token_type_ids = batch_data['token_type_ids'].clone().detach().cuda()
        attention_mask = batch_data['attention_mask'].clone().detach().cuda()
        bert_output = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        bert_cls_hidden_state = bert_output[0][:, 0, :]
#         print(bert_cls_hidden_state.shape)
#         print(batch_data['category'].shape)
#         print(batch_data['paragraphs_num'].shape)
#         print(batch_data['source'].shape)
        category = batch_data['category'].clone().detach().cuda()
        paragraphs_num = batch_data['paragraphs_num'].clone().detach().cuda()
        source = batch_data['source'].clone().detach().cuda()
        words_len = batch_data['words_len'].clone().detach().cuda()
        cat_layer = torch.cat((bert_cls_hidden_state, category, paragraphs_num, source, words_len), 1)
#         print(cat_layer.shape)
        output = self.net(cat_layer.to(torch.float32))
        output = self.out(output)
        
        return output

In [None]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta

    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
#         torch.save(model.state_dict(), model_train_first_path)	# 这里会存储迄今最优模型的参数
        torch.save(model, model_train_first_path)
        self.val_loss_min = val_loss

In [30]:
batch_size = 2

# torch.cuda.set_device(0)
model = BertClassificationModel()
# torch.set_default_tensor_type(torch.DoubleTensor)
model = model.cuda()
# print(model)

# 不同子网络设定不同的学习率
Bert_model_param = []
Bert_downstream_param = []
for items, _ in model.named_parameters():
    if "bert" in items:
        Bert_model_param.append(_)
    else:
        Bert_downstream_param.append(_)
param_groups = [{"params": Bert_model_param, "lr": 1e-5},
                {"params": Bert_downstream_param, "lr": 1e-4}]
optimizer = optim.Adam(param_groups, eps=1e-7, weight_decay=0.001)
# 初始化 early_stopping 对象
patience = 2	# 当验证集损失在连续n次训练周期中都没有得到降低时，停止模型训练，以防止模型过拟合
early_stopping = EarlyStopping(patience, verbose=True)
criterion = nn.CrossEntropyLoss()

Some weights of the model checkpoint at bert-base-chinese/ were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# epoch_num = 5
# batch_loss_num = 100
# for epoch in range(epoch_num):
#     model.train()	# 设置模型为训练模式
#     train_dataset = MyDataset(processed_expose_train_labeled_train_path, 'train')
#     train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
#     with tqdm(total=len(train_loader)) as t:
#         loss_sum = 0.0
#         batch_loss = 1.0
#         for batch_idx,data_ in enumerate(train_loader, 0):
#             data, label = data_
#             # 清空梯度
#             optimizer.zero_grad()
#             output = model(data)
#             loss = criterion(output, label.cuda())
# #             output = model(data)
# #             loss = criterion(output, label)
#             loss.backward()

#             # 更新模型参数
#             optimizer.step()
            
#             loss_sum += loss.item()
#             if batch_idx % batch_loss_num == batch_loss_num - 1:
#                 batch_loss = loss_sum / batch_loss_num
#                 loss_sum = 0.0
          
#             t.set_postfix_str(f'train_loss={loss.item():.6f} batch_loss={batch_loss:.6f}')
#             t.update()
            
#             del data, label, output 
#             gc.collect()
#             torch.cuda.empty_cache()
#     #----------------------------------------------------
#     model.eval() # 设置模型为评估/测试模式
#     valid_dataset = MyDataset(processed_expose_train_labeled_valid_path, 'valid')
#     valid_loader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
#     valid_loss_sum = 0.0
#     with tqdm(total=len(valid_loader)) as t:
#         with torch.no_grad():
#             for data, label in valid_loader:
#                 # 一般如果验证集不是很大的话，模型验证就不需要按批量进行了，但要注意输入参数的维度不能错
#                 output = model(data)
#                 loss = criterion(output, label.cuda())
# #                 output = model(data)
# #                 loss = criterion(output, label)
#                 valid_loss_sum += loss.item()
#                 t.set_postfix_str(f'valid loss={loss.item()}')
#                 t.update()
#     early_stopping(valid_loss_sum, model)
#     # 若满足 early stopping 要求
#     if early_stopping.early_stop:
#         print("Early stopping")
#         # 结束模型训练
#         break
#     # # 保存完整的 BERT 分类器模型
#     # torch.save(model, model_train_first_path)

68920it [00:00, 85117.89it/s]
100%|█████████████████████████████████| 34460/34460 [4:32:10<00:00,  2.11it/s, train_loss=0.701839 batch_loss=0.609479]
7534it [00:00, 89649.39it/s]
100%|███████████████████████████████████████████████| 3767/3767 [04:34<00:00, 13.73it/s, valid loss=0.1909613162279129]


Validation loss decreased (inf --> 2116.414926).  Saving model ...


68920it [00:00, 87001.94it/s]
100%|█████████████████████████████████| 34460/34460 [4:32:39<00:00,  2.11it/s, train_loss=0.414682 batch_loss=0.635362]
7534it [00:00, 84626.53it/s]
100%|███████████████████████████████████████████████| 3767/3767 [04:37<00:00, 13.57it/s, valid loss=0.3082568645477295]


EarlyStopping counter: 1 out of 2


68920it [00:00, 82222.99it/s]
 23%|███████▋                          | 7833/34460 [54:10<3:12:27,  2.31it/s, train_loss=0.028321 batch_loss=0.701261]

In [None]:
# torch.save(model, model_train_first_path)

In [None]:
# 获得 early stopping 时的模型参数
# model.load_state_dict(torch.load(model_train_first_path))

In [31]:
epoch_num = 1
batch_loss_num = 100
for epoch in range(epoch_num):
    model.train()	# 设置模型为训练模式
    train_dataset = MyDataset(processed_expose_train_labeled_path, 'train')
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    with tqdm(total=len(train_loader)) as t:
        loss_sum = 0.0
        batch_loss = 1.0
        for batch_idx,data_ in enumerate(train_loader, 0):
            data, label = data_
            # 清空梯度
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, label.cuda())
#             output = model(data)
#             loss = criterion(output, label)
            loss.backward()

            # 更新模型参数
            optimizer.step()
            
            loss_sum += loss.item()
            if batch_idx % batch_loss_num == batch_loss_num - 1:
                batch_loss = loss_sum / batch_loss_num
                loss_sum = 0.0
          
            t.set_postfix_str(f'train_loss={loss.item():.6f} batch_loss={batch_loss:.6f}')
            t.update()
            
            del data, label, output 
            gc.collect()
            torch.cuda.empty_cache()

    # 保存完整的 BERT 分类器模型
    torch.save(model, model_train_first_path)

76454it [00:00, 88671.66it/s]
100%|█████████████████████████████████| 38227/38227 [5:28:40<00:00,  1.94it/s, train_loss=3.117380 batch_loss=0.535170]


In [30]:
model = torch.load(model_train_first_path)
model = model.cuda()

In [31]:
model.eval() # 设置模型为评估/测试模式
valid_dataset = MyDataset(processed_expose_train_labeled_valid_path, 'valid')
valid_loader = DataLoader(dataset=valid_dataset, batch_size=1, shuffle=True, num_workers=num_workers)
with torch.no_grad():
    for data, label in tqdm(valid_loader):
        print(data, label)
        print(data['id'][0])
        output = model(data)
#         print(output)
#         print(torch.sigmoid(output))
#         sigmoid = nn.Sigmoid()
#         print(sigmoid(output))
        predict_list = F.softmax(output, dim=1).tolist()[0]
        predict_index = np.argmax(predict_list)
        predict_label = doctype_list[predict_index]
        print(predict_label)
        break

7534it [00:00, 85588.30it/s]
  0%|                                                                                         | 0/7534 [00:00<?, ?it/s]

{'id': ['c3fb4257-64ed-4c43-bc13-344d890cf229'], 'category': tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]], dtype=torch.float64), 'paragraphs_num': tensor([[-0.2981]], dtype=torch.float64), 'source': tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

  0%|                                                                                         | 0/7534 [00:00<?, ?it/s]

情感解读





In [44]:
# with open("data/processed_train_expose_labeled.json", "r", encoding='utf-8') as input_file:
#     for line in tqdm(input_file):
#         json_data = json.loads(line)
#         if json_data['id'] == 'ee137ac3-c2a2-4aba-a517-36840ffd2f1a':
#             print(line)
#             break

2331it [00:00, 83173.87it/s]

{"id": "ee137ac3-c2a2-4aba-a517-36840ffd2f1a", "category": 11, "doctype": "人物专栏", "paragraphs_num": 1, "pic_num": 0, "source": "", "words_len": 2142, "text": "经过22年的时间漂流，《尘埃落定》是如何留下来的？阿来在杭州这么说。乡愁与告别：从《尘埃落定》到《机村史诗》李敬泽称此为乡愁：“当我们热爱田园时，我们对自然、大地、村庄的那个热爱，实际上不仅是因为很多东西正在逝去，也是因为那些过去里包含着的，我们认为我们生命当中非常珍贵的、必须珍视的价值在逝去。虽然它写了那一段特定历史中的故事，但其实你打开看到的是，人处在那样一个史诗般的天真年代，人与自然、与社会、与自己的关系。《尘埃落定》写了20世纪前50年的故乡，后来他又有了《机村史诗》，写后50年的故乡。20年前后：文学经典对读者的意义《尘埃落定》1994年完稿，1998年首次出版，2000年获第五届茅盾文学奖。对谈中，阿来也说：“写完《尘埃落定》这本书，我离开它就更有勇气了，至少我们俩和解了，或者说两清了，那些纠缠都放下了。这也意味着这部书经过22年的时间漂流，已经正在成为现代的一个节点，它会继续被一代代的人阅读。他说，我们中国人习惯于读大历史，对于鸦片战争、辛亥革命非常熟，“但我们不知道这个地方的历史，不知道我的那个县、我的那个村、我的那个乡，它的历史。”书中故事所代表的久远的乡愁，对故乡与山川大地匹配的英雄浪漫的气质，后来阿来觉得把这些找回来了。”李敬泽说，“一开始，我们会看书中那段历史，看土司制度的命运，看整个藏区经受的巨大历史变革等。"}






In [64]:
# model.eval() # 设置模型为评估/测试模式
# valid_dataset = MyDataset("data/test.txt", 'test')
# valid_loader = DataLoader(dataset=valid_dataset, batch_size=1, shuffle=True, num_workers=num_workers)
# with torch.no_grad():
#     for data, label in tqdm(valid_loader):
# #         print(data, label)
#         output = model(data)
# #         print(output)
# #         print(torch.sigmoid(output))
#         print(F.softmax(output, dim=1).tolist()[0])
#         break

2it [00:00, ?it/s]
  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

[0.01922558806836605, 0.12085458636283875, 0.011475713923573494, 0.11851993203163147, 0.2810468077659607, 0.18804335594177246, 0.008331549353897572, 0.18468160927295685, 0.017482150346040726, 0.050338730216026306]





In [81]:
model.eval() # 设置模型为评估/测试模式
test_dataset = MyDataset(processed_expose_test_path, 'test')
test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=True, num_workers=num_workers)
predict_doctype = {}
with torch.no_grad():
    for data, label in tqdm(test_loader):
#         print(data, label)
        id = data['id'][0]
        output = model(data)
#         print(output)
#         print(torch.sigmoid(output))
#         sigmoid = nn.Sigmoid()
#         print(sigmoid(output))
        predict_list = F.softmax(output, dim=1).tolist()[0]
        predict_index = np.argmax(predict_list)
        predict_label = doctype_list[predict_index]
#         print(predict_label)
        predict_doctype[id] = predict_label
predict_data = {'predict_doctype' : predict_doctype}
df = pd.DataFrame(predict_data)
df.head()

45285it [00:00, 90916.52it/s]
100%|████████████████████████████████████████████████████████████████████████████| 45285/45285 [28:01<00:00, 26.93it/s]


Unnamed: 0,predict_doctype
0001a2f1-714e-4eca-8d26-d0b173d8d327,情感解读
00028139-6f2c-4321-b3e0-5ddf7c9af4eb,作品分析
00063b7f-03db-430b-857c-b127a970422c,行业解读
0006fe16-ae5d-432b-8fbd-f0653200069c,深度事件
0008fcff-3bd0-4a61-acb3-d995c8871768,深度事件


In [85]:
df.index.name = 'id'
df.head()

Unnamed: 0_level_0,predict_doctype
id,Unnamed: 1_level_1
0001a2f1-714e-4eca-8d26-d0b173d8d327,情感解读
00028139-6f2c-4321-b3e0-5ddf7c9af4eb,作品分析
00063b7f-03db-430b-857c-b127a970422c,行业解读
0006fe16-ae5d-432b-8fbd-f0653200069c,深度事件
0008fcff-3bd0-4a61-acb3-d995c8871768,深度事件


In [87]:
df.to_csv("submission_train_first_predict.csv")
# 0.398140

In [None]:
model.eval() # 设置模型为评估/测试模式
tokenizer = AutoTokenizer.from_pretrained(pretrained_bert_path)
# count = 0
with torch.no_grad():
    with open(processed_expose_train_unlabel_path, "r", encoding='utf-8') as unlabel_file, \
         open(processed_expose_train_unlabel_predict_list_path, "w", encoding='utf-8') as unlabel_predict_list_file:
        for line in tqdm(unlabel_file, total=500000):
            json_data = json.loads(line)
            data, label = MyDataset.get_data_label(tokenizer, copy.deepcopy(json_data), "test")
            data['category'] = torch.tensor(data['category']).reshape(1, -1)
            data['paragraphs_num'] = torch.tensor(data['paragraphs_num']).reshape(1, -1)
            data['source'] = torch.tensor(data['source']).reshape(1, -1)
            data['words_len'] = torch.tensor(data['words_len']).reshape(1, -1)
            data['input_ids'] = data['input_ids'].reshape(1, -1)
            data['token_type_ids'] = data['token_type_ids'].reshape(1, -1)
            data['attention_mask'] = data['attention_mask'].reshape(1, -1)
#             print(data)
#             print(label)
            output = model(data)
            predict_list = F.softmax(output, dim=1).tolist()[0]
#             print(predict_list)
#             count += 1
#             if count>=2:
#                 break
#             print(json_data)
#             break
            json_data['predict_list'] = predict_list
            unlabel_predict_list_file.write(f"{json.dumps(json_data, ensure_ascii=False)}\n")

 60%|█████████████████████████████████████████▋                            | 297712/500000 [3:00:28<2:02:23, 27.55it/s]

In [7]:
with open(processed_expose_train_unlabel_predict_list_path, "r", encoding='utf-8') as unlabel_predict_list_file:
    for line in tqdm(unlabel_predict_list_file):
        pass

500000it [00:02, 200751.81it/s]
