# 对冷启动数据进行自动标注，减少人工标注时的噪音

In [1]:
#!AHOCORASICK_BYTES=1 pip install git+https://github.com/WojciechMula/pyahocorasick.git

In [2]:
import os
import json
import random
from matplotlib import pyplot as plt
import numpy as np
from tokenizer import tokenizer,token2str,vocab_size
import torch
import torch.nn as nn
from make_model import make_model
from train_and_use import Batch,CrossEntropyLoss,SimpleAdamOptimizer,OptimizerWrapper,train_server_start
from train_and_use import text_continue
from train_and_use import TOGGLE,STOP
from train_and_use import record
from collections import Counter,defaultdict
from tqdm import tqdm
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# (To force CPU execution, assign torch.device("cpu") unconditionally.)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# %matplotlib widget

In [3]:
# Build the auto-tagger network and move it to the selected device.
model = make_model(
    # Token ids start at 1 and id 0 is padding; the remaining ids are used
    # to cover every byte value.
    vocab_size=vocab_size + 1 + 255,
    embedding_dim=512,
    key_dim=256,
    head_number=8,
    # Other values previously tried here: "sinusoidal", "rotary", "learned".
    position_information_type="mask",
    enable_affine=True,
    enable_talking_head=True,
    use_diff=False,
    self_attention_block_size=0,
    feed_forward_dim=1024,
    enable_layer_norm=True,
    deep=6,
    dropout_rate=0.1,
).to(device)

In [4]:
# Load the initial auto-tagger weights. map_location remaps the stored
# tensors onto the active device, so a checkpoint saved from a CUDA run can
# still be loaded when this notebook falls back to the CPU.
model.load_state_dict(
    torch.load('autotagger_init.weight', weights_only=True, map_location=device)
)
# Inference only: switch to eval mode (the model was built with dropout_rate=0.1).
model = model.eval()

In [5]:
# Output files, one per predicted quality tier. They are opened explicitly as
# UTF-8 to match how the *_init.txt inputs are read; the platform default
# encoding may not be able to represent the Chinese text written to them.
# NOTE: these handles stay open across cells and must be closed after the
# labeling loop has finished.
file_high = open('high_iter.txt', 'w', encoding='utf-8')
file_mid = open('mid_iter.txt', 'w', encoding='utf-8')
file_low = open('low_iter.txt', 'w', encoding='utf-8')

In [6]:
# Re-label every line of the cold-start files with the model's own verdict and
# write it to the output file of the predicted tier (high / mid / low).

# Prompt appended after each (padded) text; the model continues it with one token.
PROMPT_SUFFIX = ' 这段文本的质量按照“高、中、低”三档评价为：'
MAX_TEXT_TOKENS = 1000  # every text is padded/truncated to exactly this many tokens
BATCH_SIZE = 3          # number of lines scored per forward pass
TOKEN_SHIFT = 255       # added to raw tokenizer ids before they reach the model;
                        # the -TOKEN_SHIFT filler therefore becomes id 0 (padding)

# Predicted label character -> destination file. Any other prediction is discarded.
output_by_label = {'高': file_high, '中': file_mid, '低': file_low}

fnames = ['high_init.txt', 'mid_init.txt', 'low_init.txt']
np.random.shuffle(fnames)
for cnt, fname in enumerate(fnames, start=1):
    print('\r[',cnt,'/',len(fnames),']',end=' '*10)
    with open(fname, 'r', encoding='utf-8') as f:
        # Strip only the line terminator. Slicing with [:-1] would also eat a
        # real character from the final line when the file lacks a trailing newline.
        lines = [line.rstrip('\n') for line in f]
        # Shuffle and keep a multiple of BATCH_SIZE so every batch is full
        # (up to BATCH_SIZE - 1 randomly chosen lines per file are dropped).
        n = len(lines)
        n -= n % BATCH_SIZE
        lines = random.sample(lines, n)

    for start in tqdm(range(0, len(lines), BATCH_SIZE)):
        batch_lines = lines[start:start + BATCH_SIZE]
        # Fixed-width text segment followed by the prompt, so all rows share
        # one length and stack into a rectangular array.
        # NOTE(review): the prompt is re-tokenized per row, as before, in case
        # tokenizer(..., 5.0) is not deterministic — confirm before hoisting.
        tokens_batch = [
            (tokenizer(text, 5.0) + [-TOKEN_SHIFT] * MAX_TEXT_TOKENS)[:MAX_TEXT_TOKENS]
            + tokenizer(PROMPT_SUFFIX, 5.0)
            for text in batch_lines
        ]
        token_ids = np.array(tokens_batch, dtype=np.int64) + TOKEN_SHIFT
        inputs = torch.from_numpy(token_ids).to(device)
        with torch.no_grad():
            # Ask for a single continuation token per row.
            o = text_continue(
                model, inputs, out_length=1,
                repeat_penalty_value=0.0,
                temperature=1.0,
            )
        for res, line in zip(o, batch_lines):
            # Only the last decoded character is used as the predicted label
            # (presumably the newly generated token — verify against text_continue).
            label = token2str(res.cpu().numpy() - TOKEN_SHIFT)[-1]
            target = output_by_label.get(label)
            if target is not None:
                print(line, file=target)

[ 1 / 3 ]          

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 321/321 [01:33<00:00,  3.45it/s]


[ 2 / 3 ]          

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 778/778 [03:51<00:00,  3.35it/s]


[ 3 / 3 ]          

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 413/413 [02:02<00:00,  3.37it/s]


In [7]:
# Flush and release the three per-tier output files.
for handle in (file_high, file_mid, file_low):
    handle.close()