In [120]:
from tokenizers import (
    models,
    normalizers,
    pre_tokenizers,
    trainers,
    Tokenizer,
)
import math

In [5]:
#训练分词器
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(),normalizers.Lowercase()]
)
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
special_tokens = ["[UNK]","[PAD]","[CLS]","[SEP]","[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size =1000,special_tokens=special_tokens)

tokenizer.train(["corp.txt"],trainer=trainer)
tokenizer.save("./model_save/my_tokenizer.json")

#使用训练好的分词器
tokenizer = Tokenizer.from_file("./model_save/my_tokenizer.json")
tokens = tokenizer.encode("尽其在我，听其在天")
print(tokens.ids,tokens.tokens)

In [16]:
?tokenizer

[0;31mType:[0m        Tokenizer
[0;31mString form:[0m <tokenizers.Tokenizer object at 0x7fe2c7a56430>
[0;31mFile:[0m        /data/yetianxiang/anaconda3/envs/finetune/lib/python3.10/site-packages/tokenizers/__init__.py
[0;31mDocstring:[0m  
A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
and outputs an :class:`~tokenizers.Encoding`.

Args:
    model (:class:`~tokenizers.models.Model`):
        The core algorithm that this :obj:`Tokenizer` should be using.

In [97]:
torch.cuda.is_available()

True

In [137]:
#model_config
import torch
import json
class ModelConfig:
    def __init__(self,
                device = torch.device("cuda"),
                vocab_size = None,
                max_len = None,
                hidden_dim = None,
                n_head = None,
                drop_out = None,
                layer_count = None
                ):
                
        self.device = device
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.hidden_dim = hidden_dim
        self.n_head = n_head
        self.drop_out = drop_out
        self.layer_count = layer_count

    
    def save(self,path):
        f = open(path,'w')
        d = {
            "vocab_size":self.vocab_size,
            "max_len":self.max_len,
            "hidden_dim":self.hidden_dim,
            "n_head":self.n_head,
            "drop_out":self.drop_out,
            "layer_count":self.layer_count,
        }
        d = json.dumps(d)
        f.write(d)
        f.close()

    def load(self,path):
        d = open(path).read()
        d = json.loads(d)
        self.vocab_size = d["vocab_size"]
        self.max_len = d["max_len"]
        self.hidden_dim = d["hidden_dim"]
        self.n_head = d["n_head"]
        self.drop_out = d["drop_out"]
        self.layer_count = d["layer_count"]
        

In [170]:
import torch.nn as nn
class Embedding(nn.Module):
    def __init__(self,config):
        super(Embedding,self).__init__()
        #id编码
        self.word_embeddings = nn.Embedding(num_embeddings = config.vocab_size,embedding_dim = config.hidden_dim).to(config.device)
        self.position_embeddings = nn.Embedding(num_embeddings = config.max_len,embedding_dim = config.hidden_dim).to(config.device)
        self.token_type_embeddings = nn.Embedding(num_embeddings = 2,embedding_dim = config.hidden_dim).to(config.device)
        self.device = config.device

    def forward(self,input_ids,token_type_ids=None):
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length,dtype = torch.long,device = self.device)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids,device = self.device)

        we = self.word_embeddings(input_ids)
        pe = self.position_embeddings(position_ids)
        te = self.token_type_embeddings(token_type_ids)
        return we + pe + te

In [171]:
class MultiHeadAttention(nn.Module):
    def __init__(self,config):
        super(MultiHeadAttention,self).__init__()
        self.n_head = config.n_head
        self.dim = config.hidden_dim
        self.device = config.device

        self.wq = nn.Linear(self.dim,self.dim).to(self.device)
        self.wk = nn.Linear(self.dim,self.dim).to(self.device)
        self.wv = nn.Linear(self.dim,self.dim).to(self.device)

        self.softmax = nn.Softmax(dim=3)
        self.f = nn.Linear(self.dim,self.dim).to(self.device)
        self.norm = nn.LayerNorm(self.dim).to(self.device)
        self.dropout = nn.Dropout(config.drop_out)

    def concat(self,tensor):
        a, b, c, d = tensor.size()
        return tensor.view(a,c,b*d)
    
    def attention(self,q,k,v):
        _, _, _, d = q.size()
        kt = k.transpose(2,3)
        s = (q @ kt)/math.sqrt(d)
        s = self.softmax(s)
        v = s @ v
        return v
    
    def split(self,tensor):
        a,b,c = tensor.size()
        d = c // self.n_head
        return tensor.view(a,self.n_head,b,d)
    
    def forward(self,x):
        q,k,v = self.wq(x),self.wk(x),self.wv(x)
        q,k,v = self.split(q),self.split(k),self.split(v)
        v = self.attention(q,k,v)
        v = self.concat(v)
        vb = v
        v = self.norm(x+v)
        v = self.f(v)
        v = self.norm(vb+v)
        return self.dropout(v)

In [172]:
class Pooler(nn.Module):
    def __init__(self,config):
        super(Pooler,self).__init__()
        self.dense = nn.Linear(config.hidden_dim,config.hidden_dim).to(config.device)
        self.ac = nn.Tanh()

    def forward(self,x):
        f = x[:,0]
        return self.ac(self.dense(f))

In [173]:
class Transformer(nn.Module):
    def __init__(self,config):
        super(Transformer,self).__init__()
        self.embedding = Embedding(config)
        self.layers = nn.ModuleList([
            MultiHeadAttention(config) for _ in range(config.layer_count)
        ])
        self.pooler = Pooler(config)
        self.device = config.device

    def forward(self,input_ids,token_type_ids=None):
        em = self.embedding(input_ids,token_type_ids)
        for layer in self.layers:
            em = layer(em)
        p = self.pooler(em)
        return p,em

In [174]:
import torch.nn.functional as F
class LMPredictHead(nn.Module):
    def __init__(self,config):
        super(LMPredictHead,self).__init__()
        self.f1 = nn.Linear(config.hidden_dim,config.hidden_dim).to(config.device)
        self.f2 = nn.Linear(config.hidden_dim,config.vocab_size).to(config.device)

    def forward(self,x):
        x = F.relu(self.f1(x))
        return self.f2(x)

class SClsHead(nn.Module):
    def __init__(self,config):
        super(SClsHead,self).__init__()
        self.f1 = nn.Linear(config.hidden_dim,config.hidden_dim).to(config.device)
        self.f2 = nn.Linear(config.hidden_dim,2).to(config.device)

    def forward(self,x):
        x = F.relu(self.f1(x))
        return self.f2(x)

In [175]:
from torch.nn import CrossEntropyLoss
class PretrainModel(nn.Module):
    def __init__(self,config):
        super(PretrainModel,self).__init__()
        self.transformer = Transformer(config)
        self.wm = LMPredictHead(config)
        self.scls = SClsHead(config)
        self.loss_function = CrossEntropyLoss(ignore_index=-1)
        self.config = config

    def forward(self,input_ids,token_type_ids=None,masked_lm_labels=None,next_sentence_labels=None):
        p,e = self.transformer(input_ids,token_type_ids)
        wmp = self.wm(e)
        sclsp = self.scls(p)

        if masked_lm_labels is not None and next_sentence_labels is not None:
            loss1 = self.loss_function(wmp.view(-1,self.config.vocab_size),masked_lm_labels.reshape(-1))
            loss2 = self.loss_function(sclsp.view(-1,2),next_sentence_labels.view(-1))
            return loss1 + loss2
        return wmp,sclsp

In [166]:
t = torch.rand(20,103,128)
t.view(-1,103).size()

torch.Size([2560, 103])

In [176]:
from tqdm import tqdm
import torch.optim as optim
import numpy as np
import random
from torch.utils.data import Dataset,DataLoader

class PretrainDataset(Dataset):
    def __init__(self):
        self.tokenizer = Tokenizer.from_file("./model_save/my_tokenizer.json")
        lines = open("corp.txt",encoding="utf8").readlines()
        ds  = []
        for line in tqdm(lines):
            line = line.strip()
            if line == '':
                continue
            ts = line.split("。")
            if len(ts)<2:
                continue
            for i in range(len(ts)-1):
                ds.append([ts[i],ts[i+1],[1]])

        nds = []
        for i in range(len(ds)):
            id1 = random.randint(0,len(ds)-1)
            id2 = random.randint(0,1)
            nds.append([ds[i][0],ds[id1][id2],[0]])

        self.data = ds + nds

    def __len__(self):
        return len(self.data)

    def __getitem__(self,index):
        d = self.data[index]
        ids1 = self.tokenizer.encode(d[0][:50]).ids
        ids2 = self.tokenizer.encode(d[1][:50]).ids
        ids = ([2]+ids1+[3]+ids2+[3])[:103]

        padding_size = 103 - len(ids)
        token_type_ids = [0 for _ in range(len(ids1)+2)] +\
            [1 for _ in range(len(ids2)+1)]
        mask = np.random.random(len(ids))
        mask = (mask > 0.05).astype(np.int64)
        if padding_size > 0:
            mask = np.hstack((mask,np.ones(padding_size)))
            ids += [1 for _ in range(padding_size)]
            token_type_ids += [0 for _ in range(padding_size)]
        mask_ids = []
        for i in range(len(mask)):
            if mask[i]==1:
                mask_ids.append(ids[i])
            else:
                mask_ids.append(4)

        return np.array(ids,dtype=np.int64),np.array(token_type_ids,dtype=np.int64),np.array(mask_ids,dtype=np.int64),np.array(d[2],dtype=np.int64)

In [205]:
def pretrain():
    config = ModelConfig(device=torch.device("cuda"))
    config.load("./model_save/config.json")
    model = PretrainModel(config)
    data = PretrainDataset()
    loader = DataLoader(data,batch_size=20,shuffle=True)
    epoch = 20
    optimizer = optim.Adam(model.parameters(),lr=1e-4)
    step = 0
    pbar = tqdm(loader)
    for e in range(epoch):
        for ids,token_type_ids,mask_ids,ns in pbar:
            ids = ids.to(config.device)
            token_type_ids = token_type_ids.to(config.device)
            mask_ids = mask_ids.to(config.device)
            ns = ns.to(config.device)
            
            optimizer.zero_grad()
            loss = model(mask_ids,token_type_ids,ids,ns)
            loss.backward()
            optimizer.step()
            step += 1
            desc = "[{}/{}][{}][{}]".format(e+1,epoch,step,float(loss))
            pbar.set_description(desc)
            if step % 100 == 0:
                wmp,cls = model(mask_ids,token_type_ids)
                pids = torch.argmax(wmp,dim=2)
                # print(f"ids.size:{ids.size()}")
                # print(f"pids.size:{pids.size()}")
                for i in range(pids.size(0)):
                    s = data.tokenizer.decode(pids.detach().cpu().numpy()[i])
                    ori_s = data.tokenizer.decode(ids.detach().cpu().numpy()[i])
                    print("\n\n==[{}]==\n==[{}]==\n\n".format(s,ori_s))
            if step % 5000 == 0:
                torch.save(model.state_dict(),"./model_save/model_pretrain_{}.pt".format(step))
        torch.save(model.transformer.state_dict(),"./model_save/model_pretrain_done.pt")

In [206]:
pretrain()

100%|██████████| 138/138 [00:00<00:00, 391090.51it/s]
[1/20][27][6.906792163848877]: 100%|██████████| 27/27 [00:00<00:00, 55.51it/s]




==[##， ##， ##， ##， ##， ##，]==
==[因 ##嘱 ##其 ##静 ##心 ##养 ##病 ##， ##不 ##必 ##挂 ##念 ##营 ##务 ##， ##余 ##代 ##为 ##函 ##告 ##南 ##省 ##江 ##省 ##等 ##语 同 ##乡 ##京 ##官 ##如 ##故 ##， ##郑 ##莘 ##田 ##给 ##谏 ##服 ##阙 ##来 ##京 ##， ##梅 ##霖 ##生 ##病 ##势 ##沉 ##重 ##， ##深 ##为 ##可 ##虑]==




==[]==
==[且 ##望 ##诸 ##弟 ##分 ##此 ##重 ##任 ##， ##余 ##亦 ##欲 ##稍 ##稍 ##息 ##肩 ##， ##乃 ##不 ##得 ##一 ##售 ##， ##使 ##我 ##中 ##心 ##无 ##倚]==




==[]==
==[男 ##谨 ##禀 （ ##道 ##光 ##二 ##十 ##四 ##年 ##七 ##月 ##廿 ##日 ##）]==




==[]==
==[又 ##有 ##当 ##名 ##士 ##者 ##， ##鄙 ##科 ##名 ##为 ##粪 ##土 ##， ##或 ##好 ##作 ##诗 ##古 ##文 ##， ##或 ##好 ##讲 ##考 ##据 ##， ##或 ##好 ##谈 ##理 ##学 ##， ##嚣 ##嚣 ##然 ##自 ##以 ##为 ##压 ##倒 ##一 ##切 ##矣 吾 ##家 ##戏 ##言 ##戏 ##动 ##积 ##习 ##， ##明 ##年 ##喜 ##在 ##家 ##， ##当 ##与 ##两 ##先 ##生 ##尽 ##改 ##之]==




==[##，]==
==[切 ##己 ##体 ##察 ##， ##穷 ##其 ##理 ##， ##即 ##格 ##物 ##也 四 ##弟 ##九 ##弟 ##虽 ##不 ##长 ##进 ##， ##亦 ##不 ##自 ##满 ##， ##求 ##大 ##人 ##教 ##六 ##弟 ##， ##总 ##期 ##不 ##自 ##满 ##足 ##为 ##要]==




==[##， ##，]==
==[以 ##后 ##务 ##宜 ##力 ##除 ##此 ##病 ##， ##以 ##吴 ##（ ##木 ##云 ##

In [208]:
class TextClsDataset(Dataset):
    def __init__(self):
        self.tokenizer = Tokenizer.from_file("./model_save/my_tokenizer.json")
        lines = open("./头条data/toutiao_cat_data.txt",encoding="utf8").readlines()
        index = 0
        self.tagmap = {}
        data = []
        for line in lines:
            ts = line.strip().split("_!_")
            if len(ts)!=5:
                continue
            id = ts[1]
            label = ts[2]
            text = ts[3]
            if not id in self.tagmap:
                self.tagmap[id] = {"index":index,"label":label}
                index+=1
            data.append([text,self.tagmap[id]["index"]])
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self,index):
        text, label = self.data[index]
        ids = [2] + self.tokenizer.encode(text[:101]).ids + [3]
        if len(ids)<103:
            ids += [1 for _ in range(103-len(ids))]
        return np.array(ids,dtype=np.int64),label


In [209]:
class TextClsModel(nn.Module):
    def __init__(self,config,label_count):
        super(TextClsModel,self).__init__()
        self.transformer = Transformer(config)
        self.f1 = nn.Linear(config.hidden_dim,label_count).to(config.device)

    def forward(self,x):
        p,_ = self.transformer(x)
        return self.f1(p)

def train():
    data = TextClsDataset()
    loader = DataLoader(data,batch_size=20,shuffle=True)

    config = ModelConfig(device = torch.device("cuda:0"))
    config.load("./model_save/config.json")
    model = TextClsModel(config,len(data.tagmap))
    model.transformer.load_state_dict(torch.load("./model_save/model_pretrain_done.pt",map_location=config.device))
    loss_function = CrossEntropyLoss()
    epoch = 20
    optimizer = optim.Adam(model.parameters(),lr=1e-4)
    step = 0
    pbar = tqdm(loader)
    for e in range(epoch):
        for ids,label in pbar:
            ids = ids.to(config.device)
            label = label.to(config.device)
            p = model(ids)
            optimizer.zero_grad()
            loss = loss_function(p,label)
            loss.backward()
            optimizer.step()
            step+=1
            err = 0
            pLabel = torch.argmax(p,dim=1)
            for i in range(pLabel.size(0)):
                if int(pLabel[i]) != int(label[i]):
                    err += 1
            acc = 1
            if err != 0:
                acc = (pLabel.size(0)-err)/pLabel.size(0)
            desc = "[{}/{}][{}][{:.4f}][{:.4f}]".format(e+1,epoch,step,float(loss),acc)
            pbar.set_description(desc)
            

In [210]:
train()

[1/20][19135][2.4128][0.1250]: 100%|██████████| 19135/19135 [05:10<00:00, 61.66it/s] 
