In [1]:
import numpy as np
import pandas as pd
import torch
from gensim.models.word2vec import LineSentence
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, AdamW, BertPreTrainedModel, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.nn.utils.rnn import pad_sequence

import warnings
warnings.filterwarnings('ignore')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters


In [2]:
sim_data = pd.read_csv("data.csv")
print(len(sim_data))
sim_data.head()

395849


Unnamed: 0.1,Unnamed: 0,sentence1,sentence2,similarity
0,0,但是，许多公司并没有把冗余现金用于新投资以扩大产能、占领新市场——自全球金融危机爆发以来它们...,但有担心，之前印度的政治家们决定采取行动的东西比日常苦难更可怕必须发生。,0
1,1,当释放到一个人的循环系统，血红蛋白在这些高氧化态最终自毁，周围组织损伤的分子。,当血红蛋白被释放到人的循环系统中时，高价铁形式最终会自我毁灭，伤害周围组织的分子。,1
2,2,是的，世界需要解决全球变暖问题（主要是通过高投资，绿色的研究和开发，并通过促进廉价的开采，污...,是的，世界需要解决全球变暖问题（主要通过加大对绿色能源研究和发展的投资，促进廉价和低污染页岩...,1
3,3,以色列前安全高官��积极参与了由新美国战略重心（Center for a New Ameri...,有很好的理由：这两个世界上人口最多的民主国家之间的友好关系可以塑造世界的未来。,0
4,4,有见及此，自1990年以来的每一份联合国发展报告都表明如果某地的公共政策能把民生放在首位，则...,是否在神的创造惊叹或只是试着去了解为什么事情会是这样的 - 但要认识到穆斯林国家有多少可以被...,0


In [3]:
# 2. 打乱文本顺序,生成不相似数据
data = sim_data.sample(frac=1)

In [4]:
from typing import Tuple, List
import re

Texts = torch.LongTensor

class SentenceDataset(Dataset):
    def __init__(self, tokenizer, dataframe, device):
        self.device = device
        self.tokenizer = tokenizer
        self.pad_idx = tokenizer.pad_token_id
        self.X = []
        self.Y = []
        for i, (row) in tqdm(dataframe.iterrows()):
            if len(tokenizer.tokenize(row["sentence1"])) > 120:
                continue
            # 移除第一句标点符号
            sentence1 = re.sub(r'[。，?]','',row["sentence1"])
            orignal_text = tokenizer.encode(sentence1)
#             orignal_text = torch.LongTensor(text)
            # 移除第二句标点符号
            sentence2 = re.sub(r'[。，?]','',row["sentence2"])
#             print(sentence2)
            trans_text = tokenizer.encode(sentence2, add_special_tokens=True)
#             print(tokenizer.decode(text[1:]))
            text = torch.LongTensor(orignal_text+trans_text)
#             print(type(row[["similarity"]].tolist()),row[["similarity"]].tolist())
            sim_value = int(row[["similarity"]].tolist()[0])
            tags = torch.FloatTensor([1-sim_value,sim_value])
#             print(tags)
#             print(tags)
            self.X.append(text)
            self.Y.append(tags)
    
    def __len__(self):
        return len(self.X)

    def __getitem__(self, index: int) -> Tuple[Texts, torch.LongTensor]:
        return self.X[index], self.Y[index]

In [5]:
tokenizer = BertTokenizer.from_pretrained('./model')

from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(data, test_size=0.05)

In [6]:
print(len(train_df))
print(len(val_df))

376056
19793


In [7]:
# device = torch.device("cpu")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [8]:
def collate_fn(batch: List[Tuple[Texts, torch.LongTensor]]) \
        -> Tuple[Texts, torch.LongTensor]:
    x, y = list(zip(*batch))
    x = pad_sequence(x, batch_first=True, padding_value=0)
    y = torch.stack(y)
    return x.to(device), y.to(device)

train_dataset = SentenceDataset(tokenizer, train_df, device)
dev_dataset = SentenceDataset(tokenizer, val_df, device)

BATCH_SIZE = 4
train_sampler = RandomSampler(train_dataset)
dev_sampler = RandomSampler(dev_dataset)
train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn)
dev_iterator = DataLoader(dev_dataset, batch_size=BATCH_SIZE, sampler=dev_sampler, collate_fn=collate_fn)

376056it [09:06, 688.14it/s]
19793it [00:28, 697.14it/s]


In [13]:
# . 训练bert下游模型
class BertSimilarity(BertPreTrainedModel):
    def __init__(self, config):
        super(BertSimilarity, self).__init__(config)
        self.bert = BertModel(config)
        self.similarity = nn.Linear(config.hidden_size, 2)
        
    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
            labels=None):
        outputs = self.bert(input_ids,
                               attention_mask=attention_mask,
                               token_type_ids=token_type_ids,
                               position_ids=position_ids,
                               head_mask=head_mask)
#         print(outputs.shape)
#         cls_output = outputs[0][:,0,:] # batch, hidden
        cls_output = outputs[1] # output[:, 0]->Linear->Tanh ->(batch, hidden)
        cls_output = self.similarity(cls_output) # batch, 2
#         print(cls_output.shape)
        cls_output = torch.sigmoid(cls_output)
        criterion = nn.BCELoss()
#         print(cls_output.shape,labels.shape)
        loss = 0
        if labels is not None:
            loss = criterion(cls_output, labels)
        return loss, cls_output

# model = BertModel.from_pretrained('hfl/chinese-bert-wwm-ext')
model = BertSimilarity.from_pretrained('./model').to(device)

In [10]:
from sklearn.metrics import roc_auc_score
def train(model, iterator, optimizer, scheduler):
    model.train()
    total_loss = 0
    for x, y in tqdm(iterator):
        optimizer.zero_grad()
        mask = (x != 0).float()
#         print(x.shape,y.shape)
        loss, outputs = model(x, attention_mask=mask, labels=y)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    print(f"Train loss {total_loss / len(iterator)}")

def evaluate(model, iterator):
    model.eval()
    pred = []
    true = []
    with torch.no_grad():
        total_loss = 0
        for x, y in tqdm(iterator):
            mask = (x != 0).float()
            loss, outputs = model(x, attention_mask=mask, labels=y)
            total_loss += loss
            true += y.cpu().numpy().tolist()
            pred += outputs.cpu().numpy().tolist()
    true = np.array(true)
    pred = np.array(pred)
#     print(true.shape,pred.shape)
    for i, name in enumerate(['non-similarity','similarity']):
        print(f"{name} roc_auc {roc_auc_score(true[:, i], pred[:, i])}")
    print(f"Evaluate loss {total_loss / len(iterator)}")

In [20]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
EPOCH_NUM = 4
# triangular learning rate, linearly grows untill half of first epoch, then linearly decays 
warmup_steps = int(0.5 * len(train_iterator))
total_steps = len(train_iterator) * EPOCH_NUM - warmup_steps
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-8)
# scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=total_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

In [None]:
for i in range(EPOCH_NUM):
    print('=' * 50, f"EPOCH {i}", '=' * 50)
    train(model, train_iterator, optimizer, scheduler)
    evaluate(model, dev_iterator)



100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 93914/93914 [4:59:59<00:00,  5.22it/s]


Train loss 0.19179787686077668


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4944/4944 [03:16<00:00, 25.14it/s]


non-similarity roc_auc 0.9768990039385351
similarity roc_auc 0.9768934935092332
Evaluate loss 0.17497685551643372


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 93914/93914 [4:59:18<00:00,  5.23it/s]


Train loss 0.16408025716438154


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4944/4944 [03:16<00:00, 25.16it/s]


non-similarity roc_auc 0.9808903759968274
similarity roc_auc 0.9808952689674788
Evaluate loss 0.16057398915290833


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 93914/93914 [4:59:23<00:00,  5.23it/s]


Train loss 0.13382706855373264


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4944/4944 [03:15<00:00, 25.31it/s]


non-similarity roc_auc 0.984459670963238
similarity roc_auc 0.984459702095607
Evaluate loss 0.14583845436573029


 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 86452/93914 [4:36:26<23:41,  5.25it/s]

In [15]:
# 保存模型
output_dir='./sim_model'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./sim_model\\vocab.txt',
 './sim_model\\special_tokens_map.json',
 './sim_model\\added_tokens.json')

In [16]:
def generate_text(s1,s2):
    # 移除第一句标点符号
    sentence1 = re.sub(r'[。，?]','',s1)
    orignal_text = tokenizer.encode(sentence1)
#             orignal_text = torch.LongTensor(text)
    # 移除第二句标点符号
    sentence2 = re.sub(r'[。，?]','',s2)
#             print(sentence2)
    trans_text = tokenizer.encode(sentence2, add_special_tokens=True)
#             print(tokenizer.decode(text[1:]))
    return torch.LongTensor([orignal_text+trans_text]).to(device)

In [17]:
def get_similarity(sentence1,sentence2):
    text = generate_text(sentence1,sentence2)
#     tokens = tokenizer.tokenize(comment)
#     sentence = comment
#     if len(tokens)>400:
#         sentence = comment[0:400]
#     text = [tokenizer.encode(sentence, add_special_tokens=True)]
#     text = torch.LongTensor(text)
    mask = (text != 0).float()
    pred = []
    with torch.no_grad():
        _,outputs = model(text, attention_mask=mask)
        pred = outputs.cpu().numpy().tolist()
#     pred = np.argmax(np.array(pred).reshape((-1,features_size,4)),axis=2)
    return pred[0]

In [18]:
text_a = '技术侦查措施只能在立案后采取'
text_b = '未立案不可以进行技术侦查'
get_similarity(text_b,text_a)

[0.6785768270492554, 0.6522148847579956]

In [None]:
get_similarity(text_b,text_a)