In [1]:
from ltp import LTP
import torch
from transformers import AutoTokenizer, AutoModel

In [7]:
# Sentence-similarity test: the tokenizer and the encoder are loaded from the
# same checkpoint, so the checkpoint name is spelled once.

BERT_CHECKPOINT = "bert-base-chinese"
tokenizer, model = (
    AutoTokenizer.from_pretrained(BERT_CHECKPOINT),
    AutoModel.from_pretrained(BERT_CHECKPOINT),
)

def get_bert_embedding(text):
    """Return the model's first-position output vector for ``text``.

    The text is encoded with the module-level ``tokenizer``, wrapped into a
    batch of one, and passed through the module-level ``model`` with
    gradient tracking disabled.

    Args:
        text: The string to embed.

    Returns:
        Element ``[0][0]`` of the model's first output, i.e. the vector at
        sequence position 0 of the only batch item (presumably the ``[CLS]``
        token for a BERT tokenizer -- verify against the tokenizer config).
    """
    token_ids = tokenizer.encode(text)
    batch = torch.tensor([token_ids])  # extra list level = batch dimension of 1
    with torch.no_grad():
        outputs = model(batch)
    hidden_states = outputs[0]
    first_sequence = hidden_states[0]
    return first_sequence[0]

def cosine_similarity(a, b, eps=1e-8):
    """Cosine similarity between two 1-D tensors.

    Args:
        a: First vector (1-D tensor, as required by ``torch.dot``).
        b: Second vector, same length as ``a``.
        eps: Lower bound applied to the product of the two norms so that a
            zero-norm input yields 0 instead of NaN.

    Returns:
        A scalar tensor holding ``dot(a, b) / max(|a| * |b|, eps)``.
    """
    norm_product = torch.norm(a) * torch.norm(b)
    # Clamp only guards the degenerate case; for ordinary vectors the
    # denominator is untouched and the result matches the plain formula.
    return torch.dot(a, b) / torch.clamp(norm_product, min=eps)

# Compare the embeddings of two single-character inputs.
text1, text2 = "我", "他"

# Embed in the same order as before: text1 first, then text2.
embedding1, embedding2 = [get_bert_embedding(sample) for sample in (text1, text2)]

similarity = cosine_similarity(embedding1, embedding2)
print(similarity)


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor(0.9323)


In [2]:
class Get_Name:
    """Extract person names from text using an LTP pipeline.

    Names are collected from two sources:

    * NER entities whose label is ``"Nh"``;
    * SRL ``"A0"`` arguments whose words are all POS-tagged ``"nh"``.
    """

    # POS tags accepted by ``check_noun``.
    _NAME_POS_TAGS = ("nh",)

    def __init__(self,model="LTP/base1",GPU=True):
        """Load the LTP model and optionally move it to CUDA.

        Args:
            model: Path or identifier passed to ``LTP``. Other checkpoints
                such as "LTP/base", "LTP/small" or "LTP/tiny" presumably
                work as well -- confirm against the LTP release in use.
            GPU: When true and CUDA is available, the model is moved to
                ``"cuda"``.
        """
        # Use the caller's choice; previously the argument was ignored and
        # "LTP/base1" was always loaded.
        self.ltp = LTP(model)

        if GPU and torch.cuda.is_available():
            self.ltp.to("cuda")

    def get_name(self,text,min_count=1):
        """Return the distinct person names found in ``text``.

        Args:
            text: The passage to analyse.
            min_count: Minimum number of times a string must appear as an
                SRL ``"A0"`` argument before it is considered as a name.

        Returns:
            A list of names without duplicates: NER hits first (in pipeline
            order), followed by accepted SRL candidates.
        """
        # cws: word segmentation, pos: part-of-speech tagging,
        # ner: named entities, srl: semantic roles.  Results also depend on
        # the wording of the passage.  The formerly requested "sdp" task was
        # never read, so it is no longer computed.
        result = self.ltp.pipeline([text], tasks = ["cws","pos","ner","srl"])

        names = []
        for entity in result["ner"][0]:
            # entity[0] is the label, entity[1] the entity text.
            if entity[0] == "Nh" and entity[1] not in names:
                names.append(entity[1])

        for candidate, count in self.find_noun(result["srl"]).items():
            # Skip known names before the POS check to avoid a needless
            # pipeline call.
            if count < min_count or candidate in names:
                continue
            if self.check_noun(candidate):
                names.append(candidate)

        return names

    def find_noun(self,words):
        """Count how often each string occurs as an ``"A0"`` argument.

        Args:
            words: Batched SRL output; only the first batch item is read.
                Each entry is a mapping with an ``"arguments"`` sequence of
                ``(role, text, ...)`` items.

        Returns:
            A dict mapping argument text to its number of ``"A0"``
            occurrences.
        """
        counts = {}
        for predicate in words[0]:
            for argument in predicate["arguments"]:
                if argument[0] == "A0":
                    counts[argument[1]] = counts.get(argument[1], 0) + 1

        return counts

    def check_noun(self,text):
        """Return True when every word of ``text`` carries a name POS tag.

        Args:
            text: Candidate string to validate.

        Returns:
            True if the POS task yields at least one tag and all tags are in
            ``_NAME_POS_TAGS``; False otherwise.
        """
        result = self.ltp.pipeline([text], tasks = ["pos"])
        tags = result["pos"]
        # The pipeline is called with a one-element batch, so the tags are
        # nested one level deep (as with result["ner"][0] in get_name).
        # Comparing the outer list's items against "nh" always failed.  A
        # flat list of tags is still accepted as-is.
        if tags and not isinstance(tags[0], str):
            tags = tags[0]

        return bool(tags) and all(tag in self._NAME_POS_TAGS for tag in tags)

In [3]:
# Sample passage used to exercise the name extractor.
text = """
陸立鼎站起身來，正要入內與娘子商議如何應敵，陸二娘已走到廳上。陸立鼎將血手印指給她看，又說了墳破屍失之事。
陸二娘皺眉道：「兩個孩子送到那裏去躲避？」陸立鼎指著牆上血手印道：「兩個孩子也在數內，這魔頭既按下了血手印，只怕輕易躲避不了。
嘿，咱兩個枉自練了這些年武功，這人進出我家，我們沒半點知覺，這……這……」陸二娘望著白牆，抓住椅背，道：「為甚麼九個手印？咱們家裏可只有七口。」
"""

# Build the extractor with its defaults; the last expression is the cell's displayed result.
GN = Get_Name()
GN.get_name(text)

['陸立鼎', '陸', '陸二娘']
