In [2]:
from transformers import BertTokenizer, BertForTokenClassification
import torch

# Initialise the tokenizer and the token-classification model.
# NOTE: the warning emitted on load reports that some BertForTokenClassification
# weights are not initialised from the 'bert-base-cased' checkpoint, so the
# LABEL_0 / LABEL_1 predictions printed below should not be read as real tags.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForTokenClassification.from_pretrained('bert-base-cased')

# Input sentence.
sentence = "My dog loves to play in the park."

# Encode the sentence as PyTorch tensors.
inputs = tokenizer(sentence, return_tensors="pt")

# Inference only: no_grad() skips autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

# Highest-scoring class id per token; squeeze(0) drops only the batch dimension.
predictions = outputs.logits.argmax(dim=-1).squeeze(0).tolist()

# Map class ids to the label names stored in the model config.
labels = [model.config.id2label[prediction] for prediction in predictions]

# Print every input token (special tokens included) next to its predicted label.
for token, label in zip(inputs['input_ids'].squeeze(0).tolist(), labels):
    print(f"{tokenizer.decode([token])}: {label}")


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

[CLS]: LABEL_1
My: LABEL_1
dog: LABEL_1
loves: LABEL_1
to: LABEL_1
play: LABEL_1
in: LABEL_0
the: LABEL_1
park: LABEL_1
.: LABEL_1
[SEP]: LABEL_1


In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load the "dslim/bert-large-NER" checkpoint together with its tokenizer.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

# Read the story text. The context manager closes the file handle as soon as the
# text is in memory instead of leaving it to the garbage collector.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open("./story/content/using.txt", "r", encoding="utf-8") as story_file:
    content = story_file.read()

# Build the "ner" pipeline around the model and tokenizer loaded above.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Run the pipeline over the whole story text.
ner_results = nlp(content)



In [4]:
# Print the raw pipeline output. `ner_results` is not created in this cell, so the
# cell that assigns it has to run first.
# NOTE(review): this prints the entire list in one go; consider previewing a slice.
print(ner_results)


[{'entity': 'B-PER', 'score': 0.96768314, 'index': 8, 'word': 'Cinderella', 'start': 31, 'end': 41}, {'entity': 'B-PER', 'score': 0.91253775, 'index': 21, 'word': 'Cinderella', 'start': 95, 'end': 105}, {'entity': 'B-PER', 'score': 0.91550875, 'index': 115, 'word': 'Cinderella', 'start': 484, 'end': 494}, {'entity': 'B-PER', 'score': 0.88089794, 'index': 190, 'word': 'Cinderella', 'start': 815, 'end': 825}, {'entity': 'B-PER', 'score': 0.9327669, 'index': 249, 'word': 'Cinderella', 'start': 1025, 'end': 1035}, {'entity': 'B-PER', 'score': 0.84263504, 'index': 339, 'word': 'Cinderella', 'start': 1333, 'end': 1343}, {'entity': 'B-PER', 'score': 0.92706114, 'index': 346, 'word': 'Cinderella', 'start': 1367, 'end': 1377}, {'entity': 'B-PER', 'score': 0.8823574, 'index': 436, 'word': 'Cinderella', 'start': 1714, 'end': 1724}, {'entity': 'B-PER', 'score': 0.8681753, 'index': 468, 'word': 'Cinderella', 'start': 1830, 'end': 1840}, {'entity': 'B-PER', 'score': 0.80215275, 'index': 497, 'word':

In [3]:
# Entity tags that count as a person mention.
table = ["B-PER", "I-PER"]

# Distinct person words, kept in order of first appearance.
names = []
for entity in ner_results:
    # Ignore anything that is not tagged as a person.
    if entity["entity"] not in table:
        continue
    # Ignore predictions scored below the 0.9 threshold.
    if not entity["score"] >= 0.9:
        continue
    word = entity["word"]
    if word in names:
        continue
    names.append(word)

print(names)



['Tina', 'Jessica', 'Mary']


In [28]:
import spacy

# Load the English "en_core_web_sm" pipeline.
nlp = spacy.load("en_core_web_sm")

# Sentence to analyse.
sentence = "Denny is my father"

# Run the pipeline over the sentence.
doc = nlp(sentence)

# Show each token next to its dependency label.
for tok in doc:
    word, relation = tok.text, tok.dep_
    print(f"{word}: {relation}")

Denny: nsubj
is: ROOT
my: poss
father: attr


In [15]:
# Read the whole story. The context manager closes the file handle right after the
# read; the explicit encoding keeps the curly quotes in the text intact regardless
# of the platform default.
with open("./cinderella.txt", "r", encoding="utf-8") as story_file:
    content = story_file.read()

# Alternative short sample text, kept for quick experiments:
#text = "We were all out at the zoo one day, I was doing some acting, walking on the railing of the gorilla exhibit. I fell in. Everyone screamed and Tommy jumped in after me, forgetting that he had blueberries in his front pocket. The gorillas just went wild."

# `nlp` is not created in this cell; a cell that assigns it has to run first.
doc = nlp(content)

# Print the text one detected sentence at a time.
for sent in doc.sents:
    print(sent)

Once upon a time a girl named Cinderella lived with her stepmother and two stepsisters.  
Poor Cinderella had to work hard all day long so the others could rest.
It was she who had to wake up each morning when it was still dark and cold to start the fire.  
It was she who cooked the meals.
It was she who kept the fire going.
The poor girl could not stay clean, from all the ashes and cinders by the fire.

“What a mess!”
her two stepsisters laughed.  
And that is why they called her “Cinderella.”

One day, big news came to town.  
The King and Queen were going to have a ball!  
It was time for the Prince to find a bride.
All of the young ladies in the land were invited to come.  
They were wild with joy!
They would wear their most beautiful gown and fix their hair extra nice.
Maybe the prince would like them!

At Cinderella’s house, she now had extra work to do.  
She had to make two brand-new gowns for her step-sisters.  

“Faster!” shouted one step-sister.

“You call that a dress?” scr

In [17]:
import spacy

# Load the English "en_core_web_sm" pipeline.
nlp = spacy.load("en_core_web_sm")

# Read the story. The context manager closes the file handle right after the read.
with open("./cinderella.txt", "r", encoding="utf-8") as story_file:
    content = story_file.read()

# Sample text; not used by the rest of this cell, which analyses `content`.
text = "John is a dog"

# Run the pipeline over the full story.
doc = nlp(content)

# Distinct PERSON entity texts in order of first appearance. dict.fromkeys
# de-duplicates while preserving insertion order, replacing the repeated
# "not in list" scan.
names = list(dict.fromkeys(ent.text for ent in doc.ents if ent.label_ == "PERSON"))

print(names)


['Cinderella', 'Faster', 'Fairy Godmother', 'Fairy', 'Queen', 'Mother']


In [26]:
# Compare two documents, once in each direction.
# NOTE(review): if `nlp` is the "en_core_web_sm" pipeline used in the neighbouring
# cells, it presumably has no word vectors, which would make this score a weak
# similarity signal — confirm.
doc_1 = nlp("Snow White's")
doc_2 = nlp("Snow White")

for left, right in ((doc_1, doc_2), (doc_2, doc_1)):
    print(left.similarity(right))

0.637396521506983
0.637396521506983


  print(doc_1.similarity(doc_2))
  print(doc_2.similarity(doc_1))


In [40]:
import spacy
import pandas as pd

# 1. Loading the language library
nlp = spacy.load('en_core_web_sm')

# Read the story. The context manager closes the file handle right after the read.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open("./snow_white.txt", "r", encoding="utf-8") as story_file:
    content = story_file.read()

# 2. Building a Pipeline Object
doc = nlp(content)

# Only PERSON entities are printed, so the label description is the same on every
# row; look it up once instead of once per entity.
person_description = str(spacy.explain("PERSON"))

for sent in doc.sents:
    # NOTE(review): each sentence is parsed a second time on its own here. The
    # entities found this way may differ from those in `doc.ents`, so the re-parse
    # is kept as-is — confirm whether it is intentional.
    sentence = nlp(str(sent))
    for entity in sentence.ents:
        if entity.label_ == "PERSON":
            # Text and label are left-aligned and padded with '-' to 20 characters.
            print(f"{entity.text:-<20}{entity.label_:-<20}{person_description}")

# # 3. Using Tokens
# for token in doc:
#     print(f"{token.text:{12}}{token.pos_:{12}}{token.dep_:{12}}{token.lemma_}")

# Named Entity
# for entity in doc.ents:
#     print(f"{entity.text:-<{20}}{entity.label_:-<{20}}{str(spacy.explain(entity.label_))}")

# # Noun Chunks
# for chunk in doc.noun_chunks:
#     print(chunk.text)

# # Built-in Visualizers
# from spacy import displacy
# displacy.render(doc, style='dep', jupyter=True, options={'distance':90})

# # Visualizing the entity recognizer
# displacy.render(doc, style='ent', jupyter=True)

Snow White----------PERSON--------------People, including fictional
Snow White----------PERSON--------------People, including fictional
Thou----------------PERSON--------------People, including fictional
Snow White----------PERSON--------------People, including fictional
Queen---------------PERSON--------------People, including fictional
Snow White's--------PERSON--------------People, including fictional
Snow White----------PERSON--------------People, including fictional
Snow White----------PERSON--------------People, including fictional
Snow White----------PERSON--------------People, including fictional
Snow White----------PERSON--------------People, including fictional
Snow White----------PERSON--------------People, including fictional
Snow White----------PERSON--------------People, including fictional
Snow White----------PERSON--------------People, including fictional
Snow White----------PERSON--------------People, including fictional
Snow White----------PERSON--------------People, 

In [41]:
import spacy

# Load the English "en_core_web_sm" pipeline.
nlp = spacy.load('en_core_web_sm')

# Read the story. The context manager closes the file handle right after the read.
with open("./cinderella.txt", "r", encoding="utf-8") as story_file:
    content = story_file.read()

doc = nlp(content)

# Report every token whose dependency label contains "subj".
for token in doc:
    if "subj" in token.dep_:
        print(f"The subject is: {token.text}")

The subject is: girl
The subject is: Cinderella
The subject is: others
The subject is: It
The subject is: who
The subject is: it
The subject is: It
The subject is: who
The subject is: It
The subject is: who
The subject is: girl
The subject is: stepsisters
The subject is: that
The subject is: they
The subject is: news
The subject is: King
The subject is: It
The subject is: Prince
The subject is: All
The subject is: They
The subject is: They
The subject is: prince
The subject is: she
The subject is: She
The subject is: Faster
The subject is: You
The subject is: other
The subject is: dear
The subject is: Cinderella
The subject is: I
The subject is: stepmother
The subject is: you
The subject is: WHAT
The subject is: girl
The subject is: I
The subject is: You
The subject is: Who
The subject is: YOU
The subject is: sister
The subject is: They
The subject is: All
The subject is: Cinderella
The subject is: they
The subject is: they
The subject is: I
The subject is: I
The subject is: I
The subj

In [None]:
# `token.dep_` 属性返回的是 token 在依存句法分析中的依存关系标签，表示该 token 与其句法中心词（head）之间的语法关系。spaCy 的英文模型使用 ClearNLP 风格的依存标签（例如 `dobj`、`pobj`、`attr`），与通用依存关系（Universal Dependencies）标签集并不完全相同。

# 下面是一些常见的依存关系标签及其含义：

# - `nsubj`：名词性主语
# - `dobj`：直接宾语
# - `iobj`：间接宾语
# - `aux`：助动词
# - `amod`：形容词修饰语
# - `advmod`：副词修饰语
# - `prep`：介词
# - `pobj`：介词宾语
# - `conj`：并列成分（与前一个并列项相连的词；并列连词本身的标签是 `cc`）

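# 您可以用 `spacy.explain("nsubj")` 这样的调用查看某个标签的简要说明。[通用依存关系网站](https://universaldependencies.org/u/dep/index.html)列出的是 Universal Dependencies 标签及其定义，可作参考，但其中部分标签与 spaCy 英文模型实际输出的标签（如 `dobj`、`pobj`）并不一致。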