In [1]:
from neo4j import GraphDatabase

def export_entities():
    uri = "bolt://localhost:7687"
    user = "neo4j"
    password = "neo4j123456"
    
    with GraphDatabase.driver(uri, auth=(user, password)) as driver:
        with driver.session() as session:
            # 导出所有教师、课程等实体
            result1 = session.run("""
                MATCH (n:Person)
                RETURN n.name
            """)
            result2 = session.run("""
                MATCH (n:Course)
                RETURN n.courseName
            """)
            result3 = session.run("""
                MATCH (n:College)
                RETURN n.name
            """)

            result4 = session.run("""
                MATCH (n:Classroom)
                RETURN n.name
            """)

            with open("./entities.txt", "w", encoding="utf-8") as f:
                for record in result1:
                    f.write(record.value() + ",Person\n")
                for record in result2:
                    if record.value() is not None:
                        f.write(record.value() + ",Course\n")
                for record in result3:
                    f.write(record.value() + ",College\n")
                for record in result4:
                    f.write(record.value() + ",Classroom\n")

if __name__ == "__main__":
    export_entities()

In [1]:
from pypinyin import lazy_pinyin, Style

with open('../agents/data/entities.txt', 'r', encoding='utf-8') as f:
    entities = f.read().split('\n')

pinyin_tntities = []
for entity in entities:
    if not entity:
        continue
    tmp = entity.split(',')
    assert len(tmp) == 2
    py = ''.join(lazy_pinyin(tmp[0], style=Style.NORMAL))
    pinyin_tntities.append(py+','+tmp[1])

with open('pinyin_entities.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(pinyin_tntities))

In [9]:
with open('entities.txt', 'r', encoding='utf-8') as f:
    entities = f.read().split('\n')
with open('entities.txt', 'w', encoding='utf-8') as f:
    for entity in entities:
        if entity:
            f.write(entity + '\n')

In [19]:
# spell_checker.py
from symspellpy import SymSpell, Verbosity
import json
from typing import Tuple
import re

class SpellChecker:
    def __init__(self):
        # 初始化拼写检查器
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2)
        
        # 加载自定义词典（需提前从知识图谱导出实体）
        self._load_custom_dictionary("./entities.txt")
        
        # 加载通用词典
        self.sym_spell.load_dictionary('./entities.txt', term_index=0, count_index=1)

    def _load_custom_dictionary(self, path: str):
        """从知识图谱导出实体构建专用词典"""
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                entity = line.strip()
                # 为每个实体设置较高频率（确保优先匹配）
                self.sym_spell.create_dictionary_entry(entity, 1000000)

    def correct_query(self, query: str) -> Tuple[str, dict]:
        """
        返回纠正后的查询与纠错元数据
        """
        # 分句处理（防止跨句纠错）
        sentences = re.split(r'[。！？]', query)
        
        corrections = []
        for sent in sentences:
            if not sent:
                continue
                
            # 获取纠错建议
            suggestions = self.sym_spell.lookup_compound(
                sent, 
                max_edit_distance=2,
                transfer_casing=True
            )
            
            if suggestions:
                corrected = suggestions[0].term
                if corrected != sent:
                    corrections.append({
                        "original": sent,
                        "corrected": corrected,
                        "distance": suggestions[0].distance
                    })
                sent = corrected
        
        return '。'.join(sentences), {"corrections": corrections}

if __name__ == '__main__':
    checker = SpellChecker()
    query = "王金老师教授什么课程？"
    corrected_query, metadata = checker.correct_query(query)
    print(corrected_query)
    print(json.dumps(metadata, indent=2))

王金老师教授什么课程。
{
  "corrections": []
}




In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("ckiplab/bert-base-chinese-ner")
model = AutoModelForTokenClassification.from_pretrained("ckiplab/bert-base-chinese-ner")

nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

text = "请介绍一下王静老师在文汇楼3栋3201上的课程"

entities = nlp(text)
for entity in entities:
    print(entity)


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


{'entity_group': 'PERSON', 'score': 0.99999905, 'word': '王', 'start': 5, 'end': 6}
{'entity_group': 'PERSON', 'score': 0.9999846, 'word': '静', 'start': 6, 'end': 7}
{'entity_group': 'FAC', 'score': 0.99999654, 'word': '文 汇', 'start': 10, 'end': 12}
{'entity_group': 'FAC', 'score': 0.9999957, 'word': '楼', 'start': 12, 'end': 13}
{'entity_group': 'CARDINAL', 'score': 0.99997795, 'word': '3', 'start': 13, 'end': 14}


In [10]:
import jieba

# 自定义实体词典
jieba.load_userdict("./entities.txt")

# 输入文本
text = "请介绍一下王静老师在文汇楼3栋3201上的计算机网络课程。"

# 分词
words = jieba.cut(text)

# 输出分词结果
print(' '.join(words))


请 介绍 一下 王静 老师 在 文汇楼3栋3201 上 的 计算机网络 课程 。


In [11]:
import jieba

jieba.load_userdict("entities.txt")

# 同时将词表存入集合方便快速查询
with open("entities.txt", "r", encoding="utf-8") as f:
    entity_set = set(line.strip() for line in f)

In [13]:
def detect_error_words(text):
    words = jieba.lcut(text)
    error_candidates = []
    for word in words:
        if word not in entity_set and len(word) > 1:  # 过滤单字和非实体词
            error_candidates.append(word)
    return words, error_candidates

# 示例
text = "王静老师教授的课程有哪些？"
words, error_candidates = detect_error_words(text)
print("分词结果:", words)          # ['我', '打算', '去', '腾训', '公司', '参观', '，', '地址', '在', '南京市', '的', '鼓楼区']
print("候选错误词:", error_candidates)  # ['腾训']

分词结果: ['王静', '老师', '教授', '的', '课程', '有', '哪些', '？']
候选错误词: ['老师', '教授', '课程', '哪些']


In [24]:
import jieba
from fuzzywuzzy import fuzz, process

# 实体词表（教师名，课程名，教学楼名，教室名）
teachers = ["王津", "李华", "赵刚", "张丽"]
courses = ["算法设计与分析", "数据结构", "人工智能", "数据库系统"]
buildings = ["教学楼A", "教学楼B", "教学楼C"]
classrooms = ["101教室", "102教室", "201教室", "202教室"]

# 错别字纠错字典（可以扩展）
error_dict = {
    "王金": "王津",  # 错别字
    "算法课": "算法设计与分析",  # 简写
}

# 使用 fuzzywuzzy 提高匹配准确度
def correct_using_fuzzy(input_word, word_list):
    # 使用fuzzywuzzy进行模糊匹配，返回匹配度高的实体
    matched_word, score = process.extractOne(input_word, word_list, scorer=fuzz.partial_ratio)
    # 设置阈值，低于该阈值则认为没有匹配
    if score > 80:
        return matched_word
    return input_word

# 实体修正函数，结合 fuzzywuzzy 和字典
def correct_entities(text, entity_dict):
    words = jieba.cut(text)  # 使用jieba进行分词
    corrected_words = []
    
    for word in words:
        # 如果存在错误的实体，则进行修正
        if word in entity_dict:
            corrected_words.append(entity_dict[word])
        # 使用 fuzzywuzzy 进行模糊匹配修正
        else:
            corrected_teacher = correct_using_fuzzy(word, teachers)
            corrected_course = correct_using_fuzzy(word, courses)
            corrected_building = correct_using_fuzzy(word, buildings)
            corrected_classroom = correct_using_fuzzy(word, classrooms)
            
            # 将修正后的实体添加到结果中
            corrected_words.append(corrected_teacher if corrected_teacher != word else
                                   corrected_course if corrected_course != word else
                                   corrected_building if corrected_building != word else
                                   corrected_classroom if corrected_classroom != word else word)

    return ''.join(corrected_words)

# 示例输入
user_input = "王近老师的算法课在哪里上？"

# 进行实体修正
corrected_input = correct_entities(user_input, error_dict)

# 输出修正后的结果
print("原输入:", user_input)
print("修正后:", corrected_input)




原输入: 王近老师的算法课在哪里上？
修正后: 王近老师的算法设计与分析课在哪里上？


In [7]:
from neo4j import GraphDatabase

# 连接到Neo4j数据库
uri = "bolt://localhost:7687"  # Neo4j的地址
username = "neo4j"  # 用户名
password = "neo4j123456"  # 密码

# 创建Neo4j数据库驱动
driver = GraphDatabase.driver(uri, auth=(username, password))

# 插入数据的函数
def insert_triplets(tx):
    triplets = [
        ("杨振宁", "出生于", "中国安徽合肥"),
        ("杨振宁", "毕业于", "国立西南联合大学"),
        ("杨振宁", "博士毕业于", "芝加哥大学"),
        ("杨振宁", "获奖", "诺贝尔物理学奖"),
        ("杨振宁", "研究领域", "理论物理"),
        ("杨振宁", "父亲是", "杨武之"),
        ("诺贝尔物理学奖", "奖励领域", "物理学"),
        ("杨-米尔斯理论", "属于", "量子场论领域"),
        ("杨-米尔斯理论", "提出者", "杨振宁"),
        ("杨振宁", "合作者", "李政道")
    ]
    for subj, pred, obj in triplets:
        tx.run(
            "MERGE (a:Entity {name: $subj}) "
            "MERGE (b:Entity {name: $obj}) "
            "MERGE (a)-[:" + pred + "]->(b)",
            subj=subj, obj=obj
        )

# 插入数据
with driver.session() as session:
    session.write_transaction(insert_triplets)

print("数据插入完成！")

# 关闭驱动
driver.close()

  session.write_transaction(insert_triplets)


数据插入完成！


In [None]:
from pypinyin import pinyin, lazy_pinyin, Style

with open('../agents/data/entities.txt', 'r', encoding='utf-8') as f:
    entities = [line.strip() for line in f if line.strip().strip()] 
p2h = {}
entities = set(entities)
for entity in entities:
    py = ''.join(lazy_pinyin(entity, style=Style.NORMAL))
    if py in p2h:
        p2h[py].append(entity)
    else:
        p2h[py] = [entity]
for k,v in p2h.items():
    if len(v) > 1:
        print(k, v)

In [None]:
def load_entities(entities_path='../agents/data/entities.txt'):
        with open(entities_path, 'r', encoding='utf-8') as f:
            entities = [line.strip() for line in f if line.strip().strip()]
        entities = set(entities)
        p2h = {}
        for entity in entities:
            py = ''.join(lazy_pinyin(entity, style=Style.NORMAL))
            if py in p2h:
                p2h[py].append(entity)
            else:
                p2h[py] = [entity]
        return entities, p2h



False


In [15]:
import json

In [16]:
j = json.loads("{'person': ['王静'], 'classroom': ['格物楼1栋1507'], 'course': ['算法课'], 'department': ['信院']}")

JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)