In [1]:
# Use %pip (not !pip) so the packages are installed into the environment of the
# running kernel; -q keeps the long installer log out of the notebook.
# Versions are pinned where the install log of the original run reported them.
# NOTE(review): the accelerate and jieba versions were not visible in that log — pin them once known.
%pip install -q langchain==0.1.16 accelerate peft==0.10.0 keybert==0.8.4 jieba

Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl.metadata (13 kB)
Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting keybert
  Downloading keybert-0.8.4.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting langchain-community<0.1,>=0.0.32 (from langchain)
  Downloading langchain_community-0.0.33-py3-none-any.whl.metadata (8.5 kB)
Collecting langchain-core<0.2.0,>=0.1.42 (from langchain)
  Downloading langchain_core-0.1.44-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)
  Downloading langchain_text_splitters-0.0.1-py3-none-any.whl.metadata (2.0 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.48-py3-none-any.whl.metadata (13 kB)
Collecting sentence-transformers>=0.3.8 (from keybert)
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Collecting packaging>=20.0 (from accelerate)
  Downloading packaging-

In [109]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BartForConditionalGeneration
from peft import PeftModel
import torch

# Hub identifiers: the base BART checkpoint and the PEFT adapter loaded on top
# of it.
base_model = "fnlp/bart-base-chinese"
new_model = "tonyma163/bart_v1"

# Fall back to the CPU when no GPU is present instead of failing on a
# hard-coded "cuda:0"; the value is unchanged on a GPU runtime.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Half precision is kept on the GPU; float32 is used on the CPU, where float16
# kernels are not reliably available.
model_dtype = torch.float16 if device.startswith("cuda") else torch.float32

base_model_reload = BartForConditionalGeneration.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=model_dtype,
    device_map=device,
)
# No additional `.half()` call is needed: `torch_dtype` already loads the
# weights in the requested precision.

# Wrap the base model with the adapter weights stored under `new_model`.
model = PeftModel.from_pretrained(base_model_reload, new_model)

config.json:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/561M [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/7.09M [00:00<?, ?B/s]

In [110]:
from transformers import BertTokenizer

# Load the vocabulary that ships with the base checkpoint.
tokenizer = BertTokenizer.from_pretrained(base_model, trust_remote_code=True)

# Borrow the EOS token for padding only when the tokenizer has no pad token of
# its own. Assigning unconditionally would replace an existing pad token, and
# would set it to None when the tokenizer defines no EOS token.
# NOTE(review): BERT-style tokenizers normally define [PAD] and no EOS token — confirm for this checkpoint.
if tokenizer.pad_token is None and tokenizer.eos_token is not None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/479 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/259k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

In [111]:
from transformers import Text2TextGenerationPipeline

# Build the text-to-text generation pipeline around the adapter-wrapped model.
# The "model ... is not supported" message in this cell's output names the PEFT
# wrapper class, which is not in the pipeline's list of supported model classes.
pipe = Text2TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
)

The model 'PeftModelForSeq2SeqLM' is not supported for . Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].


**Loading Document**

In [22]:
import pandas as pd
import ast

file_path = "/kaggle/input/nlp-knowledge-set/knowledge_set.txt"

# Every usable line is expected to be the Python literal of an
# (entity, category, answer) tuple.
EXPECTED_FIELDS = 3

data = []

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        stripped = line.strip()
        if not stripped:  # skip blank lines
            continue
        try:
            # Convert the string representation of the tuple to a real tuple.
            tuple_data = ast.literal_eval(stripped)
        except (SyntaxError, ValueError, TypeError):
            # literal_eval raises ValueError/TypeError (not only SyntaxError)
            # for text that parses as Python but is not a plain literal.
            print(f"Skipping malformed line: {stripped}")
            continue
        # A row of the wrong shape would make the DataFrame constructor fail
        # for the whole file, so it is reported and skipped as well.
        if not isinstance(tuple_data, (tuple, list)) or len(tuple_data) != EXPECTED_FIELDS:
            print(f"Skipping malformed line: {stripped}")
            continue
        data.append(tuple_data)

# Load the parsed rows into a DataFrame.
df = pd.DataFrame(data, columns=['Entity', 'Category', 'Answer'])

In [23]:
# Preview the first rows of the parsed knowledge set.
df.head()

Unnamed: 0,Entity,Category,Answer
0,西宁,2018-11-14,"阴,东风,最高气温:5℃,最低气温:-4℃"
1,何霄玲,喜好,poi
2,快乐大本营之快乐到家,评论,不好意思啊坡姐，我是你的路人黑，对不住了
3,辣相见川菜（三水总店）,特色菜,水煮鱼
4,浮城大亨,评论,人生是一幕大剧


**Knowledge Graph**

In [62]:
import networkx as nx
import pandas as pd

# Directed graph holding one chain per knowledge row:
#   entity -> category (scoped to the entity) -> answer (scoped to both).
G = nx.DiGraph()

for _, record in df.iterrows():
    # Labels are nested, so equal category or answer texts that belong to
    # different entities become distinct nodes.
    entity_label = f"Entity: {record['Entity']}"
    category_label = f"{entity_label} - Category: {record['Category']}"
    answer_label = f"{category_label} - Answer: {record['Answer']}"

    # Register the three nodes together with their attributes.
    for label, attrs in (
        (entity_label, {'type': 'Entity'}),
        (category_label, {'type': 'Category'}),
        (answer_label, {'type': 'Answer', 'answer': record['Answer']}),
    ):
        G.add_node(label, **attrs)

    # Link the chain.
    G.add_edge(entity_label, category_label)
    G.add_edge(category_label, answer_label)

In [63]:
# Example lookup: print every answer stored under one entity/category pair.
entity_query = '西宁'
category_query = '2018-11-14'

# Node labels are rebuilt in the nested "Entity: ... - Category: ..." form.
processed_entity_query = f"Entity: {entity_query}"
processed_category_query = f"{processed_entity_query} - Category: {category_query}"

# The pair is valid only when the entity node has an edge to the category node.
pair_exists = (processed_entity_query, processed_category_query) in G.edges
if not pair_exists:
    print("No such category for the given entity or wrong category/entity combination.")
else:
    answers = [
        successor
        for successor in G.successors(processed_category_query)
        if G.nodes[successor]['type'] == 'Answer'
    ]
    for answer in answers:
        print(answer)

Entity: 西宁 - Category: 2018-11-14 - Answer: 阴,东风,最高气温:5℃,最低气温:-4℃


**KeyBERT + Chinese Word Segmentation**

In [6]:
from keybert import KeyBERT
# Sample queries tried during development:
#   周迅 的 星座 是 什么 ?
#   你好，今天是几号了？

# Keyword extractor reused by the cells below; KeyBERT() is created with its
# default settings.
# NOTE(review): the queries are Chinese — confirm the default embedding model is adequate for them.
kw_model = KeyBERT()

In [107]:
import jieba

prompt = "周迅的星座是什么?"

# Chinese word segmentation: split the sentence with jieba and re-join the
# tokens with single spaces before keyword extraction (presumably so that
# KeyBERT sees word boundaries).
processed_prompt = ' '.join(jieba.lcut(prompt))

# Score single-word candidates only, without a stop-word list.
test_keywords_with_scores = kw_model.extract_keywords(
    processed_prompt,
    keyphrase_ngram_range=(1, 1),
    stop_words=None,
)

In [108]:
# Keep the keyword from each (keyword, score) pair and display the result.
test_keywords = [term for term, _score in test_keywords_with_scores]
test_keywords

['星座', '周迅', '什么']

**RAG**

In [90]:
def seg_keywords(query):
    """Segment ``query`` with jieba and return the keywords KeyBERT extracts.

    Args:
        query: Text to analyse.

    Returns:
        list: Keywords in the order returned by ``kw_model.extract_keywords``,
        with the scores dropped.
    """
    # Chinese word segmentation: tokens are joined with single spaces.
    spaced_text = ' '.join(jieba.lcut(query))

    # Single-word candidates only, without a stop-word list.
    scored = kw_model.extract_keywords(
        spaced_text,
        keyphrase_ngram_range=(1, 1),
        stop_words=None,
    )
    return [term for term, _score in scored]

In [91]:
# Fixed text that the graph-building cell puts in front of the raw values.
_ENTITY_PREFIX = "Entity: "
_CATEGORY_SEPARATOR = " - Category: "


def _strip_prefix(label, prefix):
    """Return ``label`` without ``prefix`` when it starts with it, else unchanged."""
    return label[len(prefix):] if label.startswith(prefix) else label


def retrieve_answers(graph, query, keywords=None):
    """Collect answers whose entity and category both match keywords of ``query``.

    An entity matches when at least one keyword is a substring of its name; a
    category of that entity matches when at least one *other* keyword is a
    substring of the category name. Matching is done on the raw names, not on
    the "Entity: " / " - Category: " label text, and ignores case.

    Args:
        graph: Graph whose nodes carry a ``type`` attribute ('Entity',
            'Category' or 'Answer'); answer nodes also carry ``answer``.
        query: The user question.
        keywords: Optional pre-extracted keywords. When omitted they are
            obtained from ``seg_keywords(query)``.

    Returns:
        list: The ``answer`` attribute of every matching answer node, in
        graph traversal order. Empty when nothing matches.
    """
    if keywords is None:
        keywords = seg_keywords(query)

    # Drop empty keywords (an empty string is a substring of everything) and
    # normalise case once so Latin-script names match regardless of casing.
    needles = [kw.casefold() for kw in keywords if kw]
    if not needles:
        return []

    found_answers = []
    for entity_node in graph.nodes:
        if graph.nodes[entity_node].get('type') != 'Entity':
            continue

        entity_label = str(entity_node)
        entity_name = _strip_prefix(entity_label, _ENTITY_PREFIX).casefold()
        entity_keywords = [kw for kw in needles if kw in entity_name]
        if not entity_keywords:
            continue

        # Explore each category node linked to the matching entity.
        for category_node in graph.successors(entity_node):
            # Category labels embed the entity label; remove it so that only
            # the category text itself is searched.
            category_name = _strip_prefix(
                str(category_node), entity_label + _CATEGORY_SEPARATOR
            ).casefold()
            category_keywords = [
                kw for kw in needles
                if kw in category_name and kw not in entity_keywords
            ]
            if not category_keywords:
                continue

            # Collect every answer stored under the matching category.
            for answer_node in graph.successors(category_node):
                attrs = graph.nodes[answer_node]
                if attrs.get('type') == 'Answer':
                    found_answers.append(attrs['answer'])

    return found_answers


In [104]:
# Run the retriever on two sample questions and print what it finds.
for input_query in ("你知道张国荣的星座吗？", "周迅的星座是什么?"):
    answers = retrieve_answers(G, input_query)
    print(f"Question: {input_query} Response: {answers}")

Question: 你知道张国荣的星座吗？ Response: ['处女座']
Question: 周迅的星座是什么? Response: ['天秤座']


**RAG + LLM**

In [114]:
def query_system(graph, query):
    """Answer ``query`` from the knowledge graph, else from the language model.

    Args:
        graph: Knowledge graph handed to ``retrieve_answers``.
        query: The user question.

    Returns:
        tuple: ``(source_label, payload)``. When the graph lookup returns a
        non-empty list the payload is that list; otherwise it is the
        ``generated_text`` of the first result returned by ``pipe``.
    """
    graph_hits = retrieve_answers(graph, query)
    if not graph_hits:
        # Nothing retrieved from the graph: defer to the language model.
        llm_outputs = pipe(query)
        return "Answer from language model:", llm_outputs[0]['generated_text']
    return "Answer from knowledge graph:", graph_hits

In [115]:
# Example question that is answered from the knowledge graph (see the output below).
query = "周迅的星座是什么？"
result = query_system(G, query)
print(result)

('Answer from knowledge graph:', ['天秤座'])


In [116]:
# Example question for which the graph lookup returns nothing, so the language
# model produces the answer (see the output below).
query = "亲爱的，你知道《思念的距离》的主唱是谁吗"
result = query_system(G, query)
print(result)



('Answer from language model:', '这 首 歌 的 主 唱 是 周 杰 伦 哦 。')
