In [1]:
!pip install langchain accelerate peft

Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl.metadata (13 kB)
Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting langchain-community<0.1,>=0.0.32 (from langchain)
  Downloading langchain_community-0.0.33-py3-none-any.whl.metadata (8.5 kB)
Collecting langchain-core<0.2.0,>=0.1.42 (from langchain)
  Downloading langchain_core-0.1.43-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)
  Downloading langchain_text_splitters-0.0.1-py3-none-any.whl.metadata (2.0 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.48-py3-none-any.whl.metadata (13 kB)
Collecting packaging>=20.0 (from accelerate)
  Downloading packaging-23.2-py3-none-any.whl.metadata (3.2 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (49 kB)
[2K     [90m━━━━━━

**Load Fine-tuned Model**

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BartForConditionalGeneration
from peft import PeftModel
import torch

# Hub identifiers: the pretrained Chinese BART checkpoint and the adapter
# repository that is loaded on top of it.
base_model = "fnlp/bart-base-chinese"
new_model = "tonyma163/bart_v1"

# Options for loading the base checkpoint in half precision.
load_options = dict(
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
base_model_reload = BartForConditionalGeneration.from_pretrained(base_model, **load_options)
base_model_reload.half()

# Wrap the base model with the adapter weights from `new_model`.
model = PeftModel.from_pretrained(base_model_reload, new_model)

2024-04-17 13:12:24.466533: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-17 13:12:24.466664: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-17 13:12:24.622536: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/561M [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/7.09M [00:00<?, ?B/s]

In [3]:
from transformers import BertTokenizer

# BertTokenizer is a concrete class shipped with transformers, so there is no
# need to opt in to executing remote code when loading it.
tokenizer = BertTokenizer.from_pretrained(base_model)

# Only fall back to the EOS token when the tokenizer has no pad token of its
# own. Overwriting unconditionally would discard an existing pad token, and
# would set it to None if the tokenizer defines no EOS token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/479 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/259k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

In [6]:
from transformers import Text2TextGenerationPipeline

# Bundle the adapter-wrapped model and its tokenizer into a text-to-text
# generation pipeline.
pipe = Text2TextGenerationPipeline(
    tokenizer=tokenizer,
    model=model,
)

The model 'PeftModelForSeq2SeqLM' is not supported for . Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].


In [7]:
# Smoke-test the pipeline with a single question.
sample_question = '周迅的星座是什么？'
pipe(sample_question)



[{'generated_text': '周 迅 的 星 座 是 处 女 座 。'}]

**Loading Document**

In [8]:
import pandas as pd
import ast

file_path = "/kaggle/input/nlp-knowledge-set/knowledge_set.txt"

# Each non-empty line of the knowledge file is expected to be the textual
# representation of an (entity, category, answer) tuple.
data = []

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        text = line.strip()
        if not text:  # Ignore blank lines
            continue

        try:
            # Convert the string representation of a tuple to an actual tuple
            tuple_data = ast.literal_eval(text)
        except (SyntaxError, ValueError):
            # literal_eval raises ValueError (not just SyntaxError) when the
            # text parses as Python but is not a plain literal.
            print(f"Skipping malformed line: {text}")
            continue

        # A row with the wrong number of fields would break the three-column
        # DataFrame below, so treat it as malformed as well.
        if not isinstance(tuple_data, (tuple, list)) or len(tuple_data) != 3:
            print(f"Skipping malformed line: {text}")
            continue

        data.append(tuple_data)

# Load the parsed triples into a DataFrame
df = pd.DataFrame(data, columns=['Entity', 'Category', 'Answer'])

In [9]:
# Preview the first five parsed triples.
df.head(5)

Unnamed: 0,Entity,Category,Answer
0,西宁,2018-11-14,"阴,东风,最高气温:5℃,最低气温:-4℃"
1,何霄玲,喜好,poi
2,快乐大本营之快乐到家,评论,不好意思啊坡姐，我是你的路人黑，对不住了
3,辣相见川菜（三水总店）,特色菜,水煮鱼
4,浮城大亨,评论,人生是一幕大剧


**Knowledge Graph**

In [39]:
import networkx as nx
import pandas as pd

# Directed graph shaped Entity -> (Entity, Category) -> Answer.
G = nx.DiGraph()

# Walk the three columns in lockstep: one knowledge triple per DataFrame row.
for entity_value, category_value, answer_value in zip(df['Entity'], df['Category'], df['Answer']):
    # Category labels embed the entity label, so equal category names under
    # different entities map to different nodes.
    entity_node = f"Entity: {entity_value}"
    category_node = f"{entity_node}, Category: {category_value}"
    answer_node = f"Answer: {answer_value}"

    # Register the three nodes together with their type metadata.
    G.add_node(entity_node, type='Entity')
    G.add_node(category_node, type='Category', entity=entity_value)
    G.add_node(answer_node, type='Answer', category=category_value)

    # Chain them: entity -> category -> answer.
    G.add_edges_from([
        (entity_node, category_node),
        (category_node, answer_node),
    ])

In [41]:
# Example query: look up the answers stored for one (entity, category) pair.
entity_query = '西宁'
category_query = '2018-11-14'

# Rebuild the node labels in the same format the graph construction uses.
processed_entity_query = f"Entity: {entity_query}"
processed_category_query = f"{processed_entity_query}, Category: {category_query}"

# The pair is only valid when the entity node points directly at the category node.
if not G.has_edge(processed_entity_query, processed_category_query):
    print("No such category for the given entity or wrong category/entity combination.")
else:
    answers = []
    for node in G.successors(processed_category_query):
        if G.nodes[node]['type'] == 'Answer':
            answers.append(node)
    for answer in answers:
        print(answer)

Answer: 阴,东风,最高气温:5℃,最低气温:-4℃


**Query by Entity Only**

In [42]:
# Entity Only
def query_by_entity(graph, entity):
    """Collect every category of an entity together with that category's answers.

    Args:
        graph: Directed graph whose nodes carry a ``type`` attribute and whose
            entity nodes are labelled ``"Entity: <name>"``. It must support
            ``in``, ``successors(node)`` and ``nodes[node]``.
        entity: Bare entity name, without the ``"Entity: "`` prefix.

    Returns:
        A list of ``(category_node, answer_nodes)`` tuples, one per category
        node reachable from the entity. The list is empty when the entity is
        not in the graph, in which case a message is also printed.
    """
    # Construct the entity node identifier
    entity_node = f"Entity: {entity}"

    if entity_node not in graph:
        print(f"No information available for entity: {entity}")
        return []

    def successors_of_type(node, node_type):
        # Traverse the graph passed in by the caller rather than a
        # notebook-level global, so the function works for any graph.
        return [
            successor
            for successor in graph.successors(node)
            if graph.nodes[successor].get('type') == node_type
        ]

    return [
        (category, successors_of_type(category, 'Answer'))
        for category in successors_of_type(entity_node, 'Category')
    ]

In [44]:
# Look up everything the graph stores about a single entity.
entity = "周迅"
results = query_by_entity(G, entity)

# Report each category with its answers, or say that nothing came back.
if not results:
    print("No results found for the queried entity.")
else:
    print(f"Information for entity '{entity}':")
    for category, answers in results:
        print(f"\n{category}:")
        for answer in answers:
            print(f"  - {answer}")


Information for entity '周迅':

Entity: 周迅, Category: 评论:
  - Answer: 灵气逼人，倔犟，聪慧
  - Answer: 我的妈呦这土boqi嗓子
  - Answer: 每一个形象都演活了。
  - Answer: 竟然和我喜欢的刘若英是好友，果然都是小资
  - Answer: 在天朝唯一喜欢的演员
  - Answer: 喜欢她的身体散发出的各种灵气，甚至是她的嗓音·
  - Answer: 中国大陆目前演技最好的演员。
  - Answer: 到现在都还有灵气，真的很难得
  - Answer: amazinglady
  - Answer: 演技、容貌、修养，三方面俱佳，缺点太少。
  - Answer: 看完听风者终于坚定了支持周迅的决心~~
  - Answer: 演技和纯真的完美结合，不做作，不矫情
  - Answer: 小精灵，很享受她的美
  - Answer: 太喜欢了一直很喜欢。
  - Answer: #印象周迅#孙纳和张学宁
  - Answer: 美人~~~对演艺事业有强烈追求的演员~~
  - Answer: 挺喜欢她的~觉得演什么都像
  - Answer: 借用一句台词，“怎么会有如此好德又好色的人呢？”
  - Answer: 目前内地唯一有精湛演技的女生！
  - Answer: 这样的女人真的很别致
  - Answer: 公子！我宣你！我也宣萱！
  - Answer: #印象周迅#周公子生日快乐
  - Answer: 内地最好的女演员，没有之一。我说的是演技
  - Answer: 早期其实并不喜欢你……看了《李米的猜想》才喜欢上了你
  - Answer: 就是喜欢你。演技超好，脸蛋真漂漂。
  - Answer: 为什么打九分看看，《李米的猜想》
  - Answer: 大爱周迅！美丽、灵！
  - Answer: 有灵气的演员，喜欢她的笑
  - Answer: 最喜欢有独特气质的人了
  - Answer: 周迅，中国真正好的女演员
  - Answer: 我从来不知道什么叫淑女，更不装，我活的随意！
  - Answer: 她身上有股子灵气，学是学不来的~~~
  - Answer: 从大明宫词开始就很喜欢你那聪明空灵的劲。。。
  - Answe

**Document Splitting, Embedding, and Vector Store**

**Query Engine**

**Query the LLM**