In [1]:
import os
import pickle
import random
import inspect
import logging
import asyncio
import numpy as np
import nest_asyncio
from sqlalchemy.util import await_only
from lightrag.utils import EmbeddingFunc
from lightrag import LightRAG, QueryParam
from lightrag.llm.siliconcloud import siliconcloud_embedding
from lightrag.llm.ollama import ollama_model_complete, ollama_embed
from lightrag.llm.openai import openai_complete_if_cache, openai_embed

nest_asyncio.apply()

In [2]:
# Directory LightRAG is pointed at (passed as `working_dir` when `rag` is built below).
WORKING_DIR = "./rag_db/All_KG_LLAMA/"
logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
# makedirs also creates the missing parent ("./rag_db"), which os.mkdir cannot do, and
# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs(WORKING_DIR, exist_ok=True)

In [3]:
# Model identifiers used by the chat and embedding callbacks defined below.
chat_model = "Pro/meta-llama/Meta-Llama-3.1-8B-Instruct"
embed_model = 'BAAI/bge-m3'
# Take the credential from the environment so the real key never has to be written into
# the notebook; the literal placeholder stays as the fallback when the variable is unset.
api_key = os.environ.get('SILICONFLOW_API_KEY', 'your-api-key')

In [4]:
# Entity type labels; passed to LightRAG below as addon_params["entity_types"].
entities = [
    "brain tumor classification",
    "molecular diagnosis",
    "adult-type diffuse gliomas",
    "pediatric tumors",
    "diffuse glioma",
    "astrocytoma, IDH-mutant",
    "oligodendroglioma, IDH-mutant, and 1p/19q-codeleted",
    "glioblastoma, IDH-wildtype",
    "Diffuse midline glioma, H3 K27-altered",
    "diffuse hemispheric glioma",
    "infant-type hemispheric glioma",
    "diffuse astrocytoma",
    "polymorphous low-grade neuroepithelial tumor",
    "Pediatric-type diffuse low-grade gliomas",
    "ependymomas",
    "medulloblastoma",
    "choroid plexus tumors",
    "meningioma",
    "neurofibroma",
    "radiological features",
    "molecular markers",
    "diagnostic accuracy",
    "molecular classification",
    "radioactive treatment",
    "chemotherapy",
    "temozolomide",
    "surgery",
    "pathological biopsy",
]

In [5]:
# async def llm_model_func(
#     prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
# ) -> str:
#     return await openai_complete_if_cache(
#         chat_model,
#         prompt,
#         system_prompt=system_prompt,
#         history_messages=history_messages,
#         api_key=api_key,
#         base_url="https://api.siliconflow.cn/v1/",
#         **kwargs,
#     )

# async def llm_model_func(
#     prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
# ) -> str:
#     return await openai_complete_if_cache(
#         'llama3.3-70b-instruct',
#         prompt,
#         system_prompt=system_prompt,
#         history_messages=history_messages,
#         api_key='api-key',
#         base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
#         **kwargs,
#     )

async def llm_model_func(
    prompt, system_prompt=None, history_messages=None, keyword_extraction=False, **kwargs
) -> str:
    """Chat-completion callback given to LightRAG, wrapped in random pauses.

    Args:
        prompt: Prompt text forwarded to ``openai_complete_if_cache``.
        system_prompt: Optional system prompt, forwarded unchanged.
        history_messages: Earlier conversation messages. ``None`` (the default)
            is treated as an empty history.
        keyword_extraction: Accepted so callers may pass it by keyword; it is
            not forwarded to the completion call.
        **kwargs: Remaining keyword arguments, forwarded to
            ``openai_complete_if_cache``.

    Returns:
        The value returned by ``openai_complete_if_cache`` for ``chat_model``.
    """
    # Build a fresh list per call: the former ``[]`` default was one object shared by
    # every invocation, so any in-place change downstream would leak between calls.
    if history_messages is None:
        history_messages = []
    # Random pause before the request — presumably to spread concurrent calls out and
    # stay under the provider's rate limit (TODO confirm).
    await asyncio.sleep(random.randint(1, 10))
    results = await openai_complete_if_cache(
        chat_model,
        prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        api_key=api_key,
        base_url="https://api.siliconflow.cn/v1/",
        **kwargs,
    )
    # Second, longer random pause once the response has arrived; same presumed purpose.
    await asyncio.sleep(random.randint(5, 20))
    return results

async def embedding_func(texts: list[str]) -> np.ndarray:
    """Embed ``texts`` with ``embed_model`` by delegating to ``siliconcloud_embedding``."""
    request_options = {
        "model": embed_model,
        "api_key": api_key,
        "max_token_size": 8192,
    }
    vectors = await siliconcloud_embedding(texts, **request_options)
    return vectors

# Build the LightRAG instance on WORKING_DIR, wiring in the chat and embedding callbacks
# defined above and the custom entity type list.
rag = LightRAG(
    working_dir=WORKING_DIR,
    llm_model_func=llm_model_func,
    embedding_func=EmbeddingFunc(
        func=embedding_func,
        embedding_dim=1024,
        max_token_size=8192,
    ),
    chunk_token_size=8192,
    enable_llm_cache=False,
    addon_params=dict(entity_types=entities),
)

INFO:Logger initialized for working directory: ./rag_db/All_KG_LLAMA/
INFO:Load KV json_doc_status_storage with 0 data
INFO:Load KV llm_response_cache with 2 data
INFO:Load KV full_docs with 7 data
INFO:Load KV text_chunks with 263 data
INFO:Loaded graph from ./rag_db/All_KG_LLAMA/graph_chunk_entity_relation.graphml with 3827 nodes, 1181 edges
INFO:Load (3587, 1024) data
INFO:Init {'embedding_dim': 1024, 'metric': 'cosine', 'storage_file': './rag_db/All_KG_LLAMA/vdb_entities.json'} 3587 data
INFO:Load (1181, 1024) data
INFO:Init {'embedding_dim': 1024, 'metric': 'cosine', 'storage_file': './rag_db/All_KG_LLAMA/vdb_relationships.json'} 1181 data
INFO:Load (263, 1024) data
INFO:Init {'embedding_dim': 1024, 'metric': 'cosine', 'storage_file': './rag_db/All_KG_LLAMA/vdb_chunks.json'} 263 data
INFO:Loaded document status storage with 7 records


In [6]:
# import re
# from src.utils import MarkdownParser

# for i in os.listdir('raw_file/'):
#     if i.endswith('.md'):
#         with open(f'raw_file/{i}') as f:
#             rpl_content = f.read()
#         parser = MarkdownParser(rpl_content, a=1200, b=4800)
#         paragraphs = parser.get_paragraphs()
#         chunks = []
#         for paragraph in paragraphs:
#             chunk_content = re.sub(' +\n', '\n', paragraph.content)
#             chunk_content = re.sub('\n+', '\n', chunk_content)
#             chunks.append(f'''<header> {' -> '.join([f'{{{i}}}' for i in paragraph.headers])} </header> <content> {chunk_content} </content>''')
#         rag.insert('\n\n'.join(chunks), '\n\n', True)

In [7]:
# import networkx as nx
# from pyvis.network import Network

# # Load the GraphML file
# G = nx.read_graphml('./rag_db/All_KG/graph_chunk_entity_relation.graphml')

# # Create a Pyvis network
# net = Network(notebook=True)

# # Convert NetworkX graph to Pyvis network
# net.from_nx(G)

# # Save and display the network
# net.show('knowledge_graph.html')

In [8]:
import pandas as pd
from src.llm import async_response_with_llama
from src.prompts import query_system_rag_en, query_system_norag_en, judge_query_noRAG_en, judge_query_RAG_en

In [9]:
def _make_qid(row):
    """Return the "<tools>:<query_id>" identifier for one query row."""
    return f'{row["tools"]}:{row["query_id"]}'

# Load the English query table, tag every row with its QID, and keep the judge questions.
df_query = pd.read_table('query/Query_EN.txt')
df_query['QID'] = df_query.apply(_make_qid, axis=1)
is_judge = df_query['Attribute'] == 'judge'
df_judge = df_query.loc[is_judge].copy()

In [10]:
# Resume support: when a previous no-RAG judge result file exists, narrow df_judge to the
# questions whose saved answer contains the literal "Error" marker.
if os.path.exists('answer/llama_en_judge_noRag.txt'):
    dfM = pd.read_table('answer/llama_en_judge_noRag.txt')
    retry_list = [
        qid
        for qid, saved_answer in zip(dfM['QID'], dfM['ai_answer'])
        if '"Error"' in saved_answer
    ]
    if retry_list:
        print("retry list:", len(retry_list))
        df_judge = df_judge[df_judge['QID'].isin(retry_list)]

In [9]:
judge_sessions = {}
for k, v in zip(df_judge['QID'], df_judge['question']):
    messages = [
        {"role": 'system', 'content': query_system_norag_en}, 
        {"role": "user", 'content': judge_query_noRAG_en.render(query=v)}
    ]
    # query_id = f'{t}:{k}'
    judge_sessions[k] = messages
result = await async_response_with_llama(judge_sessions)

INFO:Retrying request to /chat/completions in 0.423886 seconds
INFO:Retrying request to /chat/completions in 0.418812 seconds
INFO:Retrying request to /chat/completions in 0.376837 seconds
INFO:Retrying request to /chat/completions in 0.403860 seconds
INFO:Retrying request to /chat/completions in 0.462784 seconds
INFO:Retrying request to /chat/completions in 0.487529 seconds
INFO:Retrying request to /chat/completions in 0.481760 seconds
INFO:Retrying request to /chat/completions in 0.489677 seconds
INFO:Retrying request to /chat/completions in 0.494941 seconds
INFO:Retrying request to /chat/completions in 0.432192 seconds
INFO:Retrying request to /chat/completions in 0.418961 seconds
INFO:Retrying request to /chat/completions in 0.470595 seconds
INFO:Retrying request to /chat/completions in 0.429977 seconds
INFO:Retrying request to /chat/completions in 0.454378 seconds
INFO:Retrying request to /chat/completions in 0.455701 seconds
INFO:Retrying request to /chat/completions in 0.423824 

In [10]:
# `result` only holds the rows submitted in this run; on a retry run that is just the
# previously failed subset. Fold it into what is already saved so earlier successful
# answers are kept instead of being overwritten by the smaller retry set (the selection
# cells of this notebook merge their results the same way).
judge_norag_path = 'answer/llama_en_judge_noRag.txt'
judge_norag_answers = {}
if os.path.exists(judge_norag_path):
    saved_norag = pd.read_table(judge_norag_path)
    judge_norag_answers.update(zip(saved_norag['QID'], saved_norag['ai_answer']))
judge_norag_answers.update(dict(result))  # fresh answers win over saved ones

dfq1a1 = pd.DataFrame(list(judge_norag_answers.items()), columns=['QID', 'ai_answer'])
# Score against the complete judge set: df_judge may have been narrowed to the retry rows.
judge_all = df_query[df_query['Attribute'] == 'judge']
dfM = pd.merge(judge_all, dfq1a1, on='QID')
# 1 when the reference answer occurs verbatim inside the model answer, else 0.
dfM['√'] = dfM.apply(lambda x: 1 if x['answer'] in x['ai_answer'] else 0, axis=1)
dfM.to_csv(judge_norag_path, sep='\t', index=False)
dfM['√'].value_counts()

√
1    587
0    253
Name: count, dtype: int64

In [14]:
async def async_rag_search(sessions, rag, top_k=1, max_concurrency=20, cooldown=30):
    """Run a "mix"-mode ``rag.aquery`` for every question with bounded concurrency.

    Args:
        sessions: Mapping of session id -> question text. Only the values are
            queried; results follow the mapping's iteration order.
        rag: Object exposing an async ``aquery(query, param=...)`` method.
        top_k: Forwarded as ``QueryParam(top_k=...)``.
        max_concurrency: Maximum number of ``aquery`` calls in flight at once.
        cooldown: Seconds to sleep after all queries have finished; a falsy
            value skips the pause.

    Returns:
        A list with one ``aquery`` result per entry of ``sessions``, in order.
    """
    semaphore = asyncio.Semaphore(max_concurrency)

    async def _bounded_query(question):
        # The semaphore has to be taken per query. Acquiring it once around gather(),
        # as before, let every query start at the same time.
        async with semaphore:
            return await rag.aquery(
                question,
                param=QueryParam(mode="mix", top_k=top_k, only_need_prompt=True),
            )

    # gather() returns results in argument order, so callers can pair them positionally
    # with the questions they passed in.
    results = await asyncio.gather(
        *(_bounded_query(question) for question in sessions.values())
    )
    # Trailing pause kept from the original — presumably a rate-limit cool-down before
    # the next batch of requests (TODO confirm it is still needed).
    if cooldown:
        await asyncio.sleep(cooldown)
    return results

In [15]:
if os.path.exists('llama_kgs_en_judge.pkl'):
    with open('llama_kgs_en_judge.pkl', 'rb') as f:
        kgs = pickle.load(f)
else:
    kgs = await async_rag_search(dict(zip(df_judge['QID'], df_judge['question'])), rag)
    with open('llama_kgs_en_judge.pkl', 'wb') as f:
        pickle.dump(kgs, f)

In [16]:
# Resume support for the RAG judge run: keep only the questions whose saved answer
# contains the literal "Error" marker.
if os.path.exists('answer/llama_en_judge_Rag.txt'):
    dfM = pd.read_table('answer/llama_en_judge_Rag.txt')
    saved_pairs = zip(dfM['QID'], dfM['ai_answer'])
    retry_list = [qid for qid, saved_answer in saved_pairs if '"Error"' in saved_answer]
    if len(retry_list) != 0:
        print("retry list:", len(retry_list))
        retry_mask = df_judge['QID'].isin(retry_list)
        df_judge = df_judge[retry_mask]

In [19]:
judge_sessions_rag = {}
for x, y, z in zip(df_judge['QID'], df_judge['question'], kgs):
    chunk = z.split('---Data Sources---')[1].split('---Response Requirements---')[0].strip()
    messages = [
        {"role": 'system', 'content': query_system_rag_en},
        {"role": "user", 'content': judge_query_RAG_en.render(query=y, content=chunk)}
    ]
    judge_sessions_rag[x] = messages
result2 = await async_response_with_llama(judge_sessions_rag)

INFO:Retrying request to /chat/completions in 0.493741 seconds
INFO:Retrying request to /chat/completions in 0.441279 seconds
INFO:Retrying request to /chat/completions in 0.452285 seconds
INFO:Retrying request to /chat/completions in 0.428626 seconds
INFO:Retrying request to /chat/completions in 0.441550 seconds
INFO:Retrying request to /chat/completions in 0.380758 seconds
INFO:Retrying request to /chat/completions in 0.453457 seconds
INFO:Retrying request to /chat/completions in 0.469818 seconds
INFO:Retrying request to /chat/completions in 0.499794 seconds
INFO:Retrying request to /chat/completions in 0.489210 seconds
INFO:Retrying request to /chat/completions in 0.450175 seconds
INFO:Retrying request to /chat/completions in 0.421503 seconds
INFO:Retrying request to /chat/completions in 0.463017 seconds
INFO:Retrying request to /chat/completions in 0.439514 seconds
INFO:Retrying request to /chat/completions in 0.477220 seconds
INFO:Retrying request to /chat/completions in 0.387363 

In [21]:
# `result2` only holds the rows submitted in this run; on a retry run that is just the
# previously failed subset. Fold it into what is already saved so earlier successful
# answers are kept instead of being overwritten by the smaller retry set (the selection
# cells of this notebook merge their results the same way).
judge_rag_path = 'answer/llama_en_judge_Rag.txt'
judge_rag_answers = {}
if os.path.exists(judge_rag_path):
    saved_rag = pd.read_table(judge_rag_path)
    judge_rag_answers.update(zip(saved_rag['QID'], saved_rag['ai_answer']))
judge_rag_answers.update(dict(result2))  # fresh answers win over saved ones

dfq1a2 = pd.DataFrame(list(judge_rag_answers.items()), columns=['QID', 'ai_answer'])
# Score against the complete judge set: df_judge may have been narrowed to the retry rows.
judge_all = df_query[df_query['Attribute'] == 'judge']
dfM2 = pd.merge(judge_all, dfq1a2, on='QID')
# 1 when the reference answer occurs verbatim inside the model answer, else 0.
dfM2['√'] = dfM2.apply(lambda x: 1 if x['answer'] in x['ai_answer'] else 0, axis=1)
dfM2.to_csv(judge_rag_path, sep='\t', index=False)
dfM2['√'].value_counts()

√
1    516
0    324
Name: count, dtype: int64

In [18]:
from src.prompts import sele_query_noRAG_en, sele_query_RAG_en

In [19]:
# Reload the query table, rebuild the "<tools>:<query_id>" identifiers, and keep the
# selection questions.
df_query = pd.read_table('query/Query_EN.txt')
df_query = df_query.assign(
    QID=df_query.apply(lambda row: f'{row["tools"]}:{row["query_id"]}', axis=1)
)
df_sele = df_query.loc[df_query['Attribute'].eq('selection')].copy()

In [23]:
# Resume support for the no-RAG selection run: narrow df_sele to the questions whose saved
# answer contains the literal "Error" marker, and keep all saved answers in result_dict so
# the retried ones can be merged back in later.
if os.path.exists('answer/llama_en_sele_noRag.txt'):
    dfM = pd.read_table('answer/llama_en_sele_noRag.txt')
    retry_list = [
        qid
        for qid, saved_answer in zip(dfM['QID'], dfM['ai_answer'])
        if '"Error"' in saved_answer
    ]
    if retry_list:
        print("retry list:", len(retry_list))
        needs_retry = df_sele['QID'].isin(retry_list)
        df_sele = df_sele[needs_retry]
        result_dict = dict(zip(dfM['QID'], dfM['ai_answer']))

retry list: 473


In [25]:
# Show only a handful of saved answers instead of echoing the whole mapping.
# `result_dict` is only assigned when the retry cell above found failed answers, so fall
# back to an empty mapping rather than raising NameError.
result_preview = dict(list(globals().get('result_dict', {}).items())[:5])
result_preview

{'deepseek:Query_1': '{"title": "Error",\n                        "content": "Failed to generate step after 5 attempts. Error: Request timed out.",\n                        "next_action": "final_answer"}',
 'deepseek:Query_2': '{"title": "Error",\n                        "content": "Failed to generate step after 5 attempts. Error: Request timed out.",\n                        "next_action": "final_answer"}',
 'deepseek:Query_3': '{"title": "Error",\n                        "content": "Failed to generate step after 5 attempts. Error: Request timed out.",\n                        "next_action": "final_answer"}',
 'deepseek:Query_4': '{"title": "Error",\n                        "content": "Failed to generate step after 5 attempts. Error: Request timed out.",\n                        "next_action": "final_answer"}',
 'deepseek:Query_5': '{"title": "Error",\n                        "content": "Failed to generate step after 5 attempts. Error: Request timed out.",\n                        "ne

In [26]:
sele_sessions = {}
for k, v in zip(df_sele['QID'], df_sele['question']):
    messages = [
        {"role": 'system', 'content': query_system_norag_en}, 
        {"role": "user", 'content': sele_query_noRAG_en.render(query=v)}
    ]
    sele_sessions[k] = messages
result = await async_response_with_llama(sele_sessions)

In [36]:
# Merge the fresh answers into any previously loaded ones (fresh answers win), then
# report how many answers still contain the "Error" marker.
result_update = dict(result)
try:
    result_dict.update(result_update)
except NameError:
    # No earlier answers were loaded in this kernel: start from the fresh ones.
    result_dict = result_update.copy()
retry_list = [qid for qid, answer in result_dict.items() if '"Error"' in answer]
if retry_list:
    print("retry list:", len(retry_list))

In [58]:
def _contains_reference(row):
    """Return 1 when the reference answer occurs verbatim inside the model answer, else 0."""
    return 1 if row['answer'] in row['ai_answer'] else 0

# Turn the QID -> answer mapping into a frame with columns (ai_answer, QID).
dfq2a1 = pd.DataFrame([result_dict]).T
dfq2a1.columns = ['ai_answer']
dfq2a1['QID'] = list(dfq2a1.index)
# Score against the complete selection set, not only the rows that were retried.
is_selection = df_query['Attribute'] == 'selection'
df_sele = df_query.loc[is_selection].copy()
dfM = df_sele.merge(dfq2a1, on='QID')
dfM['√'] = dfM.apply(_contains_reference, axis=1)
dfM.to_csv('answer/llama_en_sele_noRag.txt', sep='\t', index=False)
dfM['√'].value_counts()

√
1    644
0    196
Name: count, dtype: int64

In [60]:
df_sele = df_query[df_query['Attribute']=='selection'].copy()
if os.path.exists('llama_kgs_en_sele.pkl'):
    with open('llama_kgs_en_sele.pkl', 'rb') as f:
        kgs = pickle.load(f)
else:
    kgs = await async_rag_search(dict(zip(df_sele['QID'], df_sele['question'])), rag)
    # kgs = await async_rag_search(dict(zip(df_judge['QID'], df_judge['question'])), rag)
    with open('llama_kgs_en_sele.pkl', 'wb') as f:
        pickle.dump(kgs, f)

In [61]:
def _data_sources_section(kg_prompt):
    """Return the text between the '---Data Sources---' and '---Response Requirements---' markers."""
    after_header = kg_prompt.split('---Data Sources---')[1]
    return after_header.split('---Response Requirements---')[0].strip()

# Pair each selection question with its retrieved prompt by position and build the
# system/user messages for the RAG run, keyed by QID.
sele_sessions_rag = {
    qid: [
        {"role": 'system', 'content': query_system_rag_en},
        {"role": "user", 'content': sele_query_RAG_en.render(
            query=question, content=_data_sources_section(kg_prompt))},
    ]
    for qid, question, kg_prompt in zip(df_sele['QID'], df_sele['question'], kgs)
}

In [62]:
# Start from an empty mapping so the merge cell after the model call never builds on the
# no-RAG answers that an earlier section may have left in `result_dict`.
result_dict = {}
if os.path.exists('answer/llama_en_sele_Rag.txt'):
    dfM = pd.read_table('answer/llama_en_sele_Rag.txt')
    # An answer needs a retry when it carries the "Error" marker. A missing answer (read
    # back as NaN, i.e. not a string) is treated the same way instead of raising TypeError.
    retry_list = [
        qid
        for qid, saved_answer in zip(dfM['QID'], dfM['ai_answer'])
        if not isinstance(saved_answer, str) or '"Error"' in saved_answer
    ]
    if len(retry_list) > 0:
        print("retry list:", len(retry_list))
        # A set keeps the membership test constant-time while filtering the sessions.
        retry_ids = set(retry_list)
        sele_sessions_rag = {k: v for k, v in sele_sessions_rag.items() if k in retry_ids}
        print("retry dict:", len(sele_sessions_rag))
        # Keep every saved answer so the retried ones can be merged over them afterwards.
        result_dict = dict(zip(dfM['QID'], dfM['ai_answer']))

retry list: 147
retry dict: 147


In [63]:
# Submit the RAG selection sessions (narrowed to the retry subset when one was found above).
# NOTE(review): the next cell calls dict(result2), so this presumably returns
# (QID, answer) pairs — confirm against async_response_with_llama.
result2 = await async_response_with_llama(sele_sessions_rag)

In [65]:
# Merge the fresh RAG answers into any previously loaded ones (fresh answers win), then
# report how many answers still contain the "Error" marker.
result_update = dict(result2)
if 'result_dict' not in locals():
    result_dict = result_update.copy()
else:
    result_dict.update(result_update)
retry_list = [qid for qid, answer in result_dict.items() if '"Error"' in answer]
if retry_list:
    print("retry list:", len(retry_list))

In [66]:
# sele_sessions_rag = {}
# for k, v in zip(df_sele['query_id'], df_sele['question']):
#     kg = rag.query(v, param=QueryParam(mode="mix", top_k=1, only_need_prompt=True))
#     chunk = kg.split('---Data Sources---')[1].split('---Response Requirements---')[0].strip()
#     messages = [
#         {"role": 'system', 'content': query_system_rag_en},
#         {"role": "user", 'content': sele_query_RAG_en.render(query=v, content=chunk)}
#     ]
#     sele_sessions_rag[k] = messages
# result2 = await async_response_with_llama(sele_sessions_rag)

In [68]:
# Turn the QID -> answer mapping into a frame with columns (ai_answer, QID), join it onto
# the selection questions, mark verbatim matches of the reference answer, and save.
dfq2a2 = pd.DataFrame([result_dict]).T.set_axis(['ai_answer'], axis=1)
dfq2a2 = dfq2a2.assign(QID=dfq2a2.index.to_list())
dfM2 = df_sele.merge(dfq2a2, on='QID')
dfM2['√'] = dfM2.apply(lambda row: int(row['answer'] in row['ai_answer']), axis=1)
dfM2.to_csv('answer/llama_en_sele_Rag.txt', sep='\t', index=False)
dfM2['√'].value_counts()

√
1    639
0    201
Name: count, dtype: int64