<a href="https://colab.research.google.com/github/syoooooung/capstone_design/blob/main/Query_Decomposition/SubQ_NER_hit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook relies on. No versions are pinned,
# so the resolved versions depend on when the cell is run.
!pip install langchain openai ragas datasets faiss-cpu groq langchain_community evaluate anls sentence_transformers
# Upgrade langchain-openai to the newest available release.
!pip install -U langchain-openai

Collecting ragas
  Downloading ragas-0.2.6-py3-none-any.whl.metadata (8.1 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting groq
  Downloading groq-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.7-py3-none-any.whl.metadata (2.9 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting anls
  Downloading anls-0.0.2-py3-none-any.whl.metadata (5.2 kB)
Collecting tiktoken (from ragas)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting langchain-openai (from ragas)
  Downloading langchain_openai-0.2.9-py3-none-any.whl.metadata (2.6 kB)
Collecting appdirs (from ragas)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting p

## Sub-Query의 Entity List 추출

In [None]:
from google.colab import drive
from langchain.vectorstores import FAISS
import faiss
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import OpenAIEmbeddings
from google.colab import drive, userdata
import os

# Copy the Colab secret stored under the name 'OPENAI_API_KEY2' into the
# OPENAI_API_KEY environment variable.
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY2')

# Mount Google Drive so that files under /content/drive are reachable.
drive.mount('/content/drive')

# Path to an index file on the mounted Drive (presumably a FAISS index for a
# HotpotQA vector store — TODO confirm; it is not used in the cells visible here).
file_path = '/content/drive/MyDrive/Jimin/hotpotqa_vectorstore_noise_type3.index'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from langchain_openai import OpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.chat_models import ChatOpenAI

In [None]:
# System prompt for the entity-extraction LLM call. It states the extraction
# guidelines, asks for a comma-separated entity list with no extra text, and
# gives three worked examples. The text is sent to the model verbatim, so any
# wording change here changes the model's input.
NER_prompt = """
You are a capable entity extractor.
You need to extract all Entities from the given sentence.
When extract entity, follow these guidelines:
1. Entities in all noun forms must be extracted.
2. Extracts all entities with explicitly stated meanings in sentences. Extract entities as specifically as possible without duplicating.
3. All Entities should be individually meaningful, You shouldn't extract meaningless Entities such as Be verbs
4. if a relationship is not explicitly stated, connect and extract related entities. if there is no relationship between entities, list them separately.
   - Entities should be connected based on their semantic relationship or if they belong to the same category (e.g., nationality -> American).
   - Avoid connecting entities where the relationship is unclear or ambiguous.
5. interrogative word must should be treated as an Entity.
All Entities should be extracted in the form of Entities, Entities, Entities.
Over-extracting is better than missing out.
Don't print anything other than what you asked

e.g. )
Question: What measures might the international community take if X (formerly Twitter) fails to comply with the European Union's Code?

->
What, measures, international community, X (formerly Twitter), European Union's Code

e.g. )
Question: Who was the Super Bowl MVP in 1979 and 1980.

->
Who, Super Bowl MVP, 1979 and 1980

e.g. )
Question: Is Kelly coming to the party tonight?

->
Kelly, party, tonight
"""

In [None]:
import json
import openai
from tqdm import tqdm

# JSON file paths (the commented-out lines are alternative datasets/outputs)
#input_file = '/content/drive/MyDrive/Final/Dataset/MultiHop_RAG/multihop_langchainv1.json'  # input file path
#output_file = '/content/multihop_subQ_Entities_langchain.json'  # output file path
#input_file = '/content/drive/MyDrive/Final/Dataset/HotpotQA/hotpot_langchainv2.json'  # input file path
input_file = '/content/drive/MyDrive/Dataset/QA/decomposed_langchain/decomposed_hotpot_langchain1.json'
output_file = '/content/hotpot_subQ_Entities_langchain.json'  # output file path
model = "gpt-4o-mini"

# Extract entities through an LLM call
def get_entities(query):
    """Ask the chat model to list the entities found in ``query``.

    The module-level ``NER_prompt`` is sent as the system message and the
    query is sent as the user message, prefixed with ``"Question: "``.
    The request uses the module-level ``model`` and ``temperature=0``.

    Args:
        query: Text to extract entities from.

    Returns:
        The message content of the first returned choice, with leading and
        trailing whitespace removed.
    """
    system_message = {"role": "system", "content": NER_prompt}
    user_message = {"role": "user", "content": f"Question: {query}"}

    completion = openai.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
        temperature=0,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

# Load the input records; each record is expected to carry a "question" string
# and a "decomposed" list of sub-query strings (both are read below).
with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Questions whose LLM call raised. Their entity list is stored as empty, so they
# are tracked here to keep API failures distinguishable from genuine
# "no entities" results in the saved file.
failed_questions = []

for item in tqdm(data, desc="Processing queries", unit="query"):
    # Join all decomposed sub-queries so a single LLM call covers them
    concatenated_query = " ".join(item["decomposed"])

    # Call the LLM; on any error, log it and continue with an empty result
    try:
        entity_list = get_entities(concatenated_query)
    except Exception as e:
        print(f"Error processing query: {item['question']}\nError: {e}")
        failed_questions.append(item['question'])
        entity_list = ""

    # Split the comma-separated answer and drop blank fragments
    stripped_parts = (part.strip() for part in entity_list.split(','))
    entities = [part for part in stripped_parts if part]

    # Attach the result to the record under a new field
    item["rener"] = {"entities": entities}

# Surface how many records were saved with an empty list because of an error
if failed_questions:
    print(
        f"Entity extraction failed for {len(failed_questions)} of {len(data)} "
        "queries; their entity lists were saved empty."
    )

# Save the augmented records
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print(f"저장 완료: {output_file}")



Processing queries: 100%|██████████| 500/500 [05:07<00:00,  1.62query/s]

저장 완료: /content/hotpot_subQ_Entities_langchain.json





## Entity Test

In [None]:
# JSON file path for the MultiHop-RAG graph data (the original note says
# "by conjuction" — presumably the conjunction-based variant; TODO confirm)
multihop_graph_file = '/content/drive/MyDrive/Final/Dataset/MultiHop_RAG/multihop_graph.json'


In [None]:
# HotpotQA graph file. When cells run top to bottom, this assignment replaces
# the MultiHop-RAG path assigned to the same variable in the preceding cell.
multihop_graph_file = '/content/drive/MyDrive/Final/Dataset/HotpotQA/hotpot_graph.json'

In [None]:
# JSON file path for the sub-query entity file ("by langchain" variant);
# the commented-out lines are alternative inputs.
#multihop_subq_entities_file = '/content/multihop_subQ_Entities_mycon.json'
#multihop_subq_entities_file = '/content/multihop_subQ_Entities_langchain.json'
#multihop_subq_entities_file = '/content/hotpot_subQ_Entities_mycon.json'
multihop_subq_entities_file = '/content/hotpot_subQ_Entities_langchain.json'

In [None]:
# Read the two JSON inputs: the graph-side records and the sub-query entity records
with open(multihop_graph_file, encoding='utf-8') as graph_fp:
    multihop_graph = json.load(graph_fp)

with open(multihop_subq_entities_file, encoding='utf-8') as subq_fp:
    multihop_subq_entities = json.load(subq_fp)

In [None]:
# Index the sub-query records by lower-cased question so graph records can be
# matched case-insensitively. Entities and decomposed sentences are lower-cased
# too, because all comparisons below are done on lower-cased text.
subq_map = {
    item['question'].lower(): {
        "entities": {entity.lower() for entity in item['rener']['entities']},
        "decomposed": [sentence.lower() for sentence in item['decomposed']]
    }
    for item in multihop_subq_entities
}

# Per-question 0/1 flags and the detailed cases behind them
missing_counts = []
addition_counts = []
missing_cases = []  # questions where a graph entity is absent from the sub-queries
addition_cases = []  # questions where a sub-query entity is absent from the question
unmatched_questions = []  # graph questions with no sub-query record at all

for graph_item in multihop_graph:
    question = graph_item['question']
    question_lower = question.lower()  # hoisted: reused for lookup and matching
    graph_entities = {entity.lower() for entity in graph_item['rener']['entities']}

    subq_data = subq_map.get(question_lower)
    if subq_data is None:
        # No decomposition for this question: it is still scored against an
        # empty record (so every graph entity counts as missing), but it is
        # remembered so the inflation of the Missing rate can be reported.
        unmatched_questions.append(question)
        subq_data = {"entities": set(), "decomposed": []}

    # Missing: a graph entity that does not occur (as a substring) anywhere in
    # the concatenated decomposed sub-queries. Sorted so the reported list does
    # not depend on set iteration order, which differs between runs.
    decomposed_text = " ".join(subq_data["decomposed"])
    missing_entities = sorted(
        entity for entity in graph_entities if entity not in decomposed_text
    )

    missing = len(missing_entities) > 0
    missing_counts.append(1 if missing else 0)
    if missing:
        missing_cases.append({
            "question": question,
            "missing_entities": missing_entities
        })

    # Addition: a sub-query entity that does not occur (as a substring) in the
    # original question. Sorted for the same reproducibility reason.
    addition_entities = sorted(
        entity for entity in subq_data["entities"] if entity not in question_lower
    )
    addition = len(addition_entities) > 0
    addition_counts.append(1 if addition else 0)
    if addition:
        addition_cases.append({
            "question": question,
            "addition_entities": addition_entities
        })

# Fraction of questions with at least one missing / added entity
missing_avg = sum(missing_counts) / len(missing_counts) if missing_counts else 0
addition_avg = sum(addition_counts) / len(addition_counts) if addition_counts else 0

# Report the two rates
print(f"Missing 평균: {missing_avg:.2f}")
print(f"Addition 평균: {addition_avg:.2f}")

# Warn when graph questions were scored without any sub-query record
if unmatched_questions:
    print(
        f"\nWarning: {len(unmatched_questions)} graph question(s) had no matching "
        "sub-query record and were scored against an empty decomposition."
    )

# List the questions with missing entities
if missing_cases:
    print("\nMissing 발생 케이스:")
    for case in missing_cases:
        print(f"- Question: {case['question']}")
        print(f"  Missing Entities: {case['missing_entities']}")

# List the questions with added entities
if addition_cases:
    print("\nAddition 발생 케이스:")
    for case in addition_cases:
        print(f"- Question: {case['question']}")
        print(f"  Addition Entities: {case['addition_entities']}")

Missing 평균: 0.49
Addition 평균: 0.72

Missing 발생 케이스:
- Question: Peter Curtis and Scott Draper are both what?
  Missing Entities: ['what']
- Question: Were Illinois Institute of Technology and Boise State University both bounded before 1950?
  Missing Entities: ['1950']
- Question: Which American singer and songwriter has a mezzo-soprano vocal range, Tim Armstrong or Tori Amos?
  Missing Entities: ['which', 'american singer and songwriter']
- Question: Who has won more awards, Dan Schneider or Helen Hunt?
  Missing Entities: ['who']
- Question: Which was released first, The Climb or Voices of Iraq?
  Missing Entities: ['which']
- Question: Which film came out first, Still Kicking: Six Artistic Women of Project Arts & Longevity or Time Bombs?
  Missing Entities: ['which', 'film']
- Question: Which opera has more acts, Parsifal or Manon by Massenet?
  Missing Entities: ['which']
- Question: What type of music were vocalists Billie Joe Armstrong and Frank Iero involved with?
  Missing Enti