In [1]:
import os
import json
import pandas as pd
from tqdm import tqdm
from collections import Counter

from dotenv import load_dotenv
load_dotenv()

import matplotlib.pyplot as plt
# Use the 'Malgun Gothic' font family for every figure
# (presumably chosen so Korean labels render — TODO confirm the font is installed where this runs).
plt.rcParams['font.family'] = 'Malgun Gothic'
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

In [2]:
import sys
# Extend the module search path with the custom ragas code directory.
# The path is relative, so it resolves against the kernel's working directory.
# NOTE(review): the project-local imports that follow presumably live under this directory — confirm.
sys.path.append('../code/ragas_custom')

from analyze_knowledgeGraph.relation import analyze_relationship
from generate_dataset.fastMultiAbstract import FastMultiHopAbstractQuerySynthesizer
from generate_dataset.utils import translate, analyze_synthetic_data
from generate_dataset.customPersonas import personas
from generate_dataset.customSenarios import make_scenarios
from generate_dataset.customMultiSpecific import SingleRelationMultiHopScenario, MultiRelationMultiHopScenario
from retrieve.config import generate_retriever_configs
from retrieve.precompute import precompute_retrievals
from retrieve.sparse import BM25
from evaluation.retrieve import optimization

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset.graph import KnowledgeGraph
from ragas.testset.persona import generate_personas_from_kg
from ragas.testset import TestsetGenerator
from ragas.testset.synthesizers.single_hop.specific import SingleHopSpecificQuerySynthesizer
from ragas.testset.synthesizers.multi_hop.specific import MultiHopSpecificQuerySynthesizer

from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document

from langchain_community.vectorstores import FAISS

  from .autonotebook import tqdm as notebook_tqdm


# 1. 지식 그래프

In [3]:
# Load the previously saved knowledge graph from its JSON file.
kg = KnowledgeGraph.load('../data/document/역도/kg.json')

In [4]:
def _in_section_one(node):
    """Return True when the node's 'heading1' metadata contains the numeral 'Ⅰ'."""
    heading1 = node.properties['document_metadata']['heading']['heading1']
    return heading1.count('Ⅰ') > 0


# Keep only the nodes whose top-level heading matches.
nodes = [candidate for candidate in kg.nodes if _in_section_one(candidate)]

# Keep a relationship only when both endpoints match (target is checked first).
relations = [
    edge
    for edge in kg.relationships
    if _in_section_one(edge.target) and _in_section_one(edge.source)
]

# Replace the full graph with the filtered sub-graph.
kg = KnowledgeGraph(nodes, relations)

In [5]:
# Chat model used for generation, wrapped in the ragas LangChain adapter.
_chat_model = ChatOpenAI(model='gpt-4o-mini')
generator_llm = LangchainLLMWrapper(_chat_model)

# Embedding model (library defaults), wrapped in the ragas LangChain adapter.
_embedding_model = OpenAIEmbeddings()
generator_embeddings = LangchainEmbeddingsWrapper(_embedding_model)

# 2. RAGAS Basic 데이터셋 생성

## 2-1. 페르소나 생성

In [6]:
# Build the persona list that the basic TestsetGenerator receives as `persona_list`.
# This statement was commented out, which left `generated_personas` undefined on a
# fresh kernel (NameError under Restart & Run All).
# NOTE: this calls the generator LLM to produce 4 personas.
generated_personas = generate_personas_from_kg(kg=kg, llm=generator_llm, num_personas=4)

## 2-2. Basic 데이터셋 생성

In [8]:
# Synthesizers for the basic test set, each paired with its sampling weight
# (0.3 + 0.3 + 0.4 = 1.0).
query_distribution = [
    (SingleHopSpecificQuerySynthesizer(llm=generator_llm), 0.3),
    (MultiHopSpecificQuerySynthesizer(llm=generator_llm), 0.3),
    (FastMultiHopAbstractQuerySynthesizer(llm=generator_llm), 0.4)
]

# Generator bound to the filtered knowledge graph and the generated personas.
generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings,
    knowledge_graph=kg,
    persona_list=generated_personas,
)

# Run the generation in this cell. It was commented out, so `testset` on the next
# line only existed as leftover kernel state and the cell failed on a fresh kernel.
# NOTE: this calls the generator LLM.
testset = generator.generate(testset_size=15, query_distribution=query_distribution)
basic_testset = testset.to_pandas()

Generating Scenarios: 100%|██████████| 3/3 [00:16<00:00,  5.58s/it]
Generating Samples: 100%|██████████| 16/16 [00:05<00:00,  3.03it/s]


In [10]:
# Translation step, currently disabled.
# basic_testset = translate(basic_testset)
# Pass the generated samples, the knowledge graph and the 'heading2' key to
# analyze_synthetic_data; the returned value replaces basic_testset.
basic_testset = analyze_synthetic_data(kg, basic_testset, 'heading2')

# CSV export, currently disabled. The retrieval-optimization section reads this same path.
# basic_testset.to_csv('../data/modular/dataset/basic_section1.csv', index=False)

# 3. RAGAS Custom 데이터셋 생성

In [17]:
# Display the hand-written persona list imported from generate_dataset.customPersonas.
personas

[Persona(name='Middle-Aged Beginner', role_description='User aged 40-60+ who is sensitive to injury prevention and posture correction. Pursues safe exercise methods considering joint health and overall physical condition.'),
 Persona(name='Diet-Motivated Beginner', role_description='Primary goal is body transformation and weight loss, interested in weightlifting-based routines and diet plans. Values information about exercise effectiveness and calorie consumption.'),
 Persona(name='Amateur Weightlifting Competitor', role_description='Preparing for amateur weightlifting competitions, interested in weight management and pre-competition routine planning. Needs information about competition preparation process and weight management.'),
 Persona(name='Fitness Gym Novice', role_description='Beginner who has received basic training at the gym but is new to weightlifting. Hopes for proper feedback on form and movement.'),
 Persona(name='Weightlifting Performance Optimizer', role_description='P

## 3-1. 시나리오 생성

In [6]:
# Unpack the four collections returned by make_scenarios for the 'heading2' level.
nameOnly, bodyOnly, namePhase, nameBody = make_scenarios(kg, heading='heading2')

# Print the size of each collection, one count per line, in the order returned.
for scenario_group in (nameOnly, bodyOnly, namePhase, nameBody):
    print(len(scenario_group))

7
21
0
3


## 3-2. Custom 데이터셋 생성

In [10]:
# Relation-type labels and heading level shared by the custom scenario synthesizers.
name_overlap = 'exercise_entities_exercise_name_overlap'
body_overlap = 'exercise_entities_body_part_overlap'
heading_level = 'heading2'

# Single-relation synthesizer over the exercise-name relations.
name_synthesizer = SingleRelationMultiHopScenario(
    llm=generator_llm,
    relation_list=nameOnly,
    name='exercise name',
    relation_type=name_overlap,
    heading=heading_level,
)
# Single-relation synthesizer over the body-part relations.
body_synthesizer = SingleRelationMultiHopScenario(
    llm=generator_llm,
    relation_list=bodyOnly,
    name='body part',
    relation_type=body_overlap,
    heading=heading_level,
)
# Multi-relation synthesizer that uses both relation types.
name_body_synthesizer = MultiRelationMultiHopScenario(
    llm=generator_llm,
    relation_list=nameBody,
    name='exercise name, body part',
    relation_types=[name_overlap, body_overlap],
    heading=heading_level,
)

# Sampling weights: 0.2 + 0.6 + 0.2 = 1.0.
query_distribution = [
    (name_synthesizer, 0.2),
    (body_synthesizer, 0.6),
    (name_body_synthesizer, 0.2),
]

# Generator bound to the filtered knowledge graph and the hand-written personas.
generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings,
    knowledge_graph=kg,
    persona_list=personas,
)

In [11]:
# Run the custom generation in this cell. It was commented out, so `testset` below
# was whatever object an earlier cell had left in the kernel: a NameError on a fresh
# kernel, or silently the wrong test set if another generation had run before.
# NOTE: this calls the generator LLM.
testset = generator.generate(testset_size=15, query_distribution=query_distribution)
custom_testset = testset.to_pandas()

Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

exercise name, body part: No clusters found in the knowledge graph. Check relation types and relation list.


Generating Scenarios: 100%|██████████| 3/3 [00:01<00:00,  1.77it/s]
Generating Samples: 100%|██████████| 12/12 [00:05<00:00,  2.39it/s]


In [14]:
# Translation step, currently disabled.
# custom_testset = translate(custom_testset)
# Pass the generated samples, the knowledge graph and the 'heading2' key to
# analyze_synthetic_data; the returned value replaces custom_testset.
custom_testset = analyze_synthetic_data(kg, custom_testset, 'heading2')
# CSV export, currently disabled. The retrieval-optimization section reads this same path.
# custom_testset.to_csv('../data/modular/dataset/custom_section1.csv', index=False)

# 4. First-Stage Retrieve 최적화

In [20]:
# Reload both saved test sets from disk so this section does not depend on the
# in-memory frames produced by the generation cells.
basic_testset = pd.read_csv('../data/modular/dataset/basic_section1.csv')
custom_testset = pd.read_csv('../data/modular/dataset/custom_section1.csv')

# Stack the two frames with a fresh 0..n-1 index. Without ignore_index=True each
# frame keeps its own 0-based labels, so the merged frame has duplicate index
# values and label-based access (.loc) can return more than one row.
merged_testset = pd.concat([basic_testset, custom_testset], ignore_index=True)
# CSV export, currently disabled.
# merged_testset.to_csv('../data/modular/dataset/merged_section1.csv', index=False)

In [7]:
# Collect, in one pass over the graph, the raw text of every node and a LangChain
# Document carrying that text plus the node's document metadata.
documents, texts = [], []
for kg_node in kg.nodes:
    node_props = kg_node.properties
    documents.append(
        Document(
            page_content=node_props['page_content'],
            metadata=node_props['document_metadata'],
        )
    )
    texts.append(node_props['page_content'])

# Three BM25 retrievers with k=15: two with an explicit `type`, one with the default.
kiwi_pos = BM25(k=15, type='kiwi_pos')
kiwi = BM25(k=15, type='kiwi')
bm25 = BM25(k=15)

# Index the same texts in each sparse retriever, in the original call order.
for sparse_retriever in (kiwi_pos, kiwi, bm25):
    sparse_retriever.from_texts(texts)

# Dense index: embed the documents with OpenAI embeddings and store them in FAISS.
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(documents, embeddings)

In [None]:
# Retrieval precompute step, currently disabled.
# NOTE(review): `optimization` may expect the columns this step adds to merged_testset —
# confirm the CSVs loaded in this section already contain them.
# merged_testset = precompute_retrievals(merged_testset, db, bm25, kiwi, kiwi_pos)
# Build the retriever configurations and run the optimization over the merged test set.
configs = generate_retriever_configs()
optimization_result = optimization(configs, merged_testset)

In [17]:
# Show the first 20 rows after sorting by recall, then map, then ndcg, all descending.
optimization_result.sort_values(by=['recall', 'map', 'ndcg'], ascending=False).head(20)

Unnamed: 0,k,alpha,dense_type,morphological_analyzer,fetch_k,lambda_mult,score_threshold,ndcg,recall,map
27,15,100,mmr,,1.5,0.3,,0.783093,0.050143,0.821852
28,15,100,mmr,,1.5,0.5,,0.783093,0.050143,0.821852
29,15,100,mmr,,1.5,0.7,,0.783093,0.050143,0.821852
30,15,100,mmr,,2.0,0.3,,0.783093,0.050143,0.821852
31,15,100,mmr,,2.0,0.5,,0.783093,0.050143,0.821852
32,15,100,mmr,,2.0,0.7,,0.783093,0.050143,0.821852
33,15,100,mmr,,2.5,0.3,,0.783093,0.050143,0.821852
34,15,100,mmr,,2.5,0.5,,0.783093,0.050143,0.821852
35,15,100,mmr,,2.5,0.7,,0.783093,0.050143,0.821852
18,10,100,mmr,,1.5,0.3,,0.748668,0.033728,0.817954


In [18]:
# Show the first 40 rows after sorting by recall, then map, then ndcg, all descending.
optimization_result.sort_values(by=['recall', 'map', 'ndcg'], ascending=False).head(40)

Unnamed: 0,k,alpha,dense_type,morphological_analyzer,fetch_k,lambda_mult,score_threshold,ndcg,recall,map
27,15,100,mmr,,1.5,0.3,,0.783093,0.050143,0.821852
28,15,100,mmr,,1.5,0.5,,0.783093,0.050143,0.821852
29,15,100,mmr,,1.5,0.7,,0.783093,0.050143,0.821852
30,15,100,mmr,,2.0,0.3,,0.783093,0.050143,0.821852
31,15,100,mmr,,2.0,0.5,,0.783093,0.050143,0.821852
32,15,100,mmr,,2.0,0.7,,0.783093,0.050143,0.821852
33,15,100,mmr,,2.5,0.3,,0.783093,0.050143,0.821852
34,15,100,mmr,,2.5,0.5,,0.783093,0.050143,0.821852
35,15,100,mmr,,2.5,0.7,,0.783093,0.050143,0.821852
18,10,100,mmr,,1.5,0.3,,0.748668,0.033728,0.817954
