In [1]:
import asyncio
import json
import numpy as np
from typing import List, Dict, Any
import sys
import os
from dotenv import load_dotenv

load_dotenv()

# 프로젝트 루트 경로 추가
sys.path.append('/Users/jang-wonjun/Desktop/Dev/FOM2025_Summer_Conf')

# 필요한 모듈들 임포트
from backend.app.shared.infra.external.supabase_client import supabase_client
from backend.app.shared.infra.external.openai_client import openai_client

In [3]:
async def test_db_connection():
    """DB 연결 테스트"""
    try:
        # 간단한 쿼리로 연결 테스트
        result = supabase_client.client.table("papers").select("id, title").execute()
        print("✅ DB 연결 성공!")
        print(f"총 {len(result.data)}개 논문 조회됨")
        
        # 첫 번째 논문 정보 출력
        if result.data:
            first_paper = result.data[0]
            print(f"첫 번째 논문: ID={first_paper['id']}, 제목={first_paper['title'][:50]}...")
        
        return True
    except Exception as e:
        print(f"❌ DB 연결 실패: {e}")
        return False

# 실행
await test_db_connection()

✅ DB 연결 성공!
총 1000개 논문 조회됨
첫 번째 논문: ID=4017, 제목=MultiGO: Towards Multi-level Geometry Learning for...


True

In [4]:
async def test_embedding_generation():
    """임베딩 생성 테스트"""
    test_queries = [
        "RAG retrieval augmented generation",
        "large language models",
        "computer vision object detection",
        "machine learning deep learning"
    ]
    
    for query in test_queries:
        try:
            print(f"\n🔍 쿼리: {query}")
            
            # 임베딩 생성
            embedding = await openai_client.generate_embedding(query)
            print(f"✅ 임베딩 생성 성공! 차원: {len(embedding)}")
            print(f"임베딩 샘플: {embedding[:5]}...")
            
        except Exception as e:
            print(f"❌ 임베딩 생성 실패: {e}")

# 실행
await test_embedding_generation()


🔍 쿼리: RAG retrieval augmented generation
✅ 임베딩 생성 성공! 차원: 1536
임베딩 샘플: [0.019824672, 0.0017523596, 0.0031451173, 0.004810837, -0.015121903]...

🔍 쿼리: large language models
✅ 임베딩 생성 성공! 차원: 1536
임베딩 샘플: [0.008157737, 0.00031622127, 0.034374364, -0.045386944, -0.0640415]...

🔍 쿼리: computer vision object detection
✅ 임베딩 생성 성공! 차원: 1536
임베딩 샘플: [-0.017754117, -0.016087063, -0.025196318, -0.035103377, -0.036651354]...

🔍 쿼리: machine learning deep learning
✅ 임베딩 생성 성공! 차원: 1536
임베딩 샘플: [-0.02746794, -0.054169122, 0.009009394, -0.033038203, 0.023814568]...


In [5]:
def calculate_cosine_similarity(vec1: List[float], vec2: List[float]) -> float:
    """코사인 유사도 계산"""
    try:
        vec1 = np.array(vec1, dtype=float)
        vec2 = np.array(vec2, dtype=float)
        
        if vec1.shape != vec2.shape:
            return 0.0
            
        dot_product = np.dot(vec1, vec2)
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)
        
        if norm1 == 0 or norm2 == 0:
            return 0.0
            
        return float(dot_product / (norm1 * norm2))
    except Exception as e:
        print(f"유사도 계산 오류: {e}")
        return 0.0

async def test_cosine_similarity():
    """코사인 유사도 테스트"""
    # 테스트용 임베딩들
    query = "RAG retrieval augmented generation"
    query_embedding = await openai_client.generate_embedding(query)
    
    # DB에서 몇 개 논문 가져오기
    result = supabase_client.client.table("papers").select("id, title, combined_embedding").limit(10).execute()
    
    similarities = []
    for paper in result.data:
        try:
            # 임베딩 파싱
            if isinstance(paper['combined_embedding'], str):
                paper_embedding = json.loads(paper['combined_embedding'])
            else:
                paper_embedding = paper['combined_embedding']
            
            # 유사도 계산
            similarity = calculate_cosine_similarity(query_embedding, paper_embedding)
            similarities.append({
                'id': paper['id'],
                'title': paper['title'][:50] + "...",
                'similarity': similarity
            })
            
        except Exception as e:
            print(f"논문 {paper['id']} 처리 실패: {e}")
    
    # 유사도 순으로 정렬
    similarities.sort(key=lambda x: x['similarity'], reverse=True)
    
    print(f"\n🔍 쿼리: {query}")
    print("�� 유사도 결과 (상위 5개):")
    for i, item in enumerate(similarities[:5]):
        print(f"{i+1}. ID: {item['id']}, 유사도: {item['similarity']:.3f}")
        print(f"   제목: {item['title']}")

# 실행
await test_cosine_similarity()


🔍 쿼리: RAG retrieval augmented generation
�� 유사도 결과 (상위 5개):
1. ID: 10, 유사도: 0.571
   제목: Unsupervised Information Refinement Training of La...
2. ID: 4, 유사도: 0.321
   제목: MAGE: Machine-generated Text Detection in the Wild...
3. ID: 11, 유사도: 0.270
   제목: CSCD-NS: a Chinese Spelling Check Dataset for Nati...
4. ID: 3, 유사도: 0.265
   제목: Unsupervised Multimodal Clustering for Semantics D...
5. ID: 7, 유사도: 0.255
   제목: Exploring Chain-of-Thought for Multi-modal Metapho...


In [6]:
async def test_conference_search():
    """학회별 논문 검색 테스트"""
    field = "Natural Language Processing (NLP)"
    keywords = ["RAG"]
    
    print(f"🔍 분야: {field}")
    print(f"�� 키워드: {keywords}")
    
    try:
        # 학회별 Top-3 검색
        papers = await supabase_client.get_top_papers_by_conference(field, keywords, 3)
        
        print(f"\n✅ 총 {len(papers)}개 논문 검색됨")
        
        # 학회별로 그룹화
        conference_groups = {}
        for paper in papers:
            conference = paper.get('conference', 'Unknown')
            if conference not in conference_groups:
                conference_groups[conference] = []
            conference_groups[conference].append(paper)
        
        # 결과 출력
        for conference, papers_in_conf in conference_groups.items():
            print(f"\n�� {conference} ({len(papers_in_conf)}개):")
            for i, paper in enumerate(papers_in_conf):
                print(f"  {i+1}. ID: {paper['id']}, 유사도: {paper.get('similarity_score', 0):.3f}")
                print(f"     제목: {paper['title'][:60]}...")
                
    except Exception as e:
        print(f"❌ 검색 실패: {e}")

# 실행
await test_conference_search()

🔍 분야: Natural Language Processing (NLP)
�� 키워드: ['RAG']

✅ 총 8개 논문 검색됨

�� ACL Anthology (ACL, EMNLP, NAACL, COLING) (3개):
  1. ID: 703, 유사도: 0.416
     제목: DRAGIN: Dynamic Retrieval Augmented Generation based on the ...
  2. ID: 586, 유사도: 0.403
     제목: RAGTruth: A Hallucination Corpus for Developing Trustworthy ...
  3. ID: 10, 유사도: 0.394
     제목: Unsupervised Information Refinement Training of Large Langua...

�� EMNLP (Empirical Methods in NLP) (3개):
  1. ID: 1184, 유사도: 0.256
     제목: RSA-Control: A Pragmatics-Grounded Lightweight Controllable ...
  2. ID: 1933, 유사도: 0.127
     제목: Comparing a BERT Classifier and a GPT classifier for Detecti...
  3. ID: 870, 유사도: 0.120
     제목: Prompts have evil twins...

�� NAACL (North American Chapter of ACL) (2개):
  1. ID: 2138, 유사도: 0.172
     제목: An Interactive Framework for Profiling News Media Sources...
  2. ID: 2180, 유사도: 0.140
     제목: BeLLM: Backward Dependency Enhanced Large Language Model for...


In [7]:
import asyncio
import pandas as pd
from collections import defaultdict

async def analyze_paper_statistics():
    """분야별 및 학회별 논문 통계 분석"""
    
    print("📊 논문 통계 분석 시작...")
    
    try:
        # 모든 논문 조회 (페이지네이션으로)
        all_papers = []
        page_size = 100
        page = 0
        
        while True:
            result = supabase_client.client.table("papers").select("field, conference, id").range(
                page * page_size, (page + 1) * page_size - 1
            ).execute()
            
            papers = result.data
            if not papers:
                break
                
            all_papers.extend(papers)
            page += 1
            
            # 너무 많은 데이터 방지
            if len(all_papers) >= 10000:
                break
        
        print(f"✅ 총 {len(all_papers)}개 논문 조회됨")
        
        # 분야별 통계
        field_stats = defaultdict(int)
        conference_stats = defaultdict(int)
        field_conference_stats = defaultdict(lambda: defaultdict(int))
        
        for paper in all_papers:
            field = paper.get('field', 'Unknown')
            conference = paper.get('conference', 'Unknown')
            
            field_stats[field] += 1
            conference_stats[conference] += 1
            field_conference_stats[field][conference] += 1
        
        # 결과 출력
        print("\n" + "="*60)
        print("📈 분야별 논문 개수")
        print("="*60)
        for field, count in sorted(field_stats.items(), key=lambda x: x[1], reverse=True):
            print(f"{field}: {count}개")
        
        print("\n" + "="*60)
        print("🏛️ 학회별 논문 개수")
        print("="*60)
        for conference, count in sorted(conference_stats.items(), key=lambda x: x[1], reverse=True):
            print(f"{conference}: {count}개")
        
        print("\n" + "="*60)
        print("📊 분야별 학회 세부 통계")
        print("="*60)
        for field in sorted(field_stats.keys()):
            print(f"\n🔍 {field} ({field_stats[field]}개):")
            field_conferences = field_conference_stats[field]
            for conference, count in sorted(field_conferences.items(), key=lambda x: x[1], reverse=True):
                print(f"  - {conference}: {count}개")
        
        # 임베딩이 있는 논문 통계
        print("\n" + "="*60)
        print("🔗 임베딩 데이터 통계")
        print("="*60)
        
        embedding_stats = defaultdict(int)
        embedding_field_stats = defaultdict(int)
        
        for paper in all_papers:
            field = paper.get('field', 'Unknown')
            has_embedding = paper.get('combined_embedding') is not None
            
            if has_embedding:
                embedding_stats['total'] += 1
                embedding_field_stats[field] += 1
        
        print(f"임베딩이 있는 논문: {embedding_stats['total']}개")
        print(f"임베딩 비율: {embedding_stats['total']/len(all_papers)*100:.1f}%")
        
        print("\n분야별 임베딩 개수:")
        for field, count in sorted(embedding_field_stats.items(), key=lambda x: x[1], reverse=True):
            total_in_field = field_stats[field]
            percentage = count / total_in_field * 100 if total_in_field > 0 else 0
            print(f"  {field}: {count}개 ({percentage:.1f}%)")
        
        return {
            'total_papers': len(all_papers),
            'field_stats': dict(field_stats),
            'conference_stats': dict(conference_stats),
            'embedding_stats': dict(embedding_stats),
            'embedding_field_stats': dict(embedding_field_stats)
        }
        
    except Exception as e:
        print(f"❌ 통계 분석 실패: {e}")
        return None

# 실행
stats = await analyze_paper_statistics()

📊 논문 통계 분석 시작...
✅ 총 10000개 논문 조회됨

📈 분야별 논문 개수
Computer Vision (CV): 4612개
Machine Learning / Deep Learning (ML/DL): 3686개
Natural Language Processing (NLP): 1702개

🏛️ 학회별 논문 개수
NeurIPS: 3625개
CVPR (IEEE Conference on Computer Vision and Pattern Recognition): 2293개
ECCV (European Conference on Computer Vision): 1391개
WACV (Winter Conference on Applications of Computer Vision): 928개
EMNLP (Empirical Methods in NLP): 830개
ACL Anthology (ACL, EMNLP, NAACL, COLING): 558개
NAACL (North American Chapter of ACL): 314개
ICML: 61개

📊 분야별 학회 세부 통계

🔍 Computer Vision (CV) (4612개):
  - CVPR (IEEE Conference on Computer Vision and Pattern Recognition): 2293개
  - ECCV (European Conference on Computer Vision): 1391개
  - WACV (Winter Conference on Applications of Computer Vision): 928개

🔍 Machine Learning / Deep Learning (ML/DL) (3686개):
  - NeurIPS: 3625개
  - ICML: 61개

🔍 Natural Language Processing (NLP) (1702개):
  - EMNLP (Empirical Methods in NLP): 830개
  - ACL Anthology (ACL, EMNLP, NAACL, COLING)

In [8]:
async def debug_conference_selection(field: str, keywords: List[str]):
    """학회별 논문 선택 과정 디버깅"""
    print(f"🔍 {field} 분야에서 '{keywords}' 검색 디버깅")
    
    # 1. 전체 논문 수 확인
    try:
        result = supabase_client.client.table("papers").select("id, conference, combined_embedding").eq("field", field).execute()
        total_papers = len(result.data)
        print(f"✅ 전체 논문: {total_papers}개")
        
        # 학회별 전체 논문 수
        conference_counts = {}
        for paper in result.data:
            conference = paper.get('conference', 'Unknown')
            conference_counts[conference] = conference_counts.get(conference, 0) + 1
        
        print("📊 학회별 전체 논문 수:")
        for conf, count in sorted(conference_counts.items()):
            print(f"  {conf}: {count}개")
        
    except Exception as e:
        print(f"❌ 전체 논문 조회 실패: {e}")
        return
    
    # 2. 임베딩이 있는 논문만 확인
    try:
        result = supabase_client.client.table("papers").select("id, conference, combined_embedding").eq("field", field).not_.is_("combined_embedding", "null").execute()
        papers_with_embedding = len(result.data)
        print(f"✅ 임베딩 있는 논문: {papers_with_embedding}개 ({papers_with_embedding/total_papers*100:.1f}%)")
        
        # 학회별 임베딩 있는 논문 수
        conference_embedding_counts = {}
        for paper in result.data:
            conference = paper.get('conference', 'Unknown')
            conference_embedding_counts[conference] = conference_embedding_counts.get(conference, 0) + 1
        
        print("📊 학회별 임베딩 있는 논문 수:")
        for conf, count in sorted(conference_embedding_counts.items()):
            total_conf = conference_counts.get(conf, 0)
            percentage = count/total_conf*100 if total_conf > 0 else 0
            print(f"  {conf}: {count}/{total_conf}개 ({percentage:.1f}%)")
        
    except Exception as e:
        print(f"❌ 임베딩 있는 논문 조회 실패: {e}")
        return
    
    # 3. 실제 검색 결과 확인
    try:
        papers = await supabase_client.get_top_papers_by_conference(field, keywords, 3)
        print(f"\n🔍 실제 검색 결과: {len(papers)}개 논문")
        
        # 학회별 검색 결과
        result_by_conference = {}
        for paper in papers:
            conference = paper.get('conference', 'Unknown')
            if conference not in result_by_conference:
                result_by_conference[conference] = []
            result_by_conference[conference].append(paper)
        
        print("📊 학회별 검색 결과:")
        for conf, papers_in_conf in result_by_conference.items():
            print(f"  {conf}: {len(papers_in_conf)}개")
            for i, paper in enumerate(papers_in_conf):
                print(f"    {i+1}. ID: {paper['id']}, 유사도: {paper.get('similarity_score', 0):.3f}")
                print(f"       제목: {paper['title'][:60]}...")
        
    except Exception as e:
        print(f"❌ 검색 실패: {e}")

# 실행
await debug_conference_selection("Natural Language Processing (NLP)", ["RAG"])

🔍 Natural Language Processing (NLP) 분야에서 '['RAG']' 검색 디버깅
✅ 전체 논문: 1000개
📊 학회별 전체 논문 수:
  ACL Anthology (ACL, EMNLP, NAACL, COLING): 558개
  EMNLP (Empirical Methods in NLP): 438개
  NAACL (North American Chapter of ACL): 4개
✅ 임베딩 있는 논문: 1000개 (100.0%)
📊 학회별 임베딩 있는 논문 수:
  ACL Anthology (ACL, EMNLP, NAACL, COLING): 558/558개 (100.0%)
  EMNLP (Empirical Methods in NLP): 438/438개 (100.0%)
  NAACL (North American Chapter of ACL): 4/4개 (100.0%)

🔍 실제 검색 결과: 8개 논문
📊 학회별 검색 결과:
  ACL Anthology (ACL, EMNLP, NAACL, COLING): 3개
    1. ID: 703, 유사도: 0.416
       제목: DRAGIN: Dynamic Retrieval Augmented Generation based on the ...
    2. ID: 586, 유사도: 0.403
       제목: RAGTruth: A Hallucination Corpus for Developing Trustworthy ...
    3. ID: 10, 유사도: 0.394
       제목: Unsupervised Information Refinement Training of Large Langua...
  EMNLP (Empirical Methods in NLP): 3개
    1. ID: 1184, 유사도: 0.256
       제목: RSA-Control: A Pragmatics-Grounded Lightweight Controllable ...
    2. ID: 1933, 유사도: 0.127
   

In [9]:
async def test_multiple_keywords():
    """다중 키워드 검색 테스트"""
    field = "Natural Language Processing (NLP)"
    keywords = ["RAG", "Information Retrieval"]
    
    print(f"🔍 {field} 분야에서 '{keywords}' 다중 키워드 검색 테스트")
    
    try:
        papers = await supabase_client.get_top_papers_by_conference(field, keywords, 3)
        
        print(f"✅ 검색 완료! 총 {len(papers)}개 논문 선택됨")
        
        # 학회별로 그룹화
        conference_groups = {}
        for paper in papers:
            conference = paper.get('conference', 'Unknown')
            if conference not in conference_groups:
                conference_groups[conference] = []
            conference_groups[conference].append(paper)
        
        # 결과 출력
        print("\n📊 학회별 검색 결과:")
        for conference, papers_in_conf in conference_groups.items():
            print(f"\n🏛️ {conference} ({len(papers_in_conf)}개):")
            for i, paper in enumerate(papers_in_conf):
                print(f"  {i+1}. ID: {paper['id']}, 유사도: {paper.get('similarity_score', 0):.3f}")
                print(f"     제목: {paper['title'][:60]}...")
                # 키워드 포함 여부 확인
                title_lower = paper['title'].lower()
                abstract_lower = paper.get('abstract', '').lower()
                rag_included = 'rag' in title_lower or 'rag' in abstract_lower
                ir_included = 'information retrieval' in title_lower or 'information retrieval' in abstract_lower
                print(f"     RAG 포함: {'✅' if rag_included else '❌'}, IR 포함: {'✅' if ir_included else '❌'}")
        
        return papers
        
    except Exception as e:
        print(f"❌ 검색 실패: {e}")
        return []

# 실행
await test_multiple_keywords()

🔍 Natural Language Processing (NLP) 분야에서 '['RAG', 'Information Retrieval']' 다중 키워드 검색 테스트
✅ 검색 완료! 총 8개 논문 선택됨

📊 학회별 검색 결과:

🏛️ ACL Anthology (ACL, EMNLP, NAACL, COLING) (3개):
  1. ID: 703, 유사도: 0.563
     제목: DRAGIN: Dynamic Retrieval Augmented Generation based on the ...
     RAG 포함: ✅, IR 포함: ❌
  2. ID: 10, 유사도: 0.553
     제목: Unsupervised Information Refinement Training of Large Langua...
     RAG 포함: ✅, IR 포함: ❌
  3. ID: 563, 유사도: 0.498
     제목: Bridging the Preference Gap between Retrievers and LLMs...
     RAG 포함: ✅, IR 포함: ❌

🏛️ EMNLP (Empirical Methods in NLP) (3개):
  1. ID: 1184, 유사도: 0.292
     제목: RSA-Control: A Pragmatics-Grounded Lightweight Controllable ...
     RAG 포함: ✅, IR 포함: ❌
  2. ID: 1933, 유사도: 0.231
     제목: Comparing a BERT Classifier and a GPT classifier for Detecti...
     RAG 포함: ❌, IR 포함: ❌
  3. ID: 1228, 유사도: 0.225
     제목: Verba volant, scripta volant? Don’t worry! There are computa...
     RAG 포함: ❌, IR 포함: ❌

🏛️ NAACL (North American Chapter of ACL) (2개

[{'id': 703,
  'title': 'DRAGIN: Dynamic Retrieval Augmented Generation based on the Real-time Information Needs of Large Language Models',
  'abstract': 'Dynamic retrieval augmented generation (RAG) paradigm actively decides when and what to retrieve during the text generation process of Large Language Models (LLMs).There are two key elements of this paradigm: identifying the optimal moment to activate the retrieval module (deciding when to retrieve) and crafting the appropriate query once retrieval is triggered (determining what to retrieve).However, current dynamic RAG methods fall short in both aspects. Firstly, the strategies for deciding when to retrieve often rely on static rules. Moreover, the strategies for deciding what to retrieve typically limit themselves to the LLM’s most recent sentence or the last few tokens, while the LLM’s information needs may span across the entire context.To overcome these limitations, we introduce a new framework, DRAGIN, i.e., Dynamic Retrieval A