In [None]:
import json
from pymongo import MongoClient
from openai import OpenAI
from typing import List, Dict, Set
import os
from tqdm import tqdm
from pathlib import Path
import glob

# Configuration
MONGODB_URI = os.getenv("MONGODB_URI", "mongodb+srv://username:password@polcon.mongodb.net/")
DATABASE_NAME = "polcon_rag"
COLLECTION_NAME = "document_chunks"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
EMBEDDING_MODEL = "text-embedding-3-large"  # 3072 dimensions
BATCH_SIZE = 100  # Process embeddings in batches
CHUNKS_FOLDER = "./chunks"  # Path to chunks folder

def get_existing_chunk_ids(collection) -> Set[str]:
    """
    Retrieve all existing chunk IDs from MongoDB.
    """
    print("🔍 Checking for existing chunks in MongoDB...")
    existing_ids = set()
    
    # Only fetch the chunk_id field to minimize data transfer
    cursor = collection.find({}, {'chunk_id': 1, '_id': 0})
    
    for doc in cursor:
        if 'chunk_id' in doc:
            existing_ids.add(doc['chunk_id'])
    
    print(f"✅ Found {len(existing_ids)} existing chunks in database")
    return existing_ids

def find_jsonl_files(folder_path: str) -> List[str]:
    """
    Find all JSONL files in the specified folder.
    """
    folder = Path(folder_path)
    if not folder.exists():
        print(f"❌ Folder not found: {folder_path}")
        return []
    
    # Find all .jsonl files
    jsonl_files = list(folder.glob("*.jsonl"))
    
    if not jsonl_files:
        print(f"⚠️  No JSONL files found in: {folder_path}")
        return []
    
    print(f"\n📁 Found {len(jsonl_files)} JSONL file(s) in {folder_path}:")
    for file in jsonl_files:
        print(f"   - {file.name}")
    
    return [str(f) for f in jsonl_files]

def load_jsonl(file_path: str) -> List[Dict]:
    """Load chunks from a single JSONL file."""
    chunks = []
    file_name = Path(file_path).name
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                try:
                    chunk = json.loads(line)
                    # Validate required fields
                    if 'chunk_id' not in chunk or 'text' not in chunk:
                        print(f"⚠️  {file_name} line {line_num}: Missing chunk_id or text")
                        continue
                    chunks.append(chunk)
                except json.JSONDecodeError as e:
                    print(f"⚠️  {file_name} line {line_num}: Invalid JSON - {e}")
                    continue
    except Exception as e:
        print(f"❌ Error reading {file_path}: {e}")
        return []
    
    return chunks

def load_all_jsonl_files(folder_path: str) -> List[Dict]:
    """
    Load chunks from all JSONL files in the folder.
    """
    jsonl_files = find_jsonl_files(folder_path)
    
    if not jsonl_files:
        return []
    
    all_chunks = []
    
    print(f"\n📂 Loading chunks from {len(jsonl_files)} file(s)...")
    for file_path in jsonl_files:
        file_name = Path(file_path).name
        chunks = load_jsonl(file_path)
        print(f"   ✅ {file_name}: {len(chunks)} chunks")
        all_chunks.extend(chunks)
    
    return all_chunks

def get_embeddings(texts: List[str], client: OpenAI) -> List[List[float]]:
    """Generate embeddings for a batch of texts using OpenAI."""
    response = client.embeddings.create(
        input=texts,
        model=EMBEDDING_MODEL
    )
    return [item.embedding for item in response.data]

def embed_and_insert_chunks(chunks_folder: str = CHUNKS_FOLDER, force_reindex: bool = False):
    """
    Main function to embed chunks from all JSONL files and insert into MongoDB.
    Only processes new chunks that haven't been inserted before.
    
    Args:
        chunks_folder: Path to the folder containing JSONL files
        force_reindex: If True, re-embed and update all chunks even if they exist
    """
    
    # Initialize clients
    openai_client = OpenAI(api_key=OPENAI_API_KEY)
    mongo_client = MongoClient(MONGODB_URI)
    db = mongo_client[DATABASE_NAME]
    collection = db[COLLECTION_NAME]
    
    # Create a unique index on chunk_id to prevent duplicates
    collection.create_index("chunk_id", unique=True)
    print(f"✅ Connected to MongoDB Atlas")
    print(f"   Project: Arayuz")
    print(f"   Cluster: polcon")
    print(f"   Database: {DATABASE_NAME}")
    print(f"   Collection: {COLLECTION_NAME}")
    
    # Load chunks from all JSONL files in the folder
    chunks = load_all_jsonl_files(chunks_folder)
    
    if not chunks:
        print("\n❌ No chunks loaded. Please check the folder path and file contents.")
        mongo_client.close()
        return
    
    print(f"\n✅ Total chunks loaded: {len(chunks)}")
    
    # Get existing chunk IDs from database
    if not force_reindex:
        existing_ids = get_existing_chunk_ids(collection)
        
        # Filter out chunks that already exist
        new_chunks = [chunk for chunk in chunks if chunk['chunk_id'] not in existing_ids]
        
        print(f"\n📊 Deduplication Status:")
        print(f"   Total chunks in files: {len(chunks)}")
        print(f"   Already in database: {len(chunks) - len(new_chunks)}")
        print(f"   New chunks to process: {len(new_chunks)}")
        
        if len(new_chunks) == 0:
            print("\n✅ All chunks already exist in database. Nothing to do!")
            print("💡 Use force_reindex=True to re-embed existing chunks")
            mongo_client.close()
            return
        
        chunks_to_process = new_chunks
    else:
        print(f"\n🔄 Force reindex enabled - processing all {len(chunks)} chunks")
        chunks_to_process = chunks
    
    # Check for duplicate chunk_ids in the loaded data
    chunk_ids = [chunk['chunk_id'] for chunk in chunks_to_process]
    duplicate_ids = set([x for x in chunk_ids if chunk_ids.count(x) > 1])
    if duplicate_ids:
        print(f"\n⚠️  Warning: Found {len(duplicate_ids)} duplicate chunk_ids in source files")
        print(f"   First few: {list(duplicate_ids)[:5]}")
    
    # Process chunks in batches
    documents_to_insert = []
    
    print(f"\n🤖 Generating embeddings for {len(chunks_to_process)} chunks...")
    print(f"   Using model: {EMBEDDING_MODEL}")
    print(f"   Batch size: {BATCH_SIZE}")
    
    for i in tqdm(range(0, len(chunks_to_process), BATCH_SIZE), desc="Processing batches"):
        batch = chunks_to_process[i:i + BATCH_SIZE]
        
        # Extract text from each chunk
        texts = [chunk['text'] for chunk in batch]
        
        # Generate embeddings
        try:
            embeddings = get_embeddings(texts, openai_client)
        except Exception as e:
            print(f"\n❌ Error generating embeddings for batch {i//BATCH_SIZE + 1}: {e}")
            continue
        
        # Prepare documents for insertion
        for chunk, embedding in zip(batch, embeddings):
            doc = {
                'chunk_id': chunk['chunk_id'],
                'text': chunk['text'],
                'embedding': embedding,
                'char_count': chunk.get('char_count', len(chunk['text'])),
                'word_count': chunk.get('word_count', len(chunk['text'].split())),
                'source_file': chunk.get('source_file', 'unknown')
            }
            documents_to_insert.append(doc)
    
    # Insert new documents
    print(f"\n💾 Inserting {len(documents_to_insert)} documents into MongoDB...")
    try:
        if force_reindex:
            # Use bulk write with upsert for force reindex
            from pymongo import UpdateOne
            operations = [
                UpdateOne(
                    {'chunk_id': doc['chunk_id']},
                    {'$set': doc},
                    upsert=True
                )
                for doc in documents_to_insert
            ]
            result = collection.bulk_write(operations)
            print(f"✅ Upsert complete:")
            print(f"   Inserted: {result.upserted_count}")
            print(f"   Modified: {result.modified_count}")
        else:
            # Regular insert for new documents
            result = collection.insert_many(documents_to_insert, ordered=False)
            print(f"✅ Successfully inserted {len(result.inserted_ids)} documents")
    except Exception as e:
        print(f"⚠️  Insertion completed with some warnings: {e}")
    
    mongo_client.close()
    print("\n✅ Embedding and insertion complete!")
    print("\n" + "="*70)
    print("📋 NEXT STEP: Create Vector Search Index in Atlas UI")
    print("="*70)
    print_index_instructions()

def print_index_instructions():
    """Print instructions for creating the vector search index."""
    
    index_definition = {
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 3072,
                "similarity": "cosine"
            },
            {
                "type": "filter",
                "path": "source_file"
            },
            {
                "type": "filter",
                "path": "chunk_id"
            }
        ]
    }
    
    print("\n🔧 CREATE VECTOR SEARCH INDEX:")
    print("\n1. Go to MongoDB Atlas: https://cloud.mongodb.com")
    print("2. Navigate to your 'polcon' cluster")
    print("3. Click on 'Atlas Search' tab")
    print("4. Click 'Create Search Index'")
    print("5. Choose 'Atlas Vector Search' → 'JSON Editor'")
    print("6. Configure:")
    print(f"   - Database: {DATABASE_NAME}")
    print(f"   - Collection: {COLLECTION_NAME}")
    print("   - Index Name: vector_index")
    print("\n7. Paste this JSON definition:\n")
    print(json.dumps(index_definition, indent=2))
    print("\n8. Click 'Create Search Index'")
    print("\n⏱️  Index creation takes 5-10 minutes. You'll get a notification when ready.")
    print("="*70)

def query_vector_store(query_text: str, top_k: int = 5, source_file: str = None) -> List[Dict]:
    """
    Query the vector store with a text query.
    Use this after creating the vector search index in Atlas.
    
    Args:
        query_text: The search query
        top_k: Number of results to return
        source_file: Optional filter by source file
    """
    openai_client = OpenAI(api_key=OPENAI_API_KEY)
    mongo_client = MongoClient(MONGODB_URI)
    db = mongo_client[DATABASE_NAME]
    collection = db[COLLECTION_NAME]
    
    print(f"🔍 Searching for: '{query_text}'")
    
    # Generate embedding for query
    query_embedding = get_embeddings([query_text], openai_client)[0]
    
    # Build vector search pipeline
    vector_search_stage = {
        "$vectorSearch": {
            "index": "vector_index",
            "path": "embedding",
            "queryVector": query_embedding,
            "numCandidates": top_k * 10,
            "limit": top_k
        }
    }
    
    # Add source file filter if provided
    if source_file:
        vector_search_stage["$vectorSearch"]["filter"] = {
            "source_file": {"$eq": source_file}
        }
        print(f"📁 Filtering by source: {source_file}")
    
    pipeline = [
        vector_search_stage,
        {
            "$project": {
                "_id": 0,
                "chunk_id": 1,
                "text": 1,
                "source_file": 1,
                "char_count": 1,
                "word_count": 1,
                "score": {"$meta": "vectorSearchScore"}
            }
        }
    ]
    
    try:
        results = list(collection.aggregate(pipeline))
        mongo_client.close()
        return results
    except Exception as e:
        mongo_client.close()
        print(f"❌ Error querying vector store: {e}")
        print("💡 Make sure you've created the 'vector_index' in Atlas UI")
        return []

def get_collection_stats():
    """Get statistics about the vector store."""
    mongo_client = MongoClient(MONGODB_URI)
    db = mongo_client[DATABASE_NAME]
    collection = db[COLLECTION_NAME]
    
    total_chunks = collection.count_documents({})
    
    # Get unique source files
    pipeline = [
        {"$group": {"_id": "$source_file"}},
        {"$sort": {"_id": 1}}
    ]
    unique_sources = list(collection.aggregate(pipeline))
    
    # Get total word count
    pipeline = [
        {"$group": {"_id": None, "total_words": {"$sum": "$word_count"}}}
    ]
    total_words_result = list(collection.aggregate(pipeline))
    total_words = total_words_result[0]['total_words'] if total_words_result else 0
    
    print(f"\n📊 Vector Store Statistics:")
    print(f"   Total chunks: {total_chunks:,}")
    print(f"   Total words: {total_words:,}")
    print(f"   Unique source files: {len(unique_sources)}")
    if unique_sources and len(unique_sources) <= 10:
        print(f"\n   Source files:")
        for source in unique_sources:
            count = collection.count_documents({"source_file": source['_id']})
            print(f"      - {source['_id']}: {count} chunks")
    
    mongo_client.close()


In [4]:

if __name__ == "__main__":
    # Process all JSONL files in the chunks folder
    embed_and_insert_chunks(chunks_folder=CHUNKS_FOLDER, force_reindex=False)
    
    # Get statistics
    get_collection_stats()
    
    print("\n" + "="*70)
    print("🎉 All done! Next steps:")
    print("="*70)
    print("1. Create the vector search index in Atlas UI (see instructions above)")
    print("2. Wait 5-10 minutes for index to build")
    print("3. Test queries using query_vector_store() function")
    print("="*70)
    
    # Example query (uncomment after creating the vector index)
    # print("\n" + "="*70)
    # print("Testing Query (after index is ready):")
    # print("="*70)
    # results = query_vector_store("What is machine learning?", top_k=3)
    # for i, result in enumerate(results, 1):
    #     print(f"\n{i}. Score: {result['score']:.4f}")
    #     print(f"   Chunk ID: {result['chunk_id']}")
    #     print(f"   Source: {result['source_file']}")
    #     print(f"   Words: {result['word_count']}")
    #     print(f"   Text preview: {result['text'][:150]}...")

✅ Connected to MongoDB Atlas
   Project: Arayuz
   Cluster: polcon
   Database: polcon_rag
   Collection: document_chunks

📁 Found 89 JSONL file(s) in ./chunks:
   - 1) Temel kavramlar önyargı, kalıpyargı ve ayrımcılık_chunks.jsonl
   - 10) TÜRKİYE’DE ÖRGÜTLENME ÖZGÜRLÜĞÜNÜN GENEL GÖRÜNÜMÜ-II _chunks.jsonl
   - 11) Yurttaslik_Alani_Bilgi_Notu_1_chunks.jsonl
   - 12) TERÖRLE MÜCADELEYİ ARAÇSALLAŞTIRMAK_chunks.jsonl
   - 13) PROTESTO HAKKINI KORU_chunks.jsonl
   - 14) KomploTeorileri_AR_23.03.23_web_chunks.jsonl
   - 15) Feminist_Hareketin_Gundemleri__chunks.jsonl
   - 16) Sivil Toplum Kuruluşlarının Devlet Tarafından Finansmanı Üzerine Bir Tartışma_chunks.jsonl
   - 17) Gençlik Politikalarında Karşılaştırmalı Bir Değerlendirme-Türkiye ve Finlandiya Örneği_chunks.jsonl
   - 18) Avrupa Konseyi Politik Karar Alma Süreçlerine Sivil Katılım Rehberi Çevirisi_chunks.jsonl
   - 19) Kampüsten Öğrenci Toplulukları _chunks.jsonl
   - 2) Ayrımcılık ve medya_chunks.jsonl
   - 20) Gençler Ne(ler) İst

Processing batches:   2%|▏         | 22/909 [00:18<06:56,  2.13it/s] 


❌ Error generating embeddings for batch 22: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, however you requested 9613 tokens (9613 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


Processing batches:   3%|▎         | 23/909 [00:19<06:05,  2.42it/s]


❌ Error generating embeddings for batch 23: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, however you requested 10089 tokens (10089 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


Processing batches:   3%|▎         | 24/909 [00:19<05:55,  2.49it/s]


❌ Error generating embeddings for batch 24: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, however you requested 10967 tokens (10967 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


Processing batches:   3%|▎         | 25/909 [00:19<05:23,  2.73it/s]


❌ Error generating embeddings for batch 25: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, however you requested 13062 tokens (13062 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


Processing batches:   3%|▎         | 27/909 [00:21<07:46,  1.89it/s]


❌ Error generating embeddings for batch 27: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, however you requested 9896 tokens (9896 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


Processing batches:  13%|█▎        | 121/909 [01:15<08:08,  1.61it/s]


KeyboardInterrupt: 