In [4]:
pip install pinecone

Collecting pinecone
  Using cached pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.8.0-py3-none-any.whl.metadata (30 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Using cached pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting urllib3>=1.26.0 (from pinecone)
  Using cached urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting packaging<25.0,>=24.2 (from pinecone-plugin-assistant<2.0.0,>=1.6.0->pinecone)
  Using cached packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting requests<3.0.0,>=2.32.3 (from pinecone-plugin-assistant<2.0.0,>=1.6.0->pinecone)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Using cached pinecone-7.3.0-py3-none-any.whl (587 kB)
Downloading pinecone_plugin_assistant-1.8.0-py3-none-any.whl (259 kB)
Using cached packaging-24.2-py3-none-any.whl (65 kB)
Using cached pinecone_

In [1]:
pip install -r requirements.txt

Collecting python-dotenv (from -r requirements.txt (line 2))
  Using cached python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Using cached python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.1
Note: you may need to restart the kernel to use updated packages.


In [7]:
import json
from pathlib import Path
from typing import List, Dict
from openai import OpenAI
from pinecone import Pinecone
import time
import os
from dotenv import load_dotenv
import hashlib


# Read variables from a .env file (when one is present) into the environment
load_dotenv()

# Build the API clients from keys held in the environment;
# a key that is not set is passed through as None.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))


def create_ascii_id(filename: str, chunk_id: int) -> str:
    """
    Build an ASCII-only vector ID from a filename and a chunk identifier.

    The filename itself never appears in the ID: it is UTF-8 encoded, hashed
    with MD5, and only the first 12 hex characters of the digest are kept, so
    the result is plain ASCII even for filenames with non-ASCII characters.

    Args:
        filename: Name identifying the source file
        chunk_id: Identifier of the chunk within that file

    Returns:
        ID of the form "<12 hex chars>_<chunk_id>"
    """
    digest = hashlib.md5(filename.encode('utf-8'))
    prefix = digest.hexdigest()[:12]
    return "{}_{}".format(prefix, chunk_id)


def create_embedding(text: str, model: str = "text-embedding-3-large") -> List[float]:
    """
    Embed one piece of text through the module-level OpenAI client.

    Args:
        text: Text to embed
        model: Name of the OpenAI embedding model to use

    Returns:
        Embedding of the first item in the response
    """
    result = client.embeddings.create(model=model, input=text)
    first_item = result.data[0]
    return first_item.embedding


def create_embeddings_batch(texts: List[str], model: str = "text-embedding-3-large") -> List[List[float]]:
    """
    Create embeddings for multiple texts in a single request.

    Args:
        texts: List of input texts; may be empty
        model: OpenAI embedding model

    Returns:
        List of embedding vectors, one per item in the response. An empty
        input yields an empty list without contacting the API.
    """
    # Nothing to embed: skip the network call instead of sending an empty
    # request that can only fail.
    if not texts:
        return []

    response = client.embeddings.create(
        input=texts,
        model=model
    )
    return [item.embedding for item in response.data]


def load_chunks_from_jsonl(jsonl_path: Path) -> List[Dict]:
    """
    Load chunks from a JSONL file (one JSON document per line).

    Blank and whitespace-only lines are skipped, so a trailing empty line or
    spacing between records does not abort the load.

    Args:
        jsonl_path: Path to JSONL file (read as UTF-8)

    Returns:
        List of parsed records, in file order

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON
    """
    chunks = []
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            # json.loads() raises on an empty string, so drop lines with no content
            if not line.strip():
                continue
            chunks.append(json.loads(line))
    return chunks


def get_existing_vector_ids(index) -> set:
    """
    Report how many vectors the index currently holds.

    Despite the name, no IDs are collected: the function prints the total
    vector count taken from the index statistics and always returns an
    empty set.

    Args:
        index: Pinecone index object

    Returns:
        An empty set
    """
    vector_total = index.describe_index_stats().total_vector_count
    print("Index currently contains {} vectors".format(vector_total))
    return set()


def check_if_file_processed(index, jsonl_file_stem: str) -> bool:
    """
    Check if a JSONL file has already been processed.

    Only one vector is looked up: the one whose ID is derived from this file
    stem and chunk ID 0. If that vector exists the file is treated as
    processed; no other chunk is inspected.

    Args:
        index: Pinecone index object
        jsonl_file_stem: Stem of the JSONL filename (without extension)

    Returns:
        True if the chunk-0 vector was found, False if it was not found or
        the lookup failed
    """
    # ID that chunk 0 of this file would have been stored under
    test_id = create_ascii_id(jsonl_file_stem, 0)

    try:
        result = index.fetch(ids=[test_id])
        return bool(result.vectors)
    except Exception as e:
        # Best effort: a failed lookup is reported and treated as "not
        # processed". Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        print(f"  ! Could not check whether file was already processed: {e}")
        return False


def _build_vectors(batch: List[Dict], embeddings: List[List[float]], jsonl_stem: str) -> List[Dict]:
    """Pair each chunk with its embedding and shape the pair into an upsert record."""
    # zip() would silently drop chunks if fewer embeddings came back
    if len(embeddings) != len(batch):
        raise ValueError(
            f"Expected {len(batch)} embeddings, got {len(embeddings)}"
        )

    vectors = []
    for chunk, embedding in zip(batch, embeddings):
        vectors.append({
            "id": create_ascii_id(jsonl_stem, chunk['chunk_id']),
            "values": embedding,
            "metadata": {
                "text": chunk['text'],
                "source_file": chunk['source_file'],
                "source_path": chunk['source_path'],
                "chunk_id": chunk['chunk_id'],
                "char_count": chunk['char_count'],
                "word_count": chunk['word_count'],
                "jsonl_file": jsonl_stem
            }
        })
    return vectors


def process_and_upload_chunks(
    chunks_dir: str = r"C:\Users\yigit\Desktop\Enterprises\arayuz-9\chunks",
    index_name: str = "polcon",
    batch_size: int = 100
) -> None:
    """
    Process all JSONL files, create embeddings, and upload to Pinecone.

    Files matching "*.jsonl" in chunks_dir are handled one at a time in sorted
    order. A file for which check_if_file_processed() returns True is skipped.
    Each remaining file is loaded, embedded and upserted in batches; an error
    in one file is printed and recorded, and processing continues with the
    next file. A summary is printed at the end.

    Args:
        chunks_dir: Directory containing JSONL chunk files
        index_name: Name of the Pinecone index
        batch_size: Number of chunks to process in each batch (must be >= 1)

    Raises:
        ValueError: If batch_size is smaller than 1
    """
    # Fail fast: a zero step makes range() raise for every file, and a negative
    # step yields no batches, which would report files as completed without
    # uploading anything.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    chunks_path = Path(chunks_dir)

    # Connect to existing Pinecone index
    index = pc.Index(index_name)

    # Print current index stats
    get_existing_vector_ids(index)

    # Find all JSONL files; sorted so the processing order is stable across runs
    jsonl_files = sorted(chunks_path.glob("*.jsonl"))

    if not jsonl_files:
        print(f"No JSONL files found in {chunks_dir}")
        return

    print(f"Found {len(jsonl_files)} JSONL files to process\n")

    total_uploaded = 0
    skipped_files = 0
    failed_files = []

    for jsonl_file in jsonl_files:
        print(f"Processing: {jsonl_file.name}")

        # NOTE(review): the check only looks for this file's chunk 0, so a file
        # whose upload failed after chunk 0 was upserted will be skipped on the
        # next run even though it is incomplete.
        if check_if_file_processed(index, jsonl_file.stem):
            print("  ⊘ File already processed, skipping\n")
            skipped_files += 1
            continue

        try:
            # Load chunks
            chunks = load_chunks_from_jsonl(jsonl_file)
            print(f"  Loaded {len(chunks)} chunks")

            # Process in batches
            batch_starts = range(0, len(chunks), batch_size)
            for batch_number, start in enumerate(batch_starts, start=1):
                batch = chunks[start:start + batch_size]

                # Extract texts for embedding
                texts = [chunk['text'] for chunk in batch]

                # Create embeddings
                print(f"  Creating embeddings for batch {batch_number}...", end=" ")
                embeddings = create_embeddings_batch(texts)
                print("✓")

                # Prepare vectors for Pinecone
                vectors = _build_vectors(batch, embeddings, jsonl_file.stem)

                # Upload to Pinecone
                print("  Uploading batch to Pinecone...", end=" ")
                index.upsert(vectors=vectors)
                print("✓")

                total_uploaded += len(vectors)

                # Small delay to avoid rate limits
                time.sleep(0.5)

            print(f"  ✓ Completed {jsonl_file.name}\n")

        except Exception as e:
            # Keep going with the remaining files, but remember the failure
            failed_files.append(jsonl_file.name)
            print(f"  ✗ Error processing {jsonl_file.name}: {e}\n")

    print("\nProcessing complete!")
    print(f"Files skipped (already processed): {skipped_files}")
    print(f"Files failed: {len(failed_files)}")
    for failed_name in failed_files:
        print(f"  ✗ {failed_name}")
    print(f"Total new vectors uploaded: {total_uploaded}")

    # Get index stats
    stats = index.describe_index_stats()
    print(f"Total vectors in index: {stats.total_vector_count}")


def query_similar_chunks(
    query: str,
    index_name: str = "polcon",
    top_k: int = 5
) -> List[Dict]:
    """
    Query Pinecone for the chunks most similar to a text query.

    The query is embedded with create_embedding() and the resulting vector is
    sent to the named index with metadata included in the response.

    Args:
        query: Search query
        index_name: Name of the Pinecone index
        top_k: Number of results to return

    Returns:
        List of dicts with keys "score", "text", "source_file" and "chunk_id";
        metadata fields that are missing come back as empty strings
    """
    # Create query embedding
    query_embedding = create_embedding(query)

    # Query Pinecone
    index = pc.Index(index_name)
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )

    # Format results
    matches = []
    for match in results.matches:
        # Guard against a match whose metadata is missing (None), so one such
        # vector does not break the whole result list.
        metadata = match.metadata or {}
        matches.append({
            "score": match.score,
            "text": metadata.get('text', ''),
            "source_file": metadata.get('source_file', ''),
            "chunk_id": metadata.get('chunk_id', '')
        })

    return matches


In [8]:
# Run the uploader. The guard is true when this cell runs in the notebook
# kernel, where __name__ is "__main__".
if __name__ == "__main__":
    # Upload all chunks to Pinecone, using the function's default directory,
    # index name and batch size
    process_and_upload_chunks()
    
    # Example query, left commented out; uncomment to search the index
    # once the upload has finished.
    # results = query_similar_chunks("gençlik örgütleri nedir?")
    # for i, result in enumerate(results):
    #     print(f"\n{i+1}. Score: {result['score']:.4f}")
    #     print(f"Source: {result['source_file']}")
    #     print(f"Text: {result['text'][:200]}...")

Index currently contains 0 vectors
Found 4 JSONL files to process

Processing: 1) Temel kavramlar önyargı, kalıpyargı ve ayrımcılık_chunks.jsonl
  Loaded 34 chunks
  Creating embeddings for batch 1... ✓
  Uploading batch to Pinecone... ✓
  ✓ Completed 1) Temel kavramlar önyargı, kalıpyargı ve ayrımcılık_chunks.jsonl

Processing: 2) Ayrımcılık ve medya_chunks.jsonl
  Loaded 45 chunks
  Creating embeddings for batch 1... ✓
  Uploading batch to Pinecone... ✓
  ✓ Completed 2) Ayrımcılık ve medya_chunks.jsonl

Processing: 3) Toplumsal Cinsiyete Dayalı Ayrımcılık_chunks.jsonl
  Loaded 33 chunks
  Creating embeddings for batch 1... ✓
  Uploading batch to Pinecone... ✓
  ✓ Completed 3) Toplumsal Cinsiyete Dayalı Ayrımcılık_chunks.jsonl

Processing: 4) Uluslararası Af Örgütü Raporu 2021-2022 Avrupa ve Orta Asya Değerlendirmesi(sayfa 46-54)_chunks.jsonl
  Loaded 35 chunks
  Creating embeddings for batch 1... ✓
  Uploading batch to Pinecone... ✓
  ✓ Completed 4) Uluslararası Af Örgütü Raporu 2021