In [None]:
# !pip install uv
# uv init
# uv add unstructured
# uv add "unstructured[pdf]"
# uv pip install -r requirements.txt

In [None]:
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json
import os
import json
import re
import os
import glob
from pathlib import Path
from dotenv import load_dotenv
from tqdm import tqdm
from typing import List
import requests

# Standard imports
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Pinecone
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain_experimental.text_splitter import SemanticChunker
from langchain.schema import Document
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
import lark
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate


In [None]:
load_dotenv()

langchain_tracing_v2 = os.getenv('LANGCHAIN_TRACING_V2')
langchain_endpoint = os.getenv('LANGCHAIN_ENDPOINT')
langchain_api_key = os.getenv('LANGCHAIN_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')
index_name = os.getenv('PINECONE_INDEX_NAME')

In [None]:
def enumerate_text_parts_with_position(text_elements, exclude_types=None, column_threshold=50):
    """
    Enumerate consecutive parts of a page in reading order, assigning a 'position' field to each.
    Footer always gets the last position in a page.
    Returns a list of dicts, each with 'position', 'text', and other original fields.
    
    Args:
        text_elements (list): List of text elements with coordinates and type
        exclude_types (list): List of element types to exclude (except 'footer', which is handled specially)
        column_threshold (int): Pixel threshold to determine column boundaries
    
    Returns:
        list: List of dicts, each with 'position', 'text', and other original fields
    """
    # Always process footers separately, regardless of exclude_types
    if exclude_types is None:
        exclude_types = []
    exclude_types_no_footer = [t for t in exclude_types if t.lower() != 'footer']

    # Split elements into footers and non-footers
    footer_elements = [
        elem for elem in text_elements
        if elem.get('type', '').lower() == 'footer'
    ]
    non_footer_elements = [
        elem for elem in text_elements
        if elem.get('type', '').lower() not in [t.lower() for t in exclude_types_no_footer + ['footer']]
    ]

    # If there are no non-footer elements and no footers, return empty
    if not non_footer_elements and not footer_elements:
        return []

    # Group non-footer elements by approximate Y position (rows)
    rows = {}
    for element in non_footer_elements:
        coords = element.get('coordinates', [[0, 0]])
        if coords:
            x, y = coords[0]  # Upper-left corner
            row_key = round(y / 20) * 20  # Group by 20-pixel rows
            if row_key not in rows:
                rows[row_key] = []
            rows[row_key].append(element)

    # Sort rows by Y position (top to bottom)
    sorted_rows = sorted(rows.items())

    # Flatten elements in reading order
    ordered_elements = []
    for row_y, row_elements in sorted_rows:
        row_elements.sort(key=lambda elem: elem.get('coordinates', [[0, 0]])[0][0])
        ordered_elements.extend(row_elements)

    # Enumerate and assign position to non-footer elements
    enumerated_parts = []
    position = 0
    for element in ordered_elements:
        text = element.get('text', '').strip()
        if text:
            text = re.sub(r' +', ' ', text)
            part = dict(element)
            part['text'] = text
            part['position'] = position
            enumerated_parts.append(part)
            position += 1

    # Now process footers, assign them the last position(s)
    for i, element in enumerate(footer_elements):
        text = element.get('text', '').strip()
        if text:
            text = re.sub(r' +', ' ', text)
            part = dict(element)
            part['text'] = text
            # If multiple footers, assign consecutive positions after non-footers
            part['position'] = position + i
            enumerated_parts.append(part)

    return enumerated_parts


def wrap_metadata(item):
    """Wrap all fields except 'element_id' and 'text' into a 'metadata' dict. 
    The values of metadata fields are the original values (including dicts)."""
    if not isinstance(item, dict):
        return item
    new_item = {}
    # Always keep 'element_id' and 'text' at top level if present
    if "element_id" in item:
        new_item["element_id"] = item["element_id"]
    if "text" in item:
        new_item["text"] = item["text"]
    # Everything else goes into metadata
    metadata = {}
    for k, v in item.items():
        if k not in ("element_id", "text", "metadata"):
            metadata[k] = v
        elif k == "metadata" and isinstance(v, dict):
            # Merge existing metadata fields
            for mk, mv in v.items():
                metadata[mk] = mv
    new_item["metadata"] = metadata
    return new_item

def remove_metadata_fields(metadata, fields_to_remove):
    """Remove specified fields from a metadata dict."""
    if not isinstance(metadata, dict):
        return metadata
    for field in fields_to_remove:
        metadata.pop(field, None)
    return metadata

def convert_languages_field_in_metadata(metadata):
    """If 'languages' in metadata is a list of length 1, convert it to a string."""
    if isinstance(metadata, dict) and "languages" in metadata:
        if isinstance(metadata["languages"], list) and len(metadata["languages"]) == 1:
            metadata["languages"] = metadata["languages"][0]
    return metadata

def get_position(x):
    if isinstance(x, dict):
        # Try top-level, then metadata
        if "position" in x:
            return x["position"]
        elif "metadata" in x and isinstance(x["metadata"], dict) and "position" in x["metadata"]:
            return x["metadata"]["position"]
    return -1

def get_type(x):
    if isinstance(x, dict):
        if "type" in x:
            return x["type"]
        elif "metadata" in x and isinstance(x["metadata"], dict) and "type" in x["metadata"]:
            return x["metadata"]["type"]
    return None

def get_text(x):
    if isinstance(x, dict):
        return x.get("text", None)
    return None

def replace_nulls(obj):
    """Recursively replace None values in dicts/lists with a space character."""
    if isinstance(obj, dict):
        return {k: replace_nulls(v) if v is not None else " " for k, v in obj.items()}
    elif isinstance(obj, list):
        return [replace_nulls(v) for v in obj]
    else:
        return obj if obj is not None else " "
    
def metadata_without_position(meta):
    """Return a copy of metadata dict without the 'position' field."""
    if not isinstance(meta, dict):
        return meta
    return {k: v for k, v in meta.items() if k != "position"}

def can_merge_nodes(node1, node2):
    """Return True if node1 and node2 can be merged according to the rules."""
    if not (isinstance(node1, dict) and isinstance(node2, dict)):
        return False
    meta1 = node1.get("metadata", {})
    meta2 = node2.get("metadata", {})
    # Compare all metadata fields except 'position'
    if metadata_without_position(meta1) != metadata_without_position(meta2):
        return False
    # Both must have 'text' fields that are strings
    text1 = node1.get("text", "")
    text2 = node2.get("text", "")
    if not (isinstance(text1, str) and isinstance(text2, str)):
        return False
    # Total length of text fields must be less than 200
    if len(text1) + len(text2) < 200:
        return True
    return False

def merge_nodes(node1, node2):
    """Merge node2 into node1, concatenating text and updating position to the lower of the two."""
    merged = dict(node1)
    merged["text"] = node1.get("text", "") + " " + node2.get("text", "")
    # Merge metadata, keep the lower position
    meta1 = node1.get("metadata", {})
    meta2 = node2.get("metadata", {})
    merged_meta = dict(meta1)
    pos1 = meta1.get("position", node1.get("position", None))
    pos2 = meta2.get("position", node2.get("position", None))
    # Use the lower position if both exist
    if pos1 is not None and pos2 is not None:
        merged_meta["position"] = min(pos1, pos2)
    elif pos1 is not None:
        merged_meta["position"] = pos1
    elif pos2 is not None:
        merged_meta["position"] = pos2
    merged["metadata"] = merged_meta
    return merged

In [5]:
pdf_dir = "C:/Users/yigit/Desktop/Enterprises/arayuz-4/okumalar-pdf/"
json_dir = os.path.join(os.path.dirname(pdf_dir.rstrip("/\\")), "okumalar-json")

os.makedirs(json_dir, exist_ok=True)

# Get all PDF and JSON base filenames (without extension)
pdf_files = [f for f in os.listdir(pdf_dir) if f.lower().endswith(".pdf")]
json_files = [f for f in os.listdir(json_dir) if f.lower().endswith('.json')]

pdf_basenames = set(os.path.splitext(f)[0] for f in pdf_files)
json_basenames = set(os.path.splitext(f)[0] for f in json_files)

# Find PDFs that do not have a corresponding JSON
unprocessed_basenames = pdf_basenames - json_basenames

if not unprocessed_basenames:
    print("All PDFs have been processed to JSON.")
else:
    print(f"Processing {len(unprocessed_basenames)} unprocessed PDFs...")
    for base_file_name in sorted(unprocessed_basenames):
        pdf_path = os.path.join(pdf_dir, f"{base_file_name}.pdf")
        json_path = os.path.join(json_dir, f"{base_file_name}.json")
        print(f"Processing: {pdf_path} -> {json_path}")
        elements = partition_pdf(
            filename=pdf_path,
            languages=["tur"],
            strategy="fast",
            infer_table_structure=True,
        )
        elements_to_json(elements=elements, filename=json_path)


Processing: C:/Users/yigit/Desktop/Enterprises/arayuz-4/okumalar-pdf/30) TOG Gençlik Çalışmasının Toplumsal Katılıma Etkisi Araştırması.pdf -> C:/Users/yigit/Desktop/Enterprises/arayuz-4\okumalar-json\30) TOG Gençlik Çalışmasının Toplumsal Katılıma Etkisi Araştırması.json


In [6]:
# Now process all JSON files
json_files = [f for f in os.listdir(json_dir) if f.lower().endswith('.json')]

# Create the sibling "okumalar-prep" directory
prep_dir = os.path.join(os.path.dirname(json_dir.rstrip("/\\")), "okumalar-prep")
os.makedirs(prep_dir, exist_ok=True)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=30,
    length_function=len,
    is_separator_regex=False,
)


if not json_files:
    print("No JSON files found in the directory.")
else:
    for json_file in sorted(json_files):
        json_path = os.path.join(json_dir, json_file)
        print(f"\nProcessing JSON file: {json_file}")

        with open(json_path, "r", encoding="utf-8") as f:
            json_data = json.load(f)

        # Use enumerate_text_parts_with_position to assign position numbers before removing coordinates
        if isinstance(json_data, list):
            enumerated_json_data = enumerate_text_parts_with_position(json_data)
        else:
            enumerated_json_data = json_data  # fallback, should be a list

        # Remove specified fields from metadata in every dict in the list
        # Also, remove all occurrences of "- " (dash and space) that are preceded and followed by letters in the 'text' field
        fields_to_remove = ["coordinates", "file_directory", "filetype", "last_modified"]
        processed_data = []
        if isinstance(enumerated_json_data, list):
            for item in enumerated_json_data:
                if isinstance(item, dict):
                    # Remove "- " between letters in 'text'
                    if "text" in item and isinstance(item["text"], str):
                        item["text"] = re.sub(r'(?<=[A-Za-zÇĞİÖŞÜçğıöşü])\- (?=[A-Za-zÇĞİÖŞÜçğıöşü])', '', item["text"])
                    # Wrap metadata
                    wrapped = wrap_metadata(item)
                    # Remove specified fields from metadata
                    wrapped["metadata"] = remove_metadata_fields(wrapped.get("metadata", {}), fields_to_remove)
                    # Convert languages field from list to string if needed
                    wrapped["metadata"] = convert_languages_field_in_metadata(wrapped.get("metadata", {}))
                    # Replace nulls in wrapped and metadata
                    wrapped = replace_nulls(wrapped)
                    processed_data.append(wrapped)
                else:
                    processed_data.append(replace_nulls(item))
        else:
            item = enumerated_json_data
            if isinstance(item, dict):
                if "text" in item and isinstance(item["text"], str):
                    item["text"] = re.sub(r'(?<=[A-Za-zÇĞİÖŞÜçğıöşü])\- (?=[A-Za-zÇĞİÖŞÜçğıöşü])', '', item["text"])
                wrapped = wrap_metadata(item)
                wrapped["metadata"] = remove_metadata_fields(wrapped.get("metadata", {}), fields_to_remove)
                wrapped["metadata"] = convert_languages_field_in_metadata(wrapped.get("metadata", {}))
                wrapped = replace_nulls(wrapped)
                processed_data = [wrapped]
            else:
                processed_data = [replace_nulls(item)]

        # Add "title" field to each dict, using the text of the most recent preceding Title-type dict (by position)
        # First, build a mapping from position to (type, text)
        position_to_title = {}
        last_title_text = None
        # Sort processed_data by position if available

        sorted_data = sorted(processed_data, key=get_position)
        for item in sorted_data:
            item_type = get_type(item)
            item_text = get_text(item)
            item_position = get_position(item)
            if item_type == "Title" and item_text:
                last_title_text = item_text
            if item_position is not None:
                position_to_title[item_position] = last_title_text

        # Now, assign the "title" field to each item (in metadata)
        for item in sorted_data:
            item_position = get_position(item)
            title_val = position_to_title.get(item_position, None)
            # Place "title" in metadata
            if isinstance(item, dict):
                if "metadata" not in item or not isinstance(item["metadata"], dict):
                    item["metadata"] = {}
                # If title_val is None, set to a space
                item["metadata"]["title"] = title_val if title_val is not None else " "

        # --- MERGE LOGIC: merge consecutive nodes with same metadata (except position) and total text length < 200 ---
        merged_data = []
        i = 0
        while i < len(sorted_data):
            current = sorted_data[i]
            if (
                i + 1 < len(sorted_data)
                and can_merge_nodes(current, sorted_data[i + 1])
            ):
                # Merge current and next
                merged = merge_nodes(current, sorted_data[i + 1])
                merged_data.append(merged)
                i += 2  # skip next
            else:
                merged_data.append(current)
                i += 1

        # Print number of nodes before split
        print(f"Number of nodes before split: {len(merged_data)}")

        # Split text and propagate the "title" field
        split_json_data = []
        for item in merged_data:
            if isinstance(item, dict) and "text" in item and isinstance(item["text"], str):
                splits = text_splitter.split_text(item["text"])
                for split_text in splits:
                    new_item = dict(item)  # shallow copy
                    new_item["text"] = split_text
                    # Ensure no nulls in new_item or its metadata
                    new_item = replace_nulls(new_item)
                    split_json_data.append(new_item)
            else:
                split_json_data.append(replace_nulls(item))

        # Print number of nodes after split
        print(f"Number of nodes after split: {len(split_json_data)}")

        # Save the processed/split JSON to the prep directory
        prep_json_path = os.path.join(prep_dir, json_file)
        with open(prep_json_path, "w", encoding="utf-8") as f:
            json.dump(split_json_data, f, ensure_ascii=False, indent=2)
        print(f"Saved processed JSON to: {prep_json_path}")



Processing JSON file: 1) Temel kavramlar önyargı, kalıpyargı ve ayrımcılık.json
Number of nodes before split: 50
Number of nodes after split: 138
Saved processed JSON to: C:/Users/yigit/Desktop/Enterprises/arayuz-4\okumalar-prep\1) Temel kavramlar önyargı, kalıpyargı ve ayrımcılık.json

Processing JSON file: 10) TÜRKİYE’DE ÖRGÜTLENME ÖZGÜRLÜĞÜNÜN GENEL GÖRÜNÜMÜ-II .json
Number of nodes before split: 869
Number of nodes after split: 1244
Saved processed JSON to: C:/Users/yigit/Desktop/Enterprises/arayuz-4\okumalar-prep\10) TÜRKİYE’DE ÖRGÜTLENME ÖZGÜRLÜĞÜNÜN GENEL GÖRÜNÜMÜ-II .json

Processing JSON file: 11) Yurttaslik_Alani_Bilgi_Notu_1.json
Number of nodes before split: 263
Number of nodes after split: 302
Saved processed JSON to: C:/Users/yigit/Desktop/Enterprises/arayuz-4\okumalar-prep\11) Yurttaslik_Alani_Bilgi_Notu_1.json

Processing JSON file: 12) TERÖRLE MÜCADELEYİ ARAÇSALLAŞTIRMAK.json
Number of nodes before split: 210
Number of nodes after split: 456
Saved processed JSON to: C

In [7]:
# # Path to the prep folder and the target file
# prep_dir = os.path.join(os.path.dirname(json_dir.rstrip("/\\")), "okumalar-prep")
# target_filename = "7) eşitsiz demokrasiler.json"
# target_path = os.path.join(prep_dir, target_filename)

# # Load the JSON data
# with open(target_path, "r", encoding="utf-8") as f:
#     data = json.load(f)

# # Extract the 'text' field from each element
# texts = [element['text'] for element in data if 'text' in element]

# # Compute statistics
# lengths = [len(text) for text in texts]
# count = len(lengths)
# average = sum(lengths) / count if count > 0 else 0
# max_len = max(lengths) if lengths else 0
# min_len = min(lengths) if lengths else 0
# median_len = sorted(lengths)[len(lengths) // 2] if lengths else 0

# print(f"Summary statistics for 'text' field in '{target_filename}':")
# print(f"Number of elements: {count}")
# print(f"Average length: {average:.2f} characters")
# print(f"Median length: {median_len:.2f} characters")
# print(f"Max length: {max_len} characters")
# print(f"Min length: {min_len} characters")


In [8]:
# Path to the prep folder
prep_dir = os.path.join(os.path.dirname(json_dir.rstrip("/\\")), "okumalar-prep")

all_texts = []

# Iterate over all JSON files in the prep directory and collect all 'text' fields
for json_file in glob.glob(os.path.join(prep_dir, "*.json")):
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    all_texts.extend([element['text'] for element in data if 'text' in element])

# Compute statistics across all files
lengths = [len(text) for text in all_texts]
count = len(lengths)
average = sum(lengths) / count if count > 0 else 0
max_len = max(lengths) if lengths else 0
min_len = min(lengths) if lengths else 0
median_len = sorted(lengths)[len(lengths) // 2] if lengths else 0

print(f"Summary statistics for 'text' field across all files in '{prep_dir}':")
print(f"Number of elements: {count}")
print(f"Average length: {average:.2f} characters")
print(f"Median length: {median_len:.2f} characters")
print(f"Max length: {max_len} characters")
print(f"Min length: {min_len} characters")


Summary statistics for 'text' field across all files in 'C:/Users/yigit/Desktop/Enterprises/arayuz-4\okumalar-prep':
Number of elements: 30020
Average length: 123.03 characters
Median length: 89.00 characters
Max length: 300 characters
Min length: 1 characters


In [9]:
# 1. Iterate over all JSON files in prep_dir
pdf_dir = "C:/Users/yigit/Desktop/Enterprises/arayuz-4/okumalar-pdf/"
json_dir = os.path.join(os.path.dirname(pdf_dir.rstrip("/\\")), "okumalar-json")
prep_dir = os.path.join(os.path.dirname(json_dir.rstrip("/\\")), "okumalar-prep")

all_documents = []
for json_path in glob.glob(os.path.join(prep_dir, "*.json")):
    with open(json_path, "r", encoding="utf-8") as f:
        split_json_data = json.load(f)
    # Convert split_json_data to Document objects
    documents = [
        Document(
            page_content=element['text'],
            metadata=element['metadata'] | {'element_id': element['element_id']}
        )
        for element in split_json_data
    ]
    all_documents.extend(documents)

# 2. Create embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# 3. Upsert all documents into Pinecone vector store
vectorstore = PineconeVectorStore.from_documents(
    documents=all_documents,
    embedding=embeddings,
    index_name=index_name
)

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Fri, 25 Jul 2025 21:09:48 GMT', 'Content-Type': 'application/json', 'Content-Length': '150', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '270', 'x-pinecone-request-id': '5045374394484409737', 'x-envoy-upstream-service-time': '2', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Metadata value must be a string, number, boolean or list of strings, got '[{\"start_index\":...' for field 'links'","details":[]}


## Retrieval

In [None]:
# Make sure you have the correct index_name and embedding model
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = PineconeVectorStore(
    index_name=index_name,
    embedding=embeddings
)

metadata_field_info = [
    AttributeInfo(
        name="filename",
        description="The name of the PDF document the text comes from",
        type="string",
    ),
    AttributeInfo(
        name="languages",
        description="The list of language codes used in the document (e.g. 'tur', 'eng')",
        type="list[string]",
    ),
    AttributeInfo(
        name="page_number",
        description="The page number within the PDF document",
        type="integer",
    ),
    AttributeInfo(
        name="type",
        description="The structural type of the text chunk (e.g. Title, Paragraph, List, Quote)",
        type="string",
    ),
    AttributeInfo(
        name="title",
        description="The section title of the chunk",
        type="string",
    ),
]

document_content_description = "Sociological and political researches and analyses"


### OpenAI

In [None]:
# llm = ChatOpenAI(temperature=0)

# retriever = SelfQueryRetriever.from_llm(
#     llm=llm,
#     vectorstore=vectorstore,
#     document_contents="text",
#     document_content_description=document_content_description,
#     metadata_field_info=metadata_field_info,
#     search_kwargs={"k": 5}
# )


In [None]:
# --- Simple prompt ---
template = """Aşağıda bazı metin parçaları verilmiştir. Bu bilgilere dayanarak son kullanıcı sorusuna tarafsız ve güvenli bir şekilde yanıt ver:

Metin parçaları:
{context}

Soru: {question}
Yanıt:"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template
)


#### GPT 3.5 Turbo

In [None]:
# --- QA Chain ---
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt}
)

# --- User question ---
query = "Türkiye büyük millet meclisi ile İspanya parlamentosu arasındaki en önemli farklar nelerdir?"

# --- Get Answer ---
result = qa_chain.run(query)
print("\nYanıt:\n", result)



#### GPT 3.5 Turbo (Newer Release)

In [None]:
# --- QA Chain ---
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0),
    retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt}
)

# --- User question ---
query = "Türkiye büyük millet meclisi ile İspanya parlamentosu arasındaki en önemli farklar nelerdir?"

# --- Get Answer ---
result = qa_chain.run(query)
print("\nYanıt:\n", result)



#### GPT 4.1 Mini

In [None]:
# --- QA Chain ---
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model="gpt-4.1-mini-2025-04-14", temperature=0),
    retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt}
)

# --- User question ---
query = "Türkiye büyük millet meclisi ile İspanya parlamentosu arasındaki en önemli farklar nelerdir?"

# --- Get Answer ---
result = qa_chain.run(query)
print("\nYanıt:\n", result)



#### GPT 4-o

In [None]:
# --- QA Chain ---
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model="gpt-4o-2024-08-06", temperature=0),
    retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt}
)

# --- User question ---
query = "Türkiye büyük millet meclisi ile İspanya parlamentosu arasındaki en önemli farklar nelerdir?"

# --- Get Answer ---
result = qa_chain.run(query)
print("\nYanıt:\n", result)



### Google Gemini

In [None]:
import google.generativeai as genai
from google.generativeai import GenerativeModel

In [None]:
google_api_key = os.getenv('GOOGLE_API_KEY')

In [None]:
# Configure Gemini API
genai.configure(api_key=google_api_key)

class GeminiQAChain:
    def __init__(self, model_name="gemini-2.5-flash", vectorstore=None, k=4, prompt_template=None):
        """
        Initialize Gemini QA Chain
        
        Args:
            model_name: "gemini-2.5-pro" or "gemini-2.5-flash"
            vectorstore: Your vector store (same as before)
            k: Number of documents to retrieve
            prompt_template: Custom prompt template
        """
        self.model = GenerativeModel(model_name)
        self.vectorstore = vectorstore
        self.k = k
        self.prompt_template = prompt_template or self._default_prompt_template()
    
    def _default_prompt_template(self):
        return """
        Context: {context}
        
        Question: {question}
        
        Please provide a comprehensive answer based on the context provided above. If the context doesn't contain enough information to answer the question, please state that clearly.
        
        Answer:
        """
    
    def run(self, query):
        """
        Run the QA chain with the given query
        """
        # Retrieve relevant documents
        if self.vectorstore:
            retriever = self.vectorstore.as_retriever(search_kwargs={"k": self.k})
            docs = retriever.get_relevant_documents(query)
            context = "\n\n".join([doc.page_content for doc in docs])
        else:
            context = "No context provided"
        
        # Format the prompt
        formatted_prompt = self.prompt_template.format(
            context=context,
            question=query
        )
        
        # Generate response using Gemini
        try:
            response = self.model.generate_content(
                formatted_prompt,
                generation_config={
                    "temperature": 0,
                    "top_p": 1,
                    "top_k": 1
                }
            )
            return response.text
        except Exception as e:
            return f"Error generating response: {str(e)}"


In [None]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# 2. Instantiate vectorstore without upserting data
vectorstore = PineconeVectorStore(
    index_name=index_name,
    embedding=embeddings
)

In [None]:
custom_prompt = """
Bağlam: {context}

Soru: {question}

Lütfen yukarıdaki bağlam bilgilerine dayanarak kapsamlı bir yanıt verin. Türkçe yanıtlayın.

Yanıt:
"""

# --- User question ---
query = "Türkiye büyük millet meclisi ile İspanya parlamentosu arasındaki en önemli farklar nelerdir?"

#### Gemini 2.5 Pro

In [None]:
# Option 1: Using Gemini 2.5 Pro (more capable, slower)
qa_chain_pro = GeminiQAChain(
    model_name="gemini-2.5-pro",
    vectorstore=vectorstore,
    k=4,
    prompt_template=custom_prompt
)

result_pro = qa_chain_pro.run(query)
print("\nGemini 2.5 Pro Yanıtı:\n", result_pro)

#### Gemini 2.5 Flash

In [None]:
# Option 2: Using Gemini 2.5 Flash (faster, good performance)
qa_chain_flash = GeminiQAChain(
    model_name="gemini-2.5-flash",
    vectorstore=vectorstore,  # Your existing vectorstore
    k=4,
    prompt_template=custom_prompt
)

result_flash = qa_chain_flash.run(query)
print("\nGemini 2.5 Flash Yanıtı:\n", result_flash)

#### Gemini 2.5 Flash-Lite

In [None]:
# Option 2: Using Gemini 2.5 Flash (faster, good performance)
qa_chain_flash = GeminiQAChain(
    model_name="gemini-2.5-flash-lite",
    vectorstore=vectorstore,  # Your existing vectorstore
    k=4,
    prompt_template=custom_prompt
)

result_flash = qa_chain_flash.run(query)
print("\nGemini 2.5 Flash-Lite Yanıtı:\n", result_flash)

### DeepSeek

In [None]:
import openai
from openai import OpenAI

In [None]:
# Configure DeepSeek API (uses OpenAI-compatible interface)
deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')

client = OpenAI(
    api_key=deepseek_api_key,
    base_url="https://api.deepseek.com"
)


In [None]:
class DeepSeekQAChain:
    def __init__(self, model_name="deepseek-chat", vectorstore=None, k=4, prompt_template=None, temperature=0):
        """
        Initialize DeepSeek QA Chain
        
        Args:
            model_name: "deepseek-chat" (main model)
            vectorstore: Your vector store
            k: Number of documents to retrieve
            prompt_template: Custom prompt template
            temperature: 0 for deterministic, higher for creative
        """
        self.client = client
        self.model_name = model_name
        self.vectorstore = vectorstore
        self.k = k
        self.temperature = temperature
        self.prompt_template = prompt_template or self._default_prompt_template()
    
    def _default_prompt_template(self):
        return """
        Context: {context}
        
        Question: {question}
        
        Please provide a comprehensive answer based on the context provided above. If the context doesn't contain enough information to answer the question, please state that clearly.
        
        Answer:
        """
    
    def run(self, query):
        """
        Run the QA chain with the given query
        """
        # Retrieve relevant documents
        if self.vectorstore:
            retriever = self.vectorstore.as_retriever(search_kwargs={"k": self.k})
            docs = retriever.get_relevant_documents(query)
            context = "\n\n".join([doc.page_content for doc in docs])
        else:
            context = "No context provided"
        
        # Format the prompt
        formatted_prompt = self.prompt_template.format(
            context=context,
            question=query
        )
        
        # Generate response using DeepSeek
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "user", "content": formatted_prompt}
                ],
                temperature=self.temperature,
                max_tokens=2048,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
                stream=False
            )
            return response.choices[0].message.content
        except Exception as e:
            return f"Error generating response: {str(e)}"


In [None]:
# --- Custom prompt for Turkish content ---
custom_prompt = """
Bağlam: {context}

Soru: {question}

Lütfen yukarıdaki bağlam bilgilerine dayanarak detaylı ve kapsamlı bir yanıt ver. Yanıtınızı Türkçe olarak yaz ve mümkün olduğunca objektif bir şekilde karşılaştırma yap.

Yanıt:
"""

#### DeepSeek Chat

In [None]:
# Using DeepSeek Chat (general purpose, excellent reasoning)
qa_chain_chat = DeepSeekQAChain(
    model_name="deepseek-chat",
    vectorstore=vectorstore,  # Your existing vectorstore
    k=4,
    temperature=0,
    prompt_template=custom_prompt
)


In [None]:
# --- User question ---
query = "Türkiye büyük millet meclisi ile İspanya parlamentosu arasındaki en önemli farklar nelerdir?"

# --- Get Answer ---
# Using DeepSeek Chat
result_chat = qa_chain_chat.run(query)
print("\nDeepSeek Chat Yanıtı:\n", result_chat)