In [1]:
import pdfplumber
import json
from pathlib import Path
from typing import Union, Dict


def extract_text_from_pdf(pdf_path: Union[str, Path]) -> str:
    """
    Extract all text from a PDF file using pdfplumber.
    
    Args:
        pdf_path: Path to the PDF file
        
    Returns:
        Extracted text as a single string
    """
    pdf_path = Path(pdf_path)
    
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")
    
    text = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text.append(page_text)
    
    return '\n\n'.join(text)


def clean_text(text: str) -> str:
    """
    Basic text cleaning for RAG ingestion.
    
    Args:
        text: Raw extracted text
        
    Returns:
        Cleaned text
    """
    import re
    
    # Remove hyphen patterns between any letters (including Turkish characters)
    text = re.sub(r'(\w)\s*-\s*(\w)', r'\1\2', text)
    
    # Add space between lowercase letter followed by uppercase letter
    text = re.sub(r'([a-zçğıöşü])([A-ZÇĞİÖŞÜ])', r'\1 \2', text)
    
    # Remove excessive whitespace
    lines = [line.strip() for line in text.split('\n')]
    lines = [line for line in lines if line]
    
    # Join with single newlines
    cleaned = '\n'.join(lines)
    
    # Remove multiple spaces
    cleaned = ' '.join(cleaned.split())
    
    return cleaned


def save_as_json(text: str, output_path: Path, metadata: Dict = None) -> None:
    """
    Save extracted text as JSON with metadata.
    
    Args:
        text: Processed text
        output_path: Path for output JSON file
        metadata: Optional metadata dict
    """
    data = {
        "text": text,
        "char_count": len(text),
        "metadata": metadata or {}
    }
    
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def save_as_text(text: str, output_path: Path) -> None:
    """
    Save extracted text as plain text file.
    
    Args:
        text: Processed text
        output_path: Path for output text file
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(text)


def process_pdf_directory(
    input_dir: str = r"C:\Users\yigit\Desktop\Enterprises\arayuz-9\pdfs",
    output_dir: str = r"C:\Users\yigit\Desktop\Enterprises\arayuz-9\text",
    output_format: str = "json",  # "json" or "txt"
    clean: bool = True
) -> None:
    """
    Process all PDFs in a directory and save to output directory.
    
    Args:
        input_dir: Directory containing PDF files
        output_dir: Directory to save processed files
        output_format: Output format ("json" or "txt")
        clean: Whether to apply text cleaning
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    
    # Create output directory if it doesn't exist
    output_path.mkdir(parents=True, exist_ok=True)
    
    # Find all PDF files
    pdf_files = list(input_path.glob("*.pdf"))
    
    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return
    
    print(f"Found {len(pdf_files)} PDF files to process\n")
    
    # Process each PDF
    for pdf_file in pdf_files:
        try:
            print(f"Processing: {pdf_file.name}...", end=" ")
            
            # Extract and clean text
            text = extract_text_from_pdf(pdf_file)
            
            if clean:
                text = clean_text(text)
            
            # Prepare output filename
            output_filename = pdf_file.stem
            
            # Save in chosen format
            if output_format == "json":
                output_file = output_path / f"{output_filename}.json"
                metadata = {
                    "source_file": pdf_file.name,
                    "source_path": str(pdf_file)
                }
                save_as_json(text, output_file, metadata)
            else:
                output_file = output_path / f"{output_filename}.txt"
                save_as_text(text, output_file)
            
            print(f"✓ Saved to {output_file.name} ({len(text)} chars)")
            
        except Exception as e:
            print(f"✗ Error: {e}")
    
    print(f"\nProcessing complete! Files saved to: {output_dir}")


# Run the processor
if __name__ == "__main__":
    # Process all PDFs and save as JSON (recommended for RAG - includes metadata)
    process_pdf_directory(output_format="json")
    
    # Or save as plain text files
    # process_pdf_directory(output_format="txt")

Found 4 PDF files to process

Processing: 1) Temel kavramlar önyargı, kalıpyargı ve ayrımcılık.pdf... ✓ Saved to 1) Temel kavramlar önyargı, kalıpyargı ve ayrımcılık.json (30688 chars)
Processing: 2) Ayrımcılık ve medya.pdf... ✓ Saved to 2) Ayrımcılık ve medya.json (39186 chars)
Processing: 3) Toplumsal Cinsiyete Dayalı Ayrımcılık.pdf... ✓ Saved to 3) Toplumsal Cinsiyete Dayalı Ayrımcılık.json (30249 chars)
Processing: 4) Uluslararası Af Örgütü Raporu 2021-2022 Avrupa ve Orta Asya Değerlendirmesi(sayfa 46-54).pdf... ✓ Saved to 4) Uluslararası Af Örgütü Raporu 2021-2022 Avrupa ve Orta Asya Değerlendirmesi(sayfa 46-54).json (31836 chars)

Processing complete! Files saved to: C:\Users\yigit\Desktop\Enterprises\arayuz-9\text


In [3]:
# Run the processor
if __name__ == "__main__":
    # Process all PDFs and save as JSON (recommended for RAG - includes metadata)
    process_pdf_directory(output_format="json")
    
    # Or save as plain text files
    # process_pdf_directory(output_format="txt")

Found 89 PDF files to process

Processing: 1) Temel kavramlar önyargı, kalıpyargı ve ayrımcılık.pdf... ✓ Saved to 1) Temel kavramlar önyargı, kalıpyargı ve ayrımcılık.json (30688 chars)
Processing: 10) TÜRKİYE’DE ÖRGÜTLENME ÖZGÜRLÜĞÜNÜN GENEL GÖRÜNÜMÜ-II .pdf... ✓ Saved to 10) TÜRKİYE’DE ÖRGÜTLENME ÖZGÜRLÜĞÜNÜN GENEL GÖRÜNÜMÜ-II .json (173146 chars)
Processing: 11) Yurttaslik_Alani_Bilgi_Notu_1.pdf... ✓ Saved to 11) Yurttaslik_Alani_Bilgi_Notu_1.json (26995 chars)
Processing: 12) TERÖRLE MÜCADELEYİ ARAÇSALLAŞTIRMAK.pdf... ✓ Saved to 12) TERÖRLE MÜCADELEYİ ARAÇSALLAŞTIRMAK.json (90302 chars)
Processing: 13) PROTESTO HAKKINI KORU.pdf... ✓ Saved to 13) PROTESTO HAKKINI KORU.json (230476 chars)
Processing: 14) KomploTeorileri_AR_23.03.23_web.pdf... ✓ Saved to 14) KomploTeorileri_AR_23.03.23_web.json (78319 chars)
Processing: 15) Feminist_Hareketin_Gundemleri_.pdf... ✓ Saved to 15) Feminist_Hareketin_Gundemleri_.json (46519 chars)
Processing: 16) Sivil Toplum Kuruluşlarının Devlet Tarafında

Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBB

✓ Saved to 19) Kampüsten Öğrenci Toplulukları .json (398702 chars)
Processing: 2) Ayrımcılık ve medya.pdf... 

Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats


✓ Saved to 2) Ayrımcılık ve medya.json (39186 chars)
Processing: 20) Gençler Ne(ler) İstiyor_ .pdf... 

Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats


✓ Saved to 20) Gençler Ne(ler) İstiyor_ .json (26763 chars)
Processing: 21) Türkiye’de Gençlik ve Siyaset_ Gelecek İçin Nasıl Bir Katılım_ .pdf... ✓ Saved to 21) Türkiye’de Gençlik ve Siyaset_ Gelecek İçin Nasıl Bir Katılım_ .json (66638 chars)
Processing: 22) Gençlik Araştırmaları Dergisi 13.sayı.pdf... ✓ Saved to 22) Gençlik Araştırmaları Dergisi 13.sayı.json (556468 chars)
Processing: 23) Türkiye_de Gençlik Miti 1980 Sonrası Türkiye Gençliği İletişim Yayınları.pdf... ✓ Saved to 23) Türkiye_de Gençlik Miti 1980 Sonrası Türkiye Gençliği İletişim Yayınları.json (433421 chars)
Processing: 24) Türkiye’nin Gençliği Araştırması Raporu -SODEV- .pdf... 

Cannot set gray non-stroke color because /'P275' is an invalid float value


✓ Saved to 24) Türkiye’nin Gençliği Araştırması Raporu -SODEV- .json (7739 chars)
Processing: 25) Türkiye’de Gençlerin Güvencesizliği_ Çalışma, Geçim ve Yaşam Algısı.pdf... ✓ Saved to 25) Türkiye’de Gençlerin Güvencesizliği_ Çalışma, Geçim ve Yaşam Algısı.json (152567 chars)
Processing: 26) Toplumun Boğaziçi Üniversitesi Olaylarına Bakışı.pdf... 

Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P6' is an invalid float value
Cannot set gray non-stroke color because /'P7' is an invalid float value
Cannot set gray non-stroke color because /'P8' is an invalid float value
Cannot set gray non-stroke color because /'P9' is an invalid float value
Cannot set gray non-stroke color because /'P10' is an invalid float value
Cannot set gray non-stroke color because /'P11' is an invalid float value
Cannot set gray non-stroke color because /'P12' is an invalid float value
Cannot set gray non-stroke color because /'P13' 

✓ Saved to 26) Toplumun Boğaziçi Üniversitesi Olaylarına Bakışı.json (32694 chars)
Processing: 27) Kürt Gençler’20 Benzerlikler Farklar Değişimler.pdf... 

Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P6' is an invalid float value
Cannot set gray non-stroke color because /'P7' is an invalid float value
Cannot set gray non-stroke color because /'P8' is an invalid float value
Cannot set gray non-stroke color because /'P9' is an invalid float value
Cannot set gray non-stroke color because /'P10' is an invalid float value
Cannot set gray non-stroke color because /'P11' is an invalid float value
Cannot set gray non-stroke color because /'P12' is an invalid float value
Cannot set gray non-stroke color because /'P13' 

✓ Saved to 27) Kürt Gençler’20 Benzerlikler Farklar Değişimler.json (111196 chars)
Processing: 28) NEET Gençler Araştırması – NEET Gençlerin İnsan Onuruna Yaraşır Yaşam Sürme Hakkına Erişimi.pdf... ✓ Saved to 28) NEET Gençler Araştırması – NEET Gençlerin İnsan Onuruna Yaraşır Yaşam Sürme Hakkına Erişimi.json (245763 chars)
Processing: 29) TGSP Türkiye’nin Gençleri Araştırması.pdf.pdf... 

Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value


✓ Saved to 29) TGSP Türkiye’nin Gençleri Araştırması.pdf.json (66812 chars)
Processing: 3) Toplumsal Cinsiyete Dayalı Ayrımcılık.pdf... ✓ Saved to 3) Toplumsal Cinsiyete Dayalı Ayrımcılık.json (30249 chars)
Processing: 30) TOG Gençlik Çalışmasının Toplumsal Katılıma Etkisi Araştırması.pdf... ✓ Saved to 30) TOG Gençlik Çalışmasının Toplumsal Katılıma Etkisi Araştırması.json (170706 chars)
Processing: 31) Türkiye’de Gençlerin İyi Olma Hali Saha Araştırması Bulguları- HABİTAT- .pdf... ✓ Saved to 31) Türkiye’de Gençlerin İyi Olma Hali Saha Araştırması Bulguları- HABİTAT- .json (203949 chars)
Processing: 32) Türkiye Gençlik Araştırması 2021.pdf... ✓ Saved to 32) Türkiye Gençlik Araştırması 2021.json (789221 chars)
Processing: 33) Türkiye’nin Gençliği Araştırması Raporu -SODEV- 2020.pdf... ✓ Saved to 33) Türkiye’nin Gençliği Araştırması Raporu -SODEV- 2020.json (9099 chars)
Processing: 34)Uluslararası Af Örgütü.pdf... ✓ Saved to 34)Uluslararası Af Örgütü.json (208862 chars)
Processing: 37) P

Cannot set gray non-stroke color because /'P187' is an invalid float value


✓ Saved to 78) TOG Üniversiteli Gençlerin İhtiyaçları Araştırması 2024.json (45542 chars)
Processing: 79) Haberlerdeki Üniversite 2022.pdf... ✓ Saved to 79) Haberlerdeki Üniversite 2022.json (70979 chars)
Processing: 8) genc-oy-strateji-rapor.pdf... ✓ Saved to 8) genc-oy-strateji-rapor.json (133740 chars)
Processing: 80) TOG Gençlerin İhtiyaçları Araştırması 2022.pdf... ✓ Saved to 80) TOG Gençlerin İhtiyaçları Araştırması 2022.json (16198 chars)
Processing: 81) Yereliz GENÇLİK ALANINDA ÇALIŞAN SİVİL TOPLUM ÖRGÜTLERİ İÇİN YEREL SAVUNUCULUK REHBERİ.pdf... ✓ Saved to 81) Yereliz GENÇLİK ALANINDA ÇALIŞAN SİVİL TOPLUM ÖRGÜTLERİ İÇİN YEREL SAVUNUCULUK REHBERİ.json (58555 chars)
Processing: 82) KONDA Barometre 2024.pdf... ✓ Saved to 82) KONDA Barometre 2024.json (39283 chars)
Processing: 83) OECD Youth Policy Toolkit.pdf... ✓ Saved to 83) OECD Youth Policy Toolkit.json (550241 chars)
Processing: 84) TİP_li Öğrenciler Barınma Raporu 2023.pdf... ✓ Saved

Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke co

✓ Saved to 90) FES Genclerin Gözünden Dindar-Seküler Eksenli Kutuplaşma.json (404742 chars)
Processing: 91) Veriler.pdf... ✓ Saved to 91) Veriler.json (17127 chars)

Processing complete! Files saved to: C:\Users\yigit\Desktop\Enterprises\arayuz-9\text
