In [1]:
import base64
import os
import re
from pathlib import Path
from mistralai import Mistral
from dotenv import load_dotenv

def encode_pdf(pdf_path):
    """Read a PDF from disk and return its contents as a base64 string.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        The file's bytes, base64-encoded and decoded to a ``str``, or
        ``None`` when the file is missing or any other error occurs. Errors
        are printed rather than raised.
    """
    try:
        with open(pdf_path, "rb") as handle:
            raw_bytes = handle.read()
            encoded = base64.b64encode(raw_bytes)
            return encoded.decode('utf-8')
    except FileNotFoundError:
        print(f"Error: The file {pdf_path} was not found.")
    except Exception as err:
        print(f"Error: {err}")
    # Both failure paths fall through to here.
    return None

def remove_image_references(text):
    """Strip markdown image tags of the form ![img-N.jpeg](img-N.jpeg).

    Once the tags are dropped, any run of three or more newlines is
    collapsed to exactly two so the removal does not leave large gaps.
    """
    # N is one or more digits; only the literal ``.jpeg`` extension matches.
    image_tag = re.compile(r'!\[img-\d+\.jpeg\]\(img-\d+\.jpeg\)')
    blank_run = re.compile(r'\n{3,}')
    without_images = image_tag.sub('', text)
    return blank_run.sub('\n\n', without_images)

# Load environment variables
load_dotenv()

# Setup: the API key is read from the environment (after load_dotenv());
# a missing MISTRAL_API_KEY raises KeyError here, before any work starts.
api_key = os.environ["MISTRAL_API_KEY"]
client = Mistral(api_key=api_key)

pdf_folder = Path("pdfs")
output_folder = Path("text-mistral")
output_folder.mkdir(exist_ok=True)

# Process each PDF in name-sorted order so every run visits files in the
# same sequence regardless of the order the filesystem lists them.
for pdf_file in sorted(pdf_folder.glob("*.pdf")):
    # An existing output file marks the PDF as done, which makes the script
    # safe to re-run after an interruption or a partial failure.
    output_file = output_folder / f"{pdf_file.stem}.txt"
    if output_file.exists():
        print(f"Skipping: {pdf_file.name} (already processed)")
        continue

    print(f"Processing: {pdf_file.name}")

    # Encode PDF to base64; encode_pdf prints its own error and returns None
    # on failure, so just move on to the next file.
    base64_pdf = encode_pdf(pdf_file)
    if base64_pdf is None:
        continue

    # OCR with Mistral. A failure on one document (network error, rate limit,
    # rejected file, ...) must not abort the rest of the batch: report it and
    # continue. No output file is written, so the PDF is retried next run.
    # NOTE(review): only page.markdown is read below, so the base64 image
    # payloads requested via include_image_base64=True look unused; consider
    # turning it off after confirming against the OCR API docs.
    try:
        ocr_response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": f"data:application/pdf;base64,{base64_pdf}"
            },
            include_image_base64=True
        )
    except Exception as e:
        print(f"Error: OCR failed for {pdf_file.name}: {e}")
        continue

    # Extract markdown text from all pages and concatenate with line breaks
    extracted_text = "\n\n".join(page.markdown for page in ocr_response.pages)

    # Remove image references
    cleaned_text = remove_image_references(extracted_text)

    # Save extracted text as .txt file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(cleaned_text)

    print(f"✓ Saved to: {output_file.name}")

print("\nDone!")


Skipping: 1) Temel kavramlar önyargı, kalıpyargı ve ayrımcılık.pdf (already processed)
Skipping: 10) TÜRKİYE’DE ÖRGÜTLENME ÖZGÜRLÜĞÜNÜN GENEL GÖRÜNÜMÜ-II .pdf (already processed)
Skipping: 11) Yurttaslik_Alani_Bilgi_Notu_1.pdf (already processed)
Skipping: 12) TERÖRLE MÜCADELEYİ ARAÇSALLAŞTIRMAK.pdf (already processed)
Skipping: 13) PROTESTO HAKKINI KORU.pdf (already processed)
Skipping: 14) KomploTeorileri_AR_23.03.23_web.pdf (already processed)
Skipping: 15) Feminist_Hareketin_Gundemleri_.pdf (already processed)
Skipping: 16) Sivil Toplum Kuruluşlarının Devlet Tarafından Finansmanı Üzerine Bir Tartışma.pdf (already processed)
Skipping: 17) Gençlik Politikalarında Karşılaştırmalı Bir Değerlendirme-Türkiye ve Finlandiya Örneği.pdf (already processed)
Skipping: 18) Avrupa Konseyi Politik Karar Alma Süreçlerine Sivil Katılım Rehberi Çevirisi.pdf (already processed)
Skipping: 19) Kampüsten Öğrenci Toplulukları .pdf (already processed)
Skipping: 2) Ayrımcılık ve medya.pdf (already processe