In [None]:
from google import genai
from google.genai import types
import dotenv
import mammoth
import os
import unicodedata
import re
import time

# Load variables from the nearest .env file into the process environment.
dotenv.load_dotenv()

# Read GEMINI_API_KEY straight from the .env file located by find_dotenv()
# and hand it to the Gemini client used by the rest of the notebook.
env_file = dotenv.find_dotenv()
client = genai.Client(api_key=dotenv.get_key(env_file, 'GEMINI_API_KEY'))

## Sanitize filenames and convert docx to md

In [None]:

def sanitize_filename(filename):
    """Return an ASCII-only version of ``filename`` with a lower-cased extension.

    The stem (everything before the last extension) is processed in three steps:

    1. A fixed set of characters is substituted with ASCII letters. These are
       characters the ASCII fold in step 2 would otherwise drop or map to a
       different letter (presumably mis-encoded Latin letters with diacritics
       -- TODO confirm the intended source encoding).
    2. The result is NFKD-normalised and every non-ASCII code point is dropped,
       which strips accents from letters such as ``č`` or ``ž``.
    3. Anything that is not an ASCII letter, digit, space, hyphen or underscore
       is removed, and surrounding whitespace is stripped.

    The extension is only lower-cased; it is not otherwise cleaned.

    Args:
        filename: File name (not a full path) to clean.

    Returns:
        The cleaned stem followed by the lower-cased extension. The stem may be
        empty if every character was removed.
    """
    stem, extension = os.path.splitext(filename)

    # Single-pass substitution; none of the replacement letters is itself a key,
    # so this matches applying the substitutions one after another.
    substitution_table = str.maketrans({
        'þ': 's',
        'µ': 's',
        'ƒ': 'c',
        'đ': 'dz',
        'ð': 'z',
        'º': 'z',
    })
    stem = stem.translate(substitution_table)

    # Decompose accented letters, then throw away whatever is not plain ASCII.
    decomposed = unicodedata.normalize('NFKD', stem)
    ascii_stem = decomposed.encode('ascii', 'ignore').decode('ascii')

    # Keep only letters, digits, spaces, hyphens and underscores.
    ascii_stem = re.sub(r'[^a-zA-Z0-9 \-_]', '', ascii_stem)

    return ascii_stem.strip() + extension.lower()

directory = 'sources/'

# Rename every regular file in `directory` to its sanitized name.
# Sub-directories are left alone, files whose name is already clean are not
# touched, and an existing file is never overwritten.
if not os.path.exists(directory):
    print(f'Directory "{directory}" does not exist.')
else:
    print(f'Renaming: {directory}\n')

    for filename in os.listdir(directory):
        old_path = os.path.join(directory, filename)
        if not os.path.isfile(old_path):
            continue

        new_filename = sanitize_filename(filename)
        new_path = os.path.join(directory, new_filename)
        if new_path == old_path:
            # Name is already clean.
            continue

        # A failure on one file is reported and the loop moves on to the next.
        try:
            if not os.path.exists(new_path):
                os.rename(old_path, new_path)
                print(f'Renamed: {filename} -> {new_filename}')
            else:
                print(f'Warning: File {new_filename} already exists. Skipping.')
        except Exception as exc:
            print(f'Error renaming {filename}: {exc}')

    print('\nRenaming completed.')

In [None]:

# Configuration: folder scanned for .docx files to convert.
folder = 'sources'


def ignore_images(image):
    """Image handler passed to mammoth as ``convert_image``.

    Ignores ``image`` and returns a new empty list on every call
    (presumably so that no image output ends up in the Markdown -- confirm
    against the mammoth documentation).
    """
    return list()


# Convert every .docx in `folder` to a Markdown file stored next to it.
for filename in os.listdir(folder):
    if not filename.endswith(".docx"):
        continue

    docx_path = os.path.join(folder, filename)
    md_path = os.path.join(folder, filename.replace(".docx", ".md"))

    # Read and convert first; the Markdown file is only created once the
    # conversion has produced a result.
    with open(docx_path, "rb") as docx_file:
        result = mammoth.convert_to_markdown(docx_file, convert_image=ignore_images)
    markdown = result.value  # the generated Markdown text

    with open(md_path, "w", encoding="utf-8") as md_file:
        md_file.write(markdown)

    print(f"Converted: {filename}")

print("Done!")

## Uploading to the File Search store

In [66]:
# Create the File Search store that will hold the uploaded documents.
# Original author's note: the file name will be visible in citations.
store_name = 'ai-fact-checker-ZPPKZ-documents'
store_config = {'display_name': store_name}
file_search_store = client.file_search_stores.create(config=store_config)

In [None]:
# Print every File Search store returned by the client's list call.
all_stores = client.file_search_stores.list()
for store in all_stores:
    print(store)

In [None]:
# Upload every PDF and DOCX found in `directory` to the File Search store.
# A DOCX is uploaded as its Markdown sibling (same name with ".md" instead of
# ".docx"), while the original filename is kept as the display name.

# Resume switch: entries listed before 'KME_Igor.docx' are skipped and
# uploading starts at that file.
# NOTE(review): this relies on os.listdir() yielding the same order as the
# run being resumed, and uploads nothing if that file is absent -- confirm.
uploading = False
for filename in os.listdir(directory):
    if filename == 'KME_Igor.docx':
        uploading = True

    if not uploading:
        print(f'Skipping: {filename}')
        continue

    if filename.endswith(".pdf") or filename.endswith(".docx"):
        # for docx, upload md version
        if filename.endswith(".docx"):
            file_to_upload = os.path.join(directory, filename.replace(".docx", ".md"))
        else:
            file_to_upload = os.path.join(directory, filename)

        print(f'Uploading: {filename}, file: {file_to_upload}')

        operation = client.file_search_stores.upload_to_file_search_store(
            file=file_to_upload,
            file_search_store_name=file_search_store.name,
            config={
                'display_name' : filename
            }
        )

        # Poll until the upload operation has finished. The condition must
        # depend on `done` only: a failed operation is also "done", so adding
        # `or operation.error` here would keep polling a failed upload forever
        # and the error report below would never be reached.
        while not operation.done:
            time.sleep(2)
            operation = client.operations.get(operation)

        if operation.error:
            print(f'Error uploading {filename}: {operation.error}')
        else:
            print(f'Uploaded: {filename}')

print('All files uploaded.')

In [None]:
# Print every document currently held in the File Search store.
store_documents = client.file_search_stores.documents.list(parent=file_search_store.name)
for document_in_store in store_documents:
    print(document_in_store)

In [None]:
# client.file_search_stores.delete(name=file_search_store.name, config={'force': True})
# client.file_search_stores.delete(name='fileSearchStores/aifactcheckerzppkzdocuments-s32ne3nci4pn', config={'force': True})
