In [2]:
import os
import shutil
import pandas as pd

# Load the list of poetry titles
# The path is relative to the notebook's working directory. Later cells read the
# 'jaar' and 'ti_id' columns of this table.
poetry_file_path = 'filtered_texts_1600_1700.csv' 
df = pd.read_csv(poetry_file_path)


In [3]:
# Keep only the rows whose 'jaar' value lies between 1610 and 1670, both ends included.
in_period = df['jaar'].between(1610, 1670)
filtered_df = df[in_period]
print(len(filtered_df))

707


In [4]:
import os
import shutil
import pandas as pd


# Identifiers of the selected titles, stored in a set for fast membership tests
file_list = set(filtered_df['ti_id'].dropna().unique())

# Where the XML files live and where the selected ones are copied to
source_folder = 'dbnl_xml'  # Update with the actual path of the XML files folder
destination_folder = '17thcentury_files' # Update with the actual path for output

# Create the destination folder when it is missing
os.makedirs(destination_folder, exist_ok=True)

# Walk the source folder and copy every file whose identifier was selected
for filename in os.listdir(source_folder):
    # Identifier = text before the first ".xml", cut again at the first underscore
    base_name = filename.partition(".xml")[0].partition("_")[0]

    if base_name not in file_list:
        continue

    source_path = os.path.join(source_folder, filename)
    destination_path = os.path.join(destination_folder, filename)
    shutil.copy2(source_path, destination_path)
    print(f"Copied {filename} to {destination_folder}")


Copied poir001afbe01_01.xml to 17thcentury_files
Copied rode001hert01_01.xml to 17thcentury_files
Copied stal001extr01_01.xml to 17thcentury_files
Copied jonc006hede01_01.xml to 17thcentury_files
Copied heyn003wegw01_01.xml to 17thcentury_files
Copied maer005stic01_01.xml to 17thcentury_files
Copied pass004spie01_01.xml to 17thcentury_files
Copied camp001uytb01_01.xml to 17thcentury_files
Copied stal001evan01_01.xml to 17thcentury_files
Copied ocke003heme01_01.xml to 17thcentury_files
Copied ruys008flor01_01.xml to 17thcentury_files
Copied crus001epig01_01.xml to 17thcentury_files
Copied thie008scha01_01.xml to 17thcentury_files
Copied haef001lust01_01.xml to 17thcentury_files
Copied gabb001lykt01_01.xml to 17thcentury_files
Copied hube012psal01_01.xml to 17thcentury_files
Copied bolo001ghee01_01.xml to 17thcentury_files
Copied merw001uyth01_01.xml to 17thcentury_files
Copied roch007natu01_01.xml to 17thcentury_files
Copied boel009onee01_01.xml to 17thcentury_files
Copied momm004brab02

In [5]:
# Sanity check: count the entries present in the destination folder.
# os.listdir returns every entry, so anything already in the folder is counted too.
num_files = len(os.listdir(destination_folder))
print(f"Number of files in {destination_folder}: {num_files}")

Number of files in 17thcentury_files: 320


In [6]:
from dbnl_bear import parse

# Create the DBNL parser instance
parser = parse.DBNLParser()

# Run the main parsing function on the copied XML files; results go to output_dir.
# NOTE(review): presumably writes one plain-text file per XML file -- confirm against
# the dbnl_bear documentation.
parser.dbnl_to_txt(input_dir="17thcentury_files", output_dir="tulips")


  0%|          | 0/320 [00:00<?, ?it/s]

100%|██████████| 320/320 [00:00<00:00, 536.81it/s]


Directory tulips created


100%|██████████| 320/320 [00:10<00:00, 29.96it/s]


In [None]:

from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

True

In [2]:
import os
import asyncio
from langchain_openai import ChatOpenAI
from tqdm.asyncio import tqdm
from typing import Optional
from pydantic import BaseModel, Field, create_model
from langchain_core.prompts import ChatPromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Splitter used by analyze_document (as its default) to cut a document into fragments.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,  # maximum fragment size, measured with length_function (len => characters)
    chunk_overlap=20,  # amount of text shared between consecutive fragments
    length_function=len,
    is_separator_regex=False,  # separators are treated as literal strings, not regular expressions
)

def create_model_name(phenomenon_of_interest: str) -> str:
    """Turn a free-text phenomenon into one concatenated, capitalised name.

    The text is split on whitespace, every word goes through ``str.capitalize``
    (first character upper-cased, the rest lower-cased) and the words are joined
    without a separator, e.g. ``"tulip mania"`` -> ``"TulipMania"``.
    """
    capitalised_words = []
    for word in phenomenon_of_interest.split():
        capitalised_words.append(word.capitalize())
    return ''.join(capitalised_words)

def create_llm_analysis_model(phenomenon_of_interest: str) -> type[BaseModel]:
    """
    Build the pydantic model class that is handed to the LLM as its output schema.

    Unlike the model built by create_full_analysis_model, this one has no
    original_sentence field: letting the LLM echo its input back would be a waste
    of resources.

    Args:
        phenomenon_of_interest: Research topic; it is interpolated into the field
            descriptions and, via create_model_name, into the model name.

    Returns:
        A dynamically created model class (not an instance) named
        "<ModelName>InText" with the fields ``explanation`` (str) and
        ``judgement`` (bool).
    """
    model_name = create_model_name(phenomenon_of_interest)
    return create_model(
        f"{model_name}InText",
        explanation=(str, Field(description=f"Explain whether the sentence contains information about {phenomenon_of_interest}")),
        judgement=(bool, Field(description=f"Whether the sentence contains information about {phenomenon_of_interest}"))
    )


def create_full_analysis_model(phenomenon_of_interest: str) -> type[BaseModel]:
    """
    Build the pydantic model class used for the final, per-fragment result.

    It has the same fields as the model from create_llm_analysis_model plus
    original_sentence, which holds the analysed text itself.

    Args:
        phenomenon_of_interest: Research topic; it is interpolated into the field
            descriptions and, via create_model_name, into the model name.

    Returns:
        A dynamically created model class (not an instance) named
        "<ModelName>InText" with the fields ``explanation`` (str),
        ``judgement`` (bool) and ``original_sentence`` (str).
    """
    model_name = create_model_name(phenomenon_of_interest)
    return create_model(
        f"{model_name}InText",
        explanation=(str, Field(description=f"Explain whether the sentence contains information about {phenomenon_of_interest}")),
        judgement=(bool, Field(description=f"Whether the sentence contains information about {phenomenon_of_interest}")),
        original_sentence=(str, Field(description="The original sentence from the document"))
    )

def get_system_prompt(phenomenon_of_interest: str) -> str:
    """Return the system prompt for the relevance-judging LLM.

    The prompt spans three lines and mentions ``phenomenon_of_interest`` twice;
    the second and third lines start with four spaces.
    """
    first_line = (
        f"I am a Cultural Historian and Literary Scholar interested in  {phenomenon_of_interest}. "
        "Your task is to read sentences in Early Modern Dutch and indicate whether, \n"
    )
    second_line = (
        "    given my research interest, the sentence is relevant to my research. "
        "You should provide a clear explanation, a boolean judgement, and details about\n"
    )
    third_line = f"    {phenomenon_of_interest} if present."
    return first_line + second_line + third_line

async def analyze_document(input_file: str, phenomenon_of_interest: str, text_splitter=text_splitter,
                           model="gpt-4o-mini-2024-07-18", max_fragment_tasks=10):
    """Split one text file into fragments and have the LLM judge every fragment.

    Args:
        input_file: Path of the UTF-8 text file to analyse.
        phenomenon_of_interest: Research topic used in the prompt and the models.
        text_splitter: Object whose ``split_text`` cuts the document into fragments.
        model: Name of the OpenAI chat model.
        max_fragment_tasks: Upper bound on fragments analysed at the same time.

    Returns:
        What ``tqdm.gather`` yields for the per-fragment ``analyze_sentence`` calls.
        NOTE(review): analyze_sentence is not defined in the visible cells;
        presumably it returns instances of the full analysis model -- confirm.

    Raises:
        ValueError: If the OPENAI_API_KEY environment variable is unset or empty.
    """
    with open(input_file, 'r', encoding='utf-8') as handle:
        document_text = handle.read()

    fragments = text_splitter.split_text(document_text)

    # Only the presence of the key is checked here; it is not passed on explicitly.
    if not os.getenv("OPENAI_API_KEY"):
        raise ValueError("API key not found. Install python-dotenv, make .env style and save the API key there as: OPENAI_API_KEY=your-api-key-here")

    chat_model = ChatOpenAI(model=model)
    llm_schema = create_llm_analysis_model(phenomenon_of_interest)
    full_schema = create_full_analysis_model(phenomenon_of_interest)

    structured_chat_model = chat_model.with_structured_output(llm_schema)

    messages = [
        ("system", get_system_prompt(phenomenon_of_interest)),
        ("human", "{input}"),
    ]
    chain = ChatPromptTemplate.from_messages(messages) | structured_chat_model

    # At most max_fragment_tasks fragments hold the semaphore at any moment
    limiter = asyncio.Semaphore(max_fragment_tasks)

    async def run_limited(fragment):
        async with limiter:
            return await analyze_sentence(fragment, chain, full_schema)

    # tqdm.gather shows a progress bar while the fragment coroutines complete
    return await tqdm.gather(*(run_limited(fragment) for fragment in fragments))


In [3]:
import nest_asyncio
# Patch asyncio so that an event loop can be re-entered; a later cell calls
# asyncio.run() from inside the notebook.
nest_asyncio.apply()

In [None]:
import os
import asyncio

# Directory to save individual output files (process_file reads this as a global).
# The path is relative to the notebook's working directory.
output_dir = "output_relevant_passages"
os.makedirs(output_dir, exist_ok=True)  # Create output directory if it doesn't exist


async def process_file(filename):
    """Analyse one text file from the ``tulips`` folder and save its relevant passages.

    The file is analysed for the phenomenon "tulips" with a 120-second time limit.
    Fragments judged relevant are written, one per line, to
    ``<output_dir>/<filename>_relevant.txt``; no file is written when nothing is
    relevant.

    Args:
        filename: Name of a file inside the ``tulips`` folder.

    Returns:
        None. A timeout is reported on stdout and the file is skipped, so that one
        slow document does not abort the whole batch.
    """
    path = os.path.join("tulips", filename)
    try:
        # NOTE(review): `ai_read` is not imported in the visible cells -- confirm it is
        # in scope (an analyze_document function is also defined in this notebook).
        result = await asyncio.wait_for(ai_read.analyze_document(path, "tulips"), timeout=120)
    except asyncio.TimeoutError:
        print(f"Timed out after 120 seconds while analysing file {filename}")
        return

    relevant_passages = [e.original_sentence for e in result if e.judgement]
    if not relevant_passages:
        print(f"No relevant passages found in file {filename}")
        return

    output_file_path = os.path.join(output_dir, f"{filename}_relevant.txt")
    with open(output_file_path, "w", encoding="utf-8") as outfile:
        outfile.writelines(f"{passage}\n" for passage in relevant_passages)
                
async def main(max_concurrent_files=10):
    """Run process_file over every entry of the ``tulips`` folder.

    Args:
        max_concurrent_files: Upper bound on files processed at the same time.
            Starting every file at once makes them all compete for the API while
            each file's time limit is already running.

    Returns:
        None. A file whose processing raises is reported on stdout; the other
        files still run to completion.
    """
    filenames = os.listdir("tulips")
    file_slots = asyncio.Semaphore(max_concurrent_files)

    async def run_one(filename):
        # process_file only starts once a slot is free
        async with file_slots:
            await process_file(filename)

    # return_exceptions=True keeps a single failure from aborting the batch and
    # leaving the remaining tasks orphaned when the event loop shuts down.
    outcomes = await asyncio.gather(*(run_one(filename) for filename in filenames),
                                    return_exceptions=True)
    for filename, outcome in zip(filenames, outcomes):
        if isinstance(outcome, BaseException):
            print(f"Failed to process file {filename}: {outcome!r}")

# Start the batch. asyncio.run() is used from inside the notebook; the
# nest_asyncio.apply() call in an earlier cell is what permits that when an event
# loop is already running.
if __name__ == "__main__":
    asyncio.run(main())

started with houw001para01.txt
started with zasy001borg01.txt
started with ampz001besc01.txt
started with hoof002jans02.txt
started with nieu001soph03.txt
started with born006gees01.txt
started with aker002clee01.txt
started with bors001denb01.txt
started with koni001acha01.txt
started with breu004cupi01.txt
started with nier007opre01.txt
started with momm004brab02.txt
started with goed012meta01.txt
started with cats001maec01.txt
started with fort007gees01.txt
started with zach001bruy01.txt
started with rula001saty01.txt
started with hond001dape02.txt
started with smid018sinn01.txt
started with krul001amst01.txt
started with brug061chri01.txt
started with uile003chri03.txt
started with vrol014matr01.txt
started with harm001suyv01.txt
started with vaer010haar01.txt
started with rode001casa01.txt
started with oost026ryme01.txt
started with dyck014oude01.txt
started with vict001goli01.txt
started with else006lacc01.txt
started with goos015nieu01.txt
started with lope001dull01.txt
started 

  0%|          | 0/1326 [00:00<?, ?it/s]
[A

[A[A


[A[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A













[A[A[A[A[A[A[A[A[A[A[A[A[A[A














[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[AException ignored in: <function Task.__del__ at 0x7fa7ccac3c40>
Traceback (most recent call last):
  File "/usr/lib/python3.11/asyncio/tasks.py", line 142, in __del__
    super().__del__()
  File "/usr/lib/python3.11/asyncio/futures.py", line 91, in __del__
    def __del__(se

In [2]:
import os

# Count the number of files in the tulips folder
corpus_entries = os.listdir('tulips')
num_files = len(corpus_entries)
print(f"Number of files in tulips: {num_files}")

# Add up the whitespace-separated words of every file in the folder
total_word_count = 0

for filename in corpus_entries:
    file_path = os.path.join('tulips', filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        total_word_count += len(file.read().split())

print(f"Total number of words in all text files: {total_word_count}")

Number of files in tulips: 320
Total number of words in all text files: 14533127


In [5]:
import os

# Running total of whitespace-separated words over all relevant-passage files
total_word_count_relevant_passages = 0

# NOTE(review): this reads '../output_relevant_passages', whereas the cell that writes
# the passages uses 'output_relevant_passages' -- confirm the working directory.
for filename in os.listdir('../output_relevant_passages'):
    with open(os.path.join('../output_relevant_passages', filename), 'r', encoding='utf-8') as file:
        total_word_count_relevant_passages += len(file.read().split())

print(f"Total number of words in all relevant passages: {total_word_count_relevant_passages}")

Total number of words in all relevant passages: 23618


In [6]:
import os

# Count the number of files in the ../output_relevant_passages folder
# (os.listdir returns every directory entry, so each entry counts as one file)
num_files_relevant_passages = len(os.listdir('../output_relevant_passages'))
print(f"Number of files in ../output_relevant_passages: {num_files_relevant_passages}")

Number of files in ../output_relevant_passages: 144
