In [None]:
import os
# --- User configuration: replace both placeholders before running the notebook ---

# Folder that holds the prepared PDF files.
# Example on windows: r"C:\Users\Name\Desktop\documents"
pdf_directory = "Your_pdf_directory"

# OpenAI API key, exposed through the environment for the rest of the notebook.
# Do not commit or share the notebook with a real key filled in.
os.environ["OPENAI_API_KEY"] = "Your_GPT_key_here"

In [None]:
from dokument import Dokument
import os
import fitz 

# For getting DOI from file name, modify this to the naming convention of your prepared PDF folder
def get_doi_from_filename(filename, prefix="10.1145/"):
    """Build a DOI string from a PDF file name.

    The DOI is ``prefix`` followed by the file name without its last
    extension, e.g. ``"3313831.3376234.pdf"`` -> ``"10.1145/3313831.3376234"``.

    Args:
        filename: File name (or path) of the PDF. Any directory part is
            dropped so that only the base name contributes to the DOI.
        prefix: DOI prefix placed in front of the file stem. Defaults to
            ``"10.1145/"``; pass another value for a different naming scheme.

    Returns:
        The DOI as a string.
    """
    # basename() keeps a full path from leaking directory names into the DOI
    stem = os.path.splitext(os.path.basename(filename))[0]
    return prefix + stem

# Initialize the list to hold Dokument objects
dokument_list = []
# File counter to keep track of number of files processed
# (stays 0 when the directory contains no PDF files)
file_counter = 0

# os.listdir() returns entries in arbitrary order, so sort them to make the
# numbering and the resulting list reproducible between runs. The extension
# is compared case-insensitively so files such as "paper.PDF" are not skipped.
pdf_filenames = sorted(
    name for name in os.listdir(pdf_directory) if name.lower().endswith(".pdf")
)

# Loop through all PDF files in the directory
for file_counter, filename in enumerate(pdf_filenames, start=1):
    file_path = os.path.join(pdf_directory, filename)
    doi = get_doi_from_filename(filename)

    # Extract raw text from every page of the document; joining once avoids
    # rebuilding the string on every page
    with fitz.open(file_path) as doc:
        raw_data = "".join(page.get_text() for page in doc)

    # Debug lines
    print(f"Processing PDF file number {file_counter}: {filename}")
    print("DOI:", doi)
    print("Extracted text:", raw_data[:100])  # Print first 100 characters for debug
    print("\n")

    # Create a Dokument object with DOI and raw data attributes and add it to the list
    dokument = Dokument(DOI=doi, raw_data=raw_data)
    dokument_list.append(dokument)

In [None]:
# Debug check: report how many documents made it through text extraction
extracted_count = len(dokument_list)
print(extracted_count, "documents with raw data extracted")

In [None]:
def count_tokens(dokument_list, model, max_tokens=127000):
    """Count the tokens of every document and report the ones above a limit.

    Each document's ``raw_data`` is measured with ``model.get_num_tokens`` and
    the result is stored on the document as ``token_count``. Documents whose
    count is greater than ``max_tokens`` are listed on standard output. The
    documents themselves are not removed here; filtering is left to the caller.

    Args:
        dokument_list: Iterable of objects with ``raw_data`` and ``DOI``
            attributes. Each object gets a ``token_count`` attribute assigned.
        model: Object providing a ``get_num_tokens(text)`` method.
        max_tokens: Threshold above which a document is reported as too long.
            Defaults to 127000.

    Returns:
        None. Results are printed and stored on the documents.
    """
    long_documents_found = False
    print(f"Documents with more than {max_tokens:,} tokens:")
    for i, document in enumerate(dokument_list):
        # Count the number of tokens in the document with the language model
        num_tokens = model.get_num_tokens(document.raw_data)
        document.token_count = num_tokens
        if num_tokens > max_tokens:
            long_documents_found = True
            # Debug message for documents exceeding the token limit
            print(f"Document {i + 1}: DOI = {document.DOI}, Tokens = {num_tokens}")
    if not long_documents_found:
        print(f"No documents have more than {max_tokens:,} tokens.")
    # Print information about the token limit of the GPT-4 model.
    # Built from plain literals so no stray quote or source indentation ends
    # up in the printed message.
    print(
        "\ngpt-4-0125-preview model only supports up to 128,000 tokens.\n"
        f"Documents with raw data exceeding {max_tokens:,} will not be added "
        "to document list to be used for research.\n"
    )


In [None]:
# Token counting is delegated to the chat model, so one has to be created first
from langchain_openai import ChatOpenAI

gpt4_model = ChatOpenAI(temperature=0, model_name="gpt-4-0125-preview")
count_tokens(dokument_list, gpt4_model)

# Keep only the documents whose stored token count is within the limit
dokument_list = list(
    filter(lambda entry: int(entry.token_count) <= 127000, dokument_list)
)
# debug
print(
    "After filtering out documents with more than 127000 tokens,",
    len(dokument_list),
    "documents remaining",
)

In [None]:
# Persist the filtered dokument_list to disk so later steps can reload it
import pickle

# Written to the current working directory in binary mode, as pickle requires
with open("dokument_list.pkl", mode="wb") as pickle_file:
    pickle.dump(dokument_list, pickle_file)