In [None]:
!pip install --upgrade openai httpx

Collecting openai
  Downloading openai-1.63.2-py3-none-any.whl.metadata (27 kB)
Downloading openai-1.63.2-py3-none-any.whl (472 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 472.3/472.3 kB 11.9 MB/s eta 0:00:00
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.61.1
    Uninstalling openai-1.61.1:
      Successfully uninstalled openai-1.61.1
Successfully installed openai-1.63.2


In [None]:
from IPython import get_ipython
from IPython.display import display
# %%
!pip install --upgrade openai httpx
# %%
import openai
import os
import random
from datetime import datetime
import json

# Set OpenAI API Key
# Prefer exporting OPENAI_API_KEY in the environment over pasting a secret into the notebook.
# If the placeholder below is left untouched, a key already present in the environment is used
# instead of being overwritten by the placeholder text.
API_KEY_PLACEHOLDER = 'INSERT API KEY HERE'
api_key = 'INSERT API KEY HERE'  # Replace with your actual API key (or leave as-is and set OPENAI_API_KEY)
if api_key == API_KEY_PLACEHOLDER:
    # `or` (rather than a .get default) also ignores an empty OPENAI_API_KEY value.
    api_key = os.environ.get('OPENAI_API_KEY') or api_key
os.environ['OPENAI_API_KEY'] = api_key

# Create the client with the API key
client = openai.OpenAI(api_key=api_key)

# Existing assistant ID (read as a module-level global by call_assistant)
assistant_id = 'INSERT ID HERE'  # Replace with your assistant ID

# Initialize variables
EPC_extraction = ""
thread_messages = []  # One tracking record per assistant attempt; filled by process_files

# List all PDF files within the folder and its subfolders
def find_pdf_files(folder):
    """Return the sorted full paths of every PDF file under `folder`, searched recursively.

    The extension check is case-insensitive so files named `*.PDF` are not skipped.
    The result is sorted because `os.walk` yields entries in a filesystem-dependent
    order, which would otherwise make "the first N files" differ between machines.

    Args:
        folder: Directory to search. A missing directory yields an empty list
            (`os.walk` ignores the listing error by default).

    Returns:
        A list of paths, each built with `os.path.join(root, name)`.
    """
    return sorted(
        os.path.join(root, name)
        for root, _, names in os.walk(folder)
        for name in names
        if name.lower().endswith('.pdf')
    )


folder_path = 'INSERT YOUR FOLDER PATH HERE' # Replace with your folder path
pdf_files = find_pdf_files(folder_path)


# Function to call the assistant synchronously
def call_assistant(client, thread_id, messages):
    """Post a user message to a thread and run the assistant on it until polling ends.

    Args:
        client: Object exposing `beta.threads.messages.create` and
            `beta.threads.runs.create_and_poll`.
        thread_id: Identifier of the thread to post to and run.
        messages: Content of the user message to add to the thread.

    Returns:
        Whatever `create_and_poll` returns, or None if either call raised
        (the error is printed, not propagated).

    Note:
        The assistant is selected through the module-level `assistant_id`.
    """
    try:
        client.beta.threads.messages.create(thread_id=thread_id, role="user", content=messages)
        return client.beta.threads.runs.create_and_poll(thread_id=thread_id, assistant_id=assistant_id)
    except Exception as err:
        # Best-effort: callers treat None as "no run".
        print(f"Error in call_assistant: {err}")
        return None

# Function to extract text content from the response
def extract_text_content(response):
    """Concatenate the text of every assistant message in `response`.

    Args:
        response: Iterable of messages; each is expected to expose `role` and a
            `content` iterable of blocks with `type` and, for text blocks, `text.value`.

    Returns:
        The text blocks of all assistant messages, each followed by a newline.
        If iteration fails part-way, the error is printed and the text gathered
        so far is returned (an empty string if nothing was gathered).
    """
    pieces = []  # Collected outside the try so partial results survive an error
    try:
        for msg in response:
            if msg.role != "assistant":
                continue
            for block in msg.content:
                if block.type != "text":
                    continue
                pieces.append(block.text.value + "\n")
    except Exception as err:
        print(f"Error in extract_text_content: {err}")
    return "".join(pieces)

# Process each PDF file using the assistant's file search capability
def _serialize_messages(message_page):
    """Convert the messages of a thread into plain, JSON-serialisable dictionaries.

    Each message becomes `{"role": ..., "content": [...]}`; every content block keeps
    its `type` and, for "text" blocks only, its `text.value` (None otherwise).
    """
    return [
        {
            "role": message.role,
            "content": [
                {
                    "type": content_block.type,
                    "value": content_block.text.value if content_block.type == "text" else None  # Handle other content types if needed
                } for content_block in message.content
            ]
        }
        for message in message_page
    ]


def _build_record(pdf_file, thread_id, extraction):
    """Build one tracking entry for `thread_messages`, stamped with the current local time."""
    return {
        'timestamp': datetime.now().isoformat(),
        'pdf_file': pdf_file,  # Full path as collected in pdf_files
        'thread_id': thread_id,
        'EPC_extraction': extraction
    }


def process_files(max_files=5, repeats=5):
    """Send the extraction prompt to the assistant for the first PDFs in `pdf_files`.

    For each of the first `max_files` entries of the module-level `pdf_files` list,
    the prompt is sent `repeats` times, each time on a freshly created thread. Every
    attempt appends one record to the module-level `thread_messages` list:

    * the serialised thread messages when the run status is "completed";
    * 'unprocessed' when the run is missing or has another status;
    * "Failed with exception: ..." when the attempt raises.

    After each file, the whole list is written to `thread_messages.json` in the
    current working directory, so earlier results survive a later crash.

    Args:
        max_files: Maximum number of files taken from the start of `pdf_files`
            (non-negative; default 5).
        repeats: Number of times the prompt is sent per file (default 5).

    Returns:
        None. Results are delivered through `thread_messages`, the JSON file and
        printed output.
    """
    # Check if pdf_files is empty before selecting
    if not pdf_files:
        print("No PDF files found in the specified folder.")
        return  # Nothing is written to disk in this case

    # Select up to max_files files from the start of the list
    selected_files = pdf_files[:max_files]

    processed_files = set()  # Files for which at least one run completed
    for pdf_file in selected_files:
        for _ in range(repeats):
            try:
                # Create a new thread for each attempt so repetitions do not share context
                thread = client.beta.threads.create()

                # Create message to assistant
                # NOTE(review): the prompt only names the local path; nothing in this function
                # uploads the file - confirm the assistant already has access to the document.
                messages = f"please read '{pdf_file}', and complete the first task given in the instructions. Do not move forward to the 2nd task [ .pdf] is the DDC editing document. In the document, DDC is commonly mentioned in the form of class number and human readable labels such as ‘613.04 Personal health of people by gender, sex, or age group"

                # Call the assistant synchronously
                run = call_assistant(client, thread.id, messages)

                if run and run.status == "completed":
                    response = client.beta.threads.messages.list(thread_id=thread.id)
                    messages_list = _serialize_messages(response)

                    # Debugging: Print the full response for inspection
                    print("Full response from assistant:")
                    for message in messages_list:
                        print(message)

                    thread_messages.append(_build_record(pdf_file, thread.id, messages_list))
                    # Count the file only once a run has actually completed for it
                    processed_files.add(pdf_file)
                else:
                    print(f"Run not completed for file {pdf_file}, status: {run.status if run else 'No run'}")
                    thread_messages.append(_build_record(pdf_file, 'N/A', 'unprocessed'))

            except Exception as e:
                print(f"Error processing file {pdf_file}: {e}")
                thread_messages.append(_build_record(pdf_file, 'N/A', f"Failed with exception: {e}"))

        # Save thread messages for tracking (rewritten after every file as a checkpoint)
        with open('thread_messages.json', 'w') as f:
            json.dump(thread_messages, f, indent=4)

    print(f"EPC_extraction creation and update process completed for {len(processed_files)} files.")
    # Print the first thread_message for verification
    if thread_messages:
        print("First thread_message result:")
        print(json.dumps(thread_messages[0], indent=4))

# Run the processing function
if __name__ == "__main__":
    process_files()


# Load data from thread_messages.json
# process_files() returns before writing anything when no PDFs are found, so the file may be
# absent; in that case keep the in-memory thread_messages list instead of crashing.
if os.path.exists('thread_messages.json'):
    with open('thread_messages.json', 'r') as f:
        thread_messages = json.load(f)
else:
    print("thread_messages.json not found; exporting the in-memory thread_messages instead.")

# Extract relevant data into a list of dictionaries
data = [
    {
        'Timestamp': thread_message['timestamp'],
        'PDF File': thread_message['pdf_file'],
        'EPC_extraction': thread_message['EPC_extraction']
    }
    for thread_message in thread_messages
]

# Create a pandas DataFrame
import pandas as pd
# Explicit columns keep the CSV header present even when there are no records.
df = pd.DataFrame(data, columns=['Timestamp', 'PDF File', 'EPC_extraction'])

# Save DataFrame to a CSV file
df.to_csv('INSERT FOLDER PATH', index=False)  # Replace with the full path of the output .csv file (a file path, not just a folder)

Full response from assistant:
{'role': 'assistant', 'content': [{'type': 'text', 'value': 'From the document "EPC 144-S30.1 Transvestism and cross dressing.pdf," the following warrants were extracted, along with their classifications as specified:\n\n### Warrant Classifications\n\n1. **Document**\n   - Document: EPC 144-S30.1 Transvestism and cross dressing\n   \n2. **Literature**\n   - Anupama, M et al. “Transvestism as a Symptom: A Case Series.” Indian Journal of Psychological Medicine, vol. 38, no. 1 (2016): 78-80. doi:10.4103/0253-7176.175131. [https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4782454/](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4782454/)\n   - Brown, George R. “Transvestic Disorder - Psychiatric Disorders.” Merck Manuals Professional Edition, 18 Apr. 2023. [https://www.merckmanuals.com/professional/psychiatric-disorders/paraphilic-disorders/transvestic-disorder](https://www.merckmanuals.com/professional/psychiatric-disorders/paraphilic-disorders/transvestic-disorder)