In [19]:
import os
from pdfminer.high_level import extract_text
from openai import OpenAI
import pandas as pd

In [21]:
client = OpenAI(
    api_key="KEY",
)

In [11]:
folder_path = '/Users/thomaslee/Desktop/G1-2/Studio/CommunityBoard/monthly-full-board-meeting-minutes'
files = os.listdir(folder_path)
print(files)

['FBM_Minutes_03-04-15.pdf', '20-07-28.pdf', '16-05-24.pdf', 'FBM_Minutes_09-05-26.pdf', '18-02-27.pdf', '11-01-25.pdf', '10-07-27.pdf', 'FBM_Minutes_04-05-18.pdf', 'FBM_Minutes_08-03-25.pdf', '21-06-22.pdf', 'FBM_Minutes_06-01-17.pdf', '21-04-27.pdf', 'FBM_Minutes_08-11-25.pdf', '20-02-25.pdf', '20-12-22.pdf', '14-01-28.pdf', '11-03-22.pdf', 'july-2014.pdf', 'june-2014.pdf', '13-05-28.pdf', 'FBM_Minutes_06-06-20.pdf', '12-06-26.pdf', '11-11-22.pdf', '23-09-26.pdf', '20-10-27.pdf', '22-06-28.pdf', 'FBM_Minutes_06-04-18.pdf', 'june-2015.pdf', '15-10-27.pdf', '13-12-19.pdf', '23-05-23.pdf', 'FBM_Minutes_05-01-18.pdf', '10-05-25.pdf', 'FBM_Minutes_06-11-15.pdf', '24-01-23.pdf', '18-12-19.pdf', '16-9-27.pdf', 'FBM_Minutes_04-02-17.pdf', '19-04-23.pdf', 'FBM_Minutes_03-06-17.pdf', '17-01-24.pdf', '16-07-26.pdf', 'FBM_Minutes_04-07-27.pdf', '22-03-22.pdf', '18-10-23.pdf', '10-12-16.pdf', '22-11-22.pdf', '09-12-15.pdf', '23-07-25.pdf', 'FBM_Minutes_07-12-18.pdf', '11-06-28.pdf', 'FBM_Minutes_

In [96]:
def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path)

def split_text_into_chunks(text, words_per_chunk=1000):
    words = text.split()
    for i in range(0, len(words), words_per_chunk):
        yield ' '.join(words[i:i+words_per_chunk])

def query_openai(text, prompt):
    GPT_MODEL = "gpt-3.5-turbo" #"gpt-3.5-turbo-1106"
    messages = [
        {"role": "system", "content": 'You are a community board observer.'
        },
        {"role": "user", "content": f"{text}\n\n###\n\n{prompt}"},
    ]
    response = client.chat.completions.create(
        model=GPT_MODEL,
        messages=messages,
        temperature=0
    )
    response_message = response.choices[0].message.content
    return(response_message )

In [115]:
def process_pdf_file(pdf_path):
    filename = os.path.basename(pdf_path)
    text = extract_text_from_pdf(pdf_path)
    data = []

    prompts = [
        ("Key Terms", "Return two key terms from the meeting minutes."),
        ("Compliance", "Did the meeting comply with the Open Meetings Law and NYC Charter? If so, just return Yes and nothing else, if not, return the reason."),
        ("Intersting", "Return two key terms about anyting you noticed interesting/unusual/unexpected for a community board meeting.")
    ]

    for chunk in split_text_into_chunks(text):
        row = {'Filename': filename}  # Start with the filename
        for column_name, prompt_text in prompts:
            response = query_openai(chunk, prompt_text)
            row[column_name] = response  # Add response under the specific column for each prompt
        data.append(row)

    return pd.DataFrame(data)

In [114]:
def process_all_pdfs_in_folder(folder_path):
    all_data = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            try:
                df = process_pdf_file(pdf_path)
                all_data.append(df)
            except Exception as e:
                print(f"Failed to process {filename}: {e}")
    return pd.concat(all_data, ignore_index=True)

In [116]:
# Run the function
folder_path = '/Users/thomaslee/Desktop/G1-2/Studio/CommunityBoard/monthly-full-board-meeting-minutes'
result_df = process_all_pdfs_in_folder(folder_path)

In [117]:
# Save the result to a CSV file
print(result_df)

                     Filename  \
0    FBM_Minutes_03-04-15.pdf   
1    FBM_Minutes_03-04-15.pdf   
2    FBM_Minutes_03-04-15.pdf   
3    FBM_Minutes_03-04-15.pdf   
4                20-07-28.pdf   
..                        ...   
690  FBM_Minutes_07-07-31.pdf   
691  FBM_Minutes_07-07-31.pdf   
692              17-06-27.pdf   
693              17-06-27.pdf   
694              17-06-27.pdf   

                                             Key Terms Compliance  \
0    1. South Street Seaport C62-A – 120ft height c...        Yes   
1    1. Variance for Local Children\n2. Battery Par...        Yes   
2                 1. Liberty Bonds\n2. Jersey Barriers        Yes   
3        1. ABSENT: J. MORRONE\n2. PRESENT: NOTARO, A.        Yes   
4      1. Commercial rent tax\n2. Mortgage forbearance        Yes   
..                                                 ...        ...   
690       1. Rent Stabilization\n2. Affordable Housing        Yes   
691  1. World Trade Center (WTC) Construction Site\

In [118]:
result_df.to_csv('output.csv', index=False)