In [72]:
###################
#### Prompts   ####
###################

prompts = {}
prompts['identifier'] = 'Fasse den Text zusammen.'
# ...

In [73]:
###################
#### Config    ####
###################
OPENAU_API_KEY = ''
GPT_MODEL = 'gpt-4-1106-preview'

INPUT_FOLDER = 'input'
OUTPUT_FOLDER = 'output'

COLUMNS_OF_INTEREST = [
    'Link zur Sitzung',
    'Jahr',
    'Sitzungsdatum',
    'Download_Sitzung',
    'Download_Protokoll',
    'Person',
    'Partei',
    'Download Traktandumsdokumente',
    'Link zum Traktandum',
    'Traktandentitel',
    'Volltext'
]
FILENAME_SOURCE_COLUMN = 'Sitzungsdatum'
COMPLETION_SOURCE_COLUMN = 'Volltext'

In [74]:
import datetime
import io
###################
#### Functions ####
###################

from openai import OpenAI
import os
import simplejson
import pandas as pd

from dotenv import load_dotenv
load_dotenv()


client = OpenAI(
    api_key=OPENAU_API_KEY or os.getenv('OPENAI_API_KEY'),
)

def convert_to_datetime(date_str):
    try:
        if (isinstance(date_str, datetime.datetime)):
            return date_str
        dt = datetime.datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")
        return dt
    except:
        return date_str

def convert_datetime_to_str(date):
    try:
        if (isinstance(date, datetime.datetime)):
            return date.strftime("%Y%m%d%H%M%S")
        return date
    except:
        return date

def build_message(prompt, input_text):
    messages = [
        {"role": "user", "content": prompt},
        {"role": "user", "content": f'"{input_text}"'},
    ]
    return messages

def run_completion(prompts, input_text):
    result = {}
    for key, value in prompts.items():
        print(f'Run completion: "{key}"')
        try:
            completion_result = client.chat.completions.create(
                model=GPT_MODEL,
                messages=build_message(value, input_text)
            )
            assistant = completion_result.choices[0].message.content if completion_result else ''
            assistant_result = assistant.strip('"').strip('\'').strip()
            result[key] = assistant_result
        except Exception as e:
            error_message = f'Error occurred while complete "{key}": {str(e)}'
            print(error_message)
    return result

def write_to_output(filename, data):
    current_folder = os.getcwd()
    output_folder_path = os.path.join(current_folder, OUTPUT_FOLDER)
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    file_path = os.path.join(output_folder_path, f'{filename}.json')
    if os.path.exists(file_path):
        os.remove(file_path)

    print(f'Write to file "{filename}"')
    with io.open(file_path, 'w', encoding='utf-8') as outfile:
        simplejson.dump(data, outfile, ensure_ascii=False, default=str, ignore_nan=True)

def read_input_file():
    file_path = None
    current_folder = os.getcwd()
    input_folder = os.path.join(current_folder, INPUT_FOLDER)
    supported_extensions = ['.xlsx', '.xls', '.csv']
    if os.path.exists(input_folder) and os.path.isdir(input_folder):
        for file in os.listdir(input_folder):
            if any(file.endswith(ext) for ext in supported_extensions):
                file_path = os.path.join(input_folder, file)
    if file_path is None:
        raise IOError('Input file not found.')
    
    if file_path.endswith('.xlsx') or file_path.endswith('.xls'):
        df = pd.read_excel(file_path)
    elif file_path.endswith('.csv'):
        df = pd.read_csv(file_path)
    else:
        raise ValueError('Unsupported file type. Please use a .xlsx, .xls or .csv file.')

    df_selected = df[COLUMNS_OF_INTEREST]

    dict_list = df_selected.to_dict(orient='records')

    return dict_list


In [75]:
###################
#### Execution ####
###################

print('>>> Start <<<')
input_data = read_input_file()
for data in input_data:
    print('#############################################')
    print(f'Processing meeting date: {data[FILENAME_SOURCE_COLUMN]}')
    print('#############################################')
    result = {}
    result['meta'] = data
    text = data[COMPLETION_SOURCE_COLUMN]
    date = convert_to_datetime(data[FILENAME_SOURCE_COLUMN])
    if text and text.strip():
        result['completion'] = run_completion(prompts, text)
    write_to_output(convert_datetime_to_str(date), result)
print('>>> End <<<')


>>> Start <<<
#############################################
Processing meeting date: 2008-03-18 00:00:00
#############################################
Run completion: "identifier"
Write to file "20080318000000"
#############################################
Processing meeting date: 2005-02-15 00:00:00
#############################################
Run completion: "identifier"
Write to file "20050215000000"
#############################################
Processing meeting date: 2016-02-23 00:00:00
#############################################
Run completion: "identifier"
Write to file "20160223000000"
#############################################
Processing meeting date: 2016-02-23 00:00:00
#############################################
Run completion: "identifier"
Write to file "20160223000000"
#############################################
Processing meeting date: 2018-05-08 00:00:00
#############################################
Run completion: "identifier"
Write to file "20180508000000"
#