# 0. Install required libraries and define constants

In [None]:
pip install openai

In [None]:
import os
from getpass import getpass

# Ask for the key interactively instead of pasting it into the notebook, so the
# secret is never stored in the saved .ipynb file. A key that is already
# present in the environment is left untouched.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

# 1. Set up the OpenAI client

In [None]:
from openai import OpenAI
# No API key is passed explicitly; the client is expected to pick it up from
# the OPENAI_API_KEY environment variable configured earlier in this notebook.
client = OpenAI()

Set up a completion request, which is essentially a history of messages that the model uses to generate a response.

In [None]:
# Conversation history, oldest message first. It ends with an open user
# question for the model to answer.
demo_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of Germany?"},
    {"role": "assistant", "content": "The capital of Germany is Berlin. Berlin has around 3 Million inhabitants. The most important attraction is the 'Brandenburger Tor'"},
    {"role": "user", "content": "What is the capital of Italy?"},
]

response = client.chat.completions.create(model="gpt-4o", messages=demo_messages)

Print the response from the GPT-4o model.

In [None]:
# Take the text of the first returned choice and print it.
first_reply = response.choices[0].message.content
print(first_reply)

# 2. Load the transcripts and perform the same action for each

Define a folder in which to save GPT-4o's output for each transcript:

In [None]:
import os

# Folder that receives the CSV files with the model output.
LLM_OUTPUT_FOLDER = "llm_ouput"

# Try to create the folder and react to the outcome, rather than checking for
# existence first: this avoids a race between the check and the creation.
try:
    os.makedirs(LLM_OUTPUT_FOLDER)
except FileExistsError:
    # Something already occupies the path. If it is not a directory (e.g. a
    # regular file), later writes would fail, so surface the error right away.
    if not os.path.isdir(LLM_OUTPUT_FOLDER):
        raise
    print(f"Folder '{LLM_OUTPUT_FOLDER}' already exists.")
else:
    print(f"Folder '{LLM_OUTPUT_FOLDER}' created.")

Create a function that takes a transcript and creates a prompt object for GPT-4o:

In [None]:
def create_prompt_messages(transcript):
    """Build the chat message list that asks the model to extract people.

    Args:
        transcript: Text of a video transcript. Leading and trailing
            whitespace is stripped before it is embedded in the prompt.

    Returns:
        A list with a single ``{"role": "user", "content": ...}`` dict whose
        content is the extraction instruction followed by the quoted transcript.
    """
    cleaned = transcript.strip()
    instruction = (
        "Extract all people mentioned in the following video transcript as a comma-separated list. "
        "Only include famous or well-known people. "
        "DO NOT PREPEND ANYTHING BEFORE THE LIST! \n\n"
    )
    user_message = {
        "role": "user",
        "content": instruction + f"Here is the transcript: '{cleaned}'",
    }
    return [user_message]

Define a function to write GPT-4o's results to a CSV file:

In [None]:
import csv

def write_result_to_file(llm_results, output_file_name):
    """Write the collected results to a CSV file inside LLM_OUTPUT_FOLDER.

    Args:
        llm_results: Iterable of dicts with the keys "file_name",
            "transcript" and "llm_result"; each dict becomes one CSV row.
        output_file_name: Name of the CSV file to create (or overwrite)
            inside LLM_OUTPUT_FOLDER.
    """
    column_names = ["file_name", "transcript", "llm_result"]
    csv_file = f"{LLM_OUTPUT_FOLDER}/{output_file_name}"

    # newline='' leaves line-ending handling to the csv module.
    with open(csv_file, mode='w', newline='', encoding='utf-8') as handle:
        csv_writer = csv.DictWriter(
            handle,
            fieldnames=column_names,
            quoting=csv.QUOTE_ALL,
            escapechar='\\',
        )
        # Header first, then every result row in order.
        csv_writer.writeheader()
        csv_writer.writerows(llm_results)

    print(f"Data successfully written to {csv_file}")

    

Iterate through all files in the transcript folder and process with GPT-4o:

In [None]:
import os
import glob

# Define the folder path
folder_path = "transcripts"

# Get a list of all text files in the folder. glob returns matches in
# arbitrary order, so sort them to process the same subset on every run.
transcript_files = sorted(glob.glob(os.path.join(folder_path, '*.txt')))

rows = []

counter = 0
max_transcripts = 10

# Iterate through the list of text files
for file_path in transcript_files:

    # Leave the loop once max_transcripts files have been handled.
    # (">=" rather than ">": the counter starts at 0, so ">" would let
    # max_transcripts + 1 files through.)
    if counter >= max_transcripts:
        break

    # Read the whole transcript; the file is closed again before the API call.
    with open(file_path, 'r', encoding='utf-8') as file:
        transcript = file.read()
    print(f"File: {file_path} with length {len(transcript)}")

    # Create the messages object using the previously defined function
    messages = create_prompt_messages(transcript)

    try:
        output = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
        )
        llm_result = output.choices[0].message.content
        row = {"file_name": file_path, "transcript": transcript, "llm_result": llm_result}
        rows.append(row)
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        # A failed file still counts towards max_transcripts.
        print(f"Error: {e}")

    # Increase counter
    counter = counter + 1

write_result_to_file(rows, "extracted_people_gpt4o.csv")