In [4]:
import os
import json
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [6]:
# Will be used as Training Data
apple_loader = PyPDFLoader("Apple_Environmental_Progress_Report_2024.pdf")
apple_pages = apple_loader.load()

apple_document = ""
for i in range(len(apple_pages)):
    apple_document += apple_pages[i].page_content

In [7]:
# Split PDFs
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    model_name="gpt-4",
    chunk_size=800,
    chunk_overlap=400,
)

apple_chunks = text_splitter.split_text(apple_document)

In [None]:
label_template = """
You are an AI assistant tasked with generating a single, realistic question-answer pair based on a given document. The question should be something a user might naturally ask when seeking information contained in the document.

Given: {chunk}

Instructions:
1. Analyze the key topics, facts, and concepts in the given document, choose one to focus on.
2. Generate twenty similar questions that a user might ask to find the information in this document that does NOT contain any company name.
3. Use natural language and occasionally include typos or colloquialisms to mimic real user behavior in the question.
4. Ensure the question is semantically related to the document content WITHOUT directly copying phrases.
5. Make sure that all of the questions are similar to eachother. I.E. All asking about a similar topic/requesting the same information.

Output Format:
Return a JSON object with the following structure:
```json
{{
  "question_1": "Generated question text",
  "question_2": "Generated question text",
  ...
}}
```

Be creative, think like a curious user, and generate your 20 similar questions that would naturally lead to the given document in a semantic search. Ensure your response is a valid JSON object containing only the questions.

"""
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
label_prompt = ChatPromptTemplate.from_template(label_template)
llm = ChatOpenAI(temperature=1.0, model="gpt-4o-mini")

label_chain = label_prompt | llm | JsonOutputParser()

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable