In [14]:
import os
import PyPDF2
import openai
import faiss
from fpdf import FPDF
import tiktoken
from dotenv import load_dotenv
import json
from langchain_openai.embeddings.base import OpenAIEmbeddings
import numpy as np

In [15]:
load_dotenv()

True

In [16]:
def extract_text_from_pdfs(pdf_folder):
    documents = []
    for filename in os.listdir(pdf_folder):
        if filename.endswith('.pdf'):
            # Assume filename format: CompanyName_Year.pdf
            pdf_name = os.path.splitext(filename)[0]  # Remove .pdf extension
            parts = pdf_name.split('_')
            if len(parts) >= 2:
                company = '_'.join(parts[:-1])
                year = parts[-1]
            else:
                company = pdf_name
                year = 'Unknown'
            pdf_path = os.path.join(pdf_folder, filename)
            with open(pdf_path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                text = ''
                for page in reader.pages:
                    text += page.extract_text()
            # Store text with metadata
            doc = {
                'company': company,
                'year': year,
                'text': text
            }
            documents.append(doc)
    return documents

In [17]:
pdf_folder = 'data/sample_reports'
documents = extract_text_from_pdfs(pdf_folder)

In [18]:
documents[1]

{'company': 'Apple',
 'year': '2023',

In [19]:
def split_text_into_chunks(doc, max_tokens=500):
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(doc['text'])
    chunks = []
    for i in range(0, len(tokens), max_tokens):
        chunk_tokens = tokens[i:i + max_tokens]
        decoded_chunk = tokenizer.decode(chunk_tokens)
        chunk = {
            'company': doc['company'],
            'year': doc['year'],
            'text': decoded_chunk
        }
        chunks.append(chunk)
    return chunks

In [20]:
all_chunks = []
for doc in documents:
    chunks = split_text_into_chunks(doc)
    all_chunks.extend(chunks)

In [21]:
def save_chunks(chunks, filepath):
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(chunks, f, ensure_ascii=False, indent=4)

def load_chunks(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        chunks = json.load(f)
    return chunks

In [32]:
chunks_file = 'data/chunks.json'
save_chunks(all_chunks, chunks_file)
print(f"Saved chunks to {chunks_file}")

Saved chunks to data/chunks.json


In [23]:
def get_embeddings(chunks):
    embeddings_model = OpenAIEmbeddings()  
    texts = [chunk['text'] for chunk in chunks]
    embeddings = embeddings_model.embed_documents(texts)
    ids = []
    for idx, chunk in enumerate(chunks):
        ids.append(idx)
        chunk['id'] = idx 
    return embeddings, ids, embeddings_model



In [24]:
embeddings, ids, embeddings_model = get_embeddings(all_chunks)

In [25]:
def save_embeddings(embeddings, ids, embeddings_path, ids_path):
    np.save(embeddings_path, embeddings)
    np.save(ids_path, ids)

def load_embeddings(embeddings_path, ids_path):
    embeddings = np.load(embeddings_path)
    ids = np.load(ids_path)
    return embeddings, ids

In [26]:
embeddings_file = 'data/embedings.npy'
ids_file = 'data/ids.npy'

save_embeddings(embeddings, ids, embeddings_file, ids_file)
print(f"Saved embeddings to {embeddings_file} and IDs to {ids_file}")

Saved embeddings to data/embedings.npy and IDs to data/ids.npy


In [27]:
def build_faiss_index(embeddings, ids):
    dimension = len(embeddings[0])
    index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
    index.add_with_ids(np.array(embeddings).astype('float32'), np.array(ids))
    return index

In [28]:
print("Building the FAISS index...")
index = build_faiss_index(embeddings, ids)


Building the FAISS index...


## Code to be run

In [35]:
all_chunks1 = load_chunks('data/chunks.json')
embeddings1, ids1 =load_embeddings('data/embedings.npy', 'data/ids.npy')
index = build_faiss_index(embeddings1, ids1)


In [38]:
def retrieve_relevant_chunks(prompt, chunks, index, embeddings_model, k=1):
    # Generate the embedding for the prompt using LangChain's OpenAIEmbeddings
    prompt_embedding = embeddings_model.embed_query(prompt)
    # Perform the similarity search in the FAISS index
    D, I = index.search(np.array([prompt_embedding]).astype('float32'), k)
    relevant_chunks = []
    for idx in I[0]:
        if idx != -1:
            chunk = next((c for c in chunks if c['id'] == idx), None)
            if chunk:
                relevant_chunks.append(chunk)
    return relevant_chunks


In [41]:
# Prompt for report generation
prompt = "Generate an ESG report focusing on environmental factors of Apple for 2018"
# Retrieve relevant chunks

embeddings_model1 = OpenAIEmbeddings()
print("Retrieving relevant chunks...")
relevant_chunks = retrieve_relevant_chunks(prompt, all_chunks1, index, embeddings_model1)
relevant_chunks

Retrieving relevant chunks...


[{'company': 'Apple',
  'year': '2019',
  'text': 'growth in our business.7   We adjusted our methodology for fiscal year 2017 to take into account Apple’s “At Home Advisors” program, \nwhere employees work remotely. \n8  Because energy efficiency measures have lasting benefits, energy efficiency savings are calculated cumulatively \nsince 2011. All efficiency measures are retired based on their effective useful lifetime as documented by the \nClaifornia Energy Commission. \n9  We calculate our progress toward 100 percent renewable energy on a calendar year basis. Beginning January 1, \n2018, 100 percent of the electricity we use to power our global facilities is sourced from renewable energy. \n10  We have adjusted previous years’ avoided emissions to remove double counting biogas emissons.\n11 Beginning in fiscal year 2017, “Data centers” includes water use at colocation facilities. \n12  Beginning in fiscal year 2017, “Corporate” includes water use at Apple distribution centers.\n13

In [45]:
import openai
from fpdf import FPDF
import datetime

# Set up OpenAI API key

# Function to generate report content via OpenAI API
def generate_report_content():
    # Make API request to OpenAI to generate content
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Generate a simple ESG report for Apple in 2023."}
        ]
    )

    return response['choices'][0]['message']['content']

# Function to generate PDF metadata
def generate_pdf_metadata():
    return {
        "title": "Apple Inc. ESG Score Report 2023",
        "author": "Generated by OpenAI",
        "subject": "ESG Report",
        "keywords": "ESG, Apple, Report, 2023",
        "creation_date": datetime.datetime.now().strftime("%Y-%m-%d"),
        "modification_date": datetime.datetime.now().strftime("%Y-%m-%d"),
        "producer": "FPDF"
    }

# Function to create PDF with content and metadata
def create_pdf(content, metadata):
    # Create PDF
    pdf = FPDF()
    
    # Add metadata
    pdf.set_title(metadata["title"])
    pdf.set_author(metadata["author"])
    pdf.set_subject(metadata["subject"])
    pdf.set_keywords(metadata["keywords"])

    # Add a page
    pdf.add_page()

    # Add title
    pdf.set_font("Arial", 'B', 16)
    pdf.cell(200, 10, txt=metadata["title"], ln=True, align='C')

    # Add content
    pdf.set_font("Arial", '', 12)
    pdf.multi_cell(0, 10, content)

    # Save PDF
    pdf.output("apple_esg_report_2023.pdf")

# Main function
def main():
    # Step 1: Get report content from OpenAI
    report_content = generate_report_content()

    # Step 2: Generate PDF metadata
    metadata = generate_pdf_metadata()

    # Step 3: Create the PDF report
    create_pdf(report_content, metadata)

main()


APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742
