In [1]:
import os
import PyPDF2
import openai
import faiss
from fpdf import FPDF
import tiktoken
from dotenv import load_dotenv
import json
from langchain_openai.embeddings.base import OpenAIEmbeddings
import numpy as np
import re
import string
from langchain.vectorstores import FAISS

from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
import markdown
import pdfkit

In [2]:
load_dotenv()

True

In [3]:
def extract_text_from_pdfs(pdf_folder):
    documents = []
    for filename in os.listdir(pdf_folder):
        if filename.endswith('.pdf'):
            # Assume filename format: CompanyName_Year.pdf
            pdf_name = os.path.splitext(filename)[0]  # Remove .pdf extension
            parts = pdf_name.split('_')
            if len(parts) >= 2:
                company = '_'.join(parts[:-1])
                year = parts[-1]
            else:
                company = pdf_name
                year = 'Unknown'
            pdf_path = os.path.join(pdf_folder, filename)
            with open(pdf_path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                text = ''
                for page in reader.pages:
                    text += page.extract_text()
            # Store text with metadata
            doc = {
                'company': company,
                'year': year,
                'text': text
            }
            documents.append(doc)
    return documents

In [4]:
pdf_folder = 'data/sample_reports'
documents = extract_text_from_pdfs(pdf_folder)

In [6]:
def split_text_into_chunks(doc, max_tokens=500):
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(doc['text'])
    chunks = []
    for i in range(0, len(tokens), max_tokens):
        chunk_tokens = tokens[i:i + max_tokens]
        decoded_chunk = tokenizer.decode(chunk_tokens)
        chunk = {
            'company': doc['company'],
            'year': doc['year'],
            'text': decoded_chunk
        }
        chunks.append(chunk)
    return chunks

In [7]:
def clean_text(text):
    # Remove special characters and extra whitespace
    text = text.replace('\n', ' ').replace('\t', ' ')
    text = re.sub(' +', ' ', text)
    # Optionally remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    return text.strip()

In [8]:
all_chunks = []
for doc in documents:
    chunks = split_text_into_chunks(doc)
    for chunk in chunks:
        chunk['text'] = clean_text(chunk['text'])
    all_chunks.extend(chunks)

In [9]:
def save_chunks(chunks, filepath):
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(chunks, f, ensure_ascii=False, indent=4)

def load_chunks(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        chunks = json.load(f)
    return chunks

In [10]:
def get_embeddings(chunks):
    embeddings_model = OpenAIEmbeddings()  
    texts = [chunk['text'] for chunk in chunks]
    embeddings = embeddings_model.embed_documents(texts)
    ids = []
    for idx, chunk in enumerate(chunks):
        ids.append(idx)
        chunk['id'] = idx 
    return embeddings, ids, embeddings_model



In [11]:
embeddings, ids, embeddings_model = get_embeddings(all_chunks)

In [12]:
chunks_file = 'data/chunks.json'
save_chunks(all_chunks, chunks_file)
print(f"Saved chunks to {chunks_file}")

Saved chunks to data/chunks.json


In [13]:
def save_embeddings(embeddings, ids, embeddings_path, ids_path):
    np.save(embeddings_path, embeddings)
    np.save(ids_path, ids)

def load_embeddings(embeddings_path, ids_path):
    embeddings = np.load(embeddings_path)
    ids = np.load(ids_path)
    return embeddings, ids

In [14]:
embeddings_file = 'data/embedings.npy'
ids_file = 'data/ids.npy'

save_embeddings(embeddings, ids, embeddings_file, ids_file)
print(f"Saved embeddings to {embeddings_file} and IDs to {ids_file}")

Saved embeddings to data/embedings.npy and IDs to data/ids.npy


In [15]:
def build_faiss_index(embeddings, ids):
    dimension = len(embeddings[0])
    index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
    index.add_with_ids(np.array(embeddings).astype('float32'), np.array(ids))
    return index

In [16]:
print("Building the FAISS index...")
index = build_faiss_index(embeddings, ids)


Building the FAISS index...


In [17]:
def build_faiss_vectorstore(chunks, embeddings_model):
    texts = [chunk['text'] for chunk in chunks]
    metadatas = [{'company': chunk['company'], 'year': chunk['year'], 'id': chunk['id']} for chunk in chunks]
    vector_store = FAISS.from_texts(texts, embeddings_model, metadatas=metadatas)
    return vector_store

In [18]:
def retrieve_relevant_chunks(prompt, vector_store, embeddings_model, company, year, k):
    # Use metadata filtering
    filter_criteria = {'company': company, 'year': year}
    relevant_docs = vector_store.similarity_search(prompt, k=k, filter=filter_criteria)
    relevant_chunks = [{'company': doc.metadata['company'],
                        'year': doc.metadata['year'],
                        'text': doc.page_content,
                        'id': doc.metadata['id']} for doc in relevant_docs]
    return relevant_chunks


In [19]:
vectorstore_file = 'data/vectorstore.pkl'

print("Building the vector store with metadata...")
vector_store = build_faiss_vectorstore(all_chunks, embeddings_model)
# Save the vector store
vector_store.save_local(vectorstore_file)
print(f"Saved vector store to {vectorstore_file}")

Building the vector store with metadata...
Saved vector store to data/vectorstore.pkl


In [20]:
all_chunks1 = load_chunks('data/chunks.json')
embeddings1, ids1 =load_embeddings('data/embedings.npy', 'data/ids.npy')
index = build_faiss_index(embeddings1, ids1)

In [134]:

company = "Apple"
year = "2021"
prompt = f"Generate an ESG report focusing on environmental factors of {company} for {year}."
# Retrieve relevant chunks

embeddings_model1 = OpenAIEmbeddings()
print("Retrieving relevant chunks...")
relevant_chunks = retrieve_relevant_chunks(prompt, vector_store, embeddings_model, company, year, k=10)
relevant_chunks

Retrieving relevant chunks...


[{'company': 'Apple',
  'year': '2021',
  'text': 'we\xa0can offer technologically innovative products and services while conserving and enhancing resources for future generations apple strives for continuous improvement in our environmental health and safety management systems and in the environmental quality of our products processes and services guiding principles meet or exceed all applicable environmental health and safety requirements we will evaluate our ehs performance by monitoring ongoing performance results and through periodic management reviews where laws and regulations do not provide adequate controls we will adopt our own standards to protect human health and the environment support and promote sound scientific principles and fiscally responsible public policy that enhance environmental quality health and safety advocate the adoption of prudent environmental health and safety principles and practices by our contractors vendors and suppliers communicate environmental hea

In [21]:
section_descriptions = {
    "General Information": "Information about the company's name, legal form, country of incorporation, and financial year.",
    "Scope of the Report": "Details about the boundaries of the report, including entities included and time period covered.",
    "Governance Structure": "This section covers the company's governance structure, including its board of directors, management, and internal control systems.",
    "Business Strategy": "This section describes the company's strategy, including how it aligns with sustainability objectives.",
    "Sustainability Strategy": "This section describes the company's strategy towards becoming more sustainable and beneficial for its stakeholders, including environment and larger society as a whole",
    "Business Model": "This section describes the company's business model, and how it affects environment and society at large.",
    "Overall Environmental Performance": "This section covers the company's environmental impacts and performance, including its greenhouse gas emissions, water use, waste generation, and biodiversity conservation efforts.",
    "Greenhouse Gas Emissions": "Details about its greenhouse gas emissions",
    "Energy": "Details about its total energy consumption, energy intensity of its business model",
    "Water and Marine Resources": "Details about the company's intensity of water usage and potential pollution",
    "Biodiversity and Ecosystems": "Details about how the company affects biological environment and biodiveristy",
    "Circular Economy": "Details about how the company integrates itself into the circular economy",
    "Overall Social Performance": "This section covers the company's social impacts and performance, including its human rights record, labor practices, community engagement, and diversity and inclusion initiatives.",
    "Workforce": "Details on total employee headcount, turnover rate, gender pay gap and diversity",
    "Occupational Health and Safety": "Details on work-related injuries and fatalities, as well as Lost time injury frequency rate (LTIFR)",
    "Training and Development": "Details on average hours of training per employee, as well as details on how regular performance reviews are",
    "Human Rights": "Details about human rights violations the company and measures to prevent them",
    "Overall Governance Performance": "This section covers the company's governance performance, including its risk management, stakeholder engagement, and internal control systems.",
    "Business Conduct": "Details about whistleblowing mechanisms and anti-corruption and anti-bribery measures",
    "Corporate Governance": "Details about board composition and diversity, as well as executive compensation linked to sustainability measures",
    "Risk Management": "Details on sustainability risks and opportunities, as well as integration of sustainability factors into risk management process",
    "ESG reporting due diligence": "This section covers the company's due diligence processes for identifying and addressing risks in its value chain.",
    "Assurance": "This section describes the assurance procedures that the company has in place to verify the accuracy and completeness of its sustainability report."
}

In [22]:
def generate_report(prompt, context_chunks):
    # Combine the text from the relevant chunks
    context = "\n\n".join([chunk['text'] for chunk in context_chunks])
    messages = [
        SystemMessage(content=f"You are an expert in generating ESG reports based on provided information.\n You will have to create a report in Markdown format. Based on the data provided by the user, split the data into these main sections: {section_descriptions}. Please also include additional data available to you which is relevant to the context. If there is not data available for a specific section, please state that explicitly!"),
        HumanMessage(content=f"{context}\n\nPlease generate an ESG report based on the above information. {prompt}")
    ]

    chat = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.2,
    max_tokens=2000,
    openai_api_key=os.getenv('OPENAI_API_KEY')  # Use environment variable for API key
    )

    response = chat(messages)
    return response.content


In [23]:
def generate_markdown(header, footer, content1):
    """Generates a Markdown document from given header, footer, and content strings.
    Args:
        header (str): The header text.
        footer (str): The footer text.
        content (str): The report content
    Returns:
        str: The generated Markdown content.
    """
    markdown = f"# {header}\n\n"  # Add the header
    # Add content for each page, separating them with page breaks (---)
    for content in [content1]:
        markdown += f"{content}\n\n---\n\n"
    markdown += f"\n## {footer}"  # Add the footer
    return markdown

In [24]:
reports = [
    {
        "company": "Apple",
        "year": 2019
    },
    {
        "company": "Apple",
        "year": 2021
    },
    {
        "company": "Apple",
        "year": 2023
    },
    {
        "company": "Amazon",
        "year": 2019
    },
    {
        "company": "Amazon",
        "year": 2021
    },
    {
        "company": "Amazon",
        "year": 2023
    },
    {
        "company": "Google",
        "year": 2019
    },
    {
        "company": "Google",
        "year": 2021
    },
    {
        "company": "Google",
        "year": 2023
    },
    {
        "company": "BP",
        "year": 2019
    },
    {
        "company": "BP",
        "year": 2021
    },
    {
        "company": "BP",
        "year": 2023
    }
]

In [25]:
for report in reports:
    company = report['company']
    year = report['year']

    prompt = f"Generate an ESG report focusing on environmental factors of {company} for {year}."

    relevant_chunks = retrieve_relevant_chunks(prompt, vector_store, embeddings_model, company, year, k=100)
    report = generate_report(prompt, relevant_chunks)
    header = "Regulated Reports"
    footer = "Copyright © 2024"
    markdown_output = generate_markdown(header, footer, report)
    #print(markdown_output)
    with open(f"data/final_reports/{company}_{year}.md", "w") as file:
        file.write(markdown_output) 
    

  chat = ChatOpenAI(
  response = chat(messages)


In [27]:
for report in reports:
    company = report['company']
    year = report['year']

    with open(f'data/final_reports/{company}_{year}.md', 'r') as md_file:
        markdown_content = md_file.read()

    html_content = markdown.markdown(markdown_content)

    # Save the HTML content into a temporary file
    with open('temp.html', 'w') as html_file:
        html_file.write(html_content)

    pdfkit.from_file('temp.html', f'./data/final_reports/{company}_{year}.pdf')
