In [19]:
import pdfplumber

# ESG vocabulary: a page counts as ESG-relevant when any of these terms
# appears in it (the lookup further down is case-insensitive).
esg_keywords = [
    # Umbrella terms
    "ESG",
    "Environmental",
    "Social",
    "Governance",
    # Environment
    "Sustainability",
    "Climate",
    "Carbon",
    # People
    "Diversity",
    "Equity",
    "Inclusion",
    "Social Impact",
    # Resources and footprint
    "Emissions",
    "Water Stewardship",
    "Waste Reduction",
    "Renewable",
    # Conduct
    "Human Rights",
    "Corporate Responsibility",
]

# Annual report PDF to scan (relative to the notebook's working directory)
pdf_path = "2024-Annual-Report.pdf"

def extract_esg_content(pdf_path, keywords):
    """Collect the text of every PDF page that mentions a keyword.

    Args:
        pdf_path: Path of the PDF, opened with ``pdfplumber.open``.
        keywords: Keyword strings, matched as case-insensitive substrings
            of each page's extracted text.

    Returns:
        list[str]: The full text of each matching page, in page order.
        A page is included at most once, however many keywords it contains.
    """
    matching_pages = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                continue  # page yielded no text; nothing to search
            haystack = page_text.lower()
            if any(kw.lower() in haystack for kw in keywords):
                matching_pages.append(page_text)
    return matching_pages

# Extract ESG content: one list entry per page whose text matched a keyword
esg_data = extract_esg_content(pdf_path, esg_keywords)

# Save extracted content to a text file, with a blank line between pages.
# Note: the file is written (possibly empty) even when no page matched.
output_path = "ESG_Content_Extracted.txt"
with open(output_path, "w", encoding="utf-8") as file:
    file.write("\n\n".join(esg_data))

# Output file location
print(f"ESG-related content extracted and saved to: {output_path}")

ESG-related content extracted and saved to: ESG_Content_Extracted.txt


In [20]:
import pdfplumber
import spacy
import pandas as pd

# Load the spaCy NER model once at module level; extract_entities() in this
# cell calls this shared `nlp` object rather than loading the model per call.
nlp = spacy.load("en_core_web_sm")

# Terms that mark a page as ESG-related (compared case-insensitively)
esg_keywords = [
    "ESG",
    "Environmental",
    "Social",
    "Governance",
    "Sustainability",
    "Climate",
    "Carbon",
    "Diversity",
    "Equity",
    "Inclusion",
    "Social Impact",
    "Emissions",
    "Water Stewardship",
    "Waste Reduction",
    "Renewable",
    "Human Rights",
    "Corporate Responsibility",
]

def extract_esg_text(pdf_path, keywords):
    """Return the text of all keyword-bearing pages as one string.

    Args:
        pdf_path: Path of the PDF, opened with ``pdfplumber.open``.
        keywords: Keyword strings, matched as case-insensitive substrings.

    Returns:
        str: Texts of the matching pages joined with a single space, in
        page order; an empty string when no page matches.
    """
    def mentions_keyword(page_text):
        lowered = page_text.lower()
        return any(kw.lower() in lowered for kw in keywords)

    selected = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # Pages with no extractable text are skipped outright.
            if page_text and mentions_keyword(page_text):
                selected.append(page_text)
    return " ".join(selected)  # Combine all ESG-related text

def extract_entities(text):
    """Run spaCy NER over ``text`` and bucket the entities of interest.

    Entities labelled ``ORG``, ``GPE`` and ``LAW`` are collected under the
    keys ``"Organization"``, ``"Location"`` and ``"Regulation"``; every
    other label is ignored.

    Duplicates are removed while keeping first-seen order, so repeated runs
    on the same text give the same lists (the previous ``list(set(...))``
    ordering depended on string hashing and could change between kernel
    sessions). Whitespace inside an entity is collapsed to single spaces so
    a name wrapped across lines dedupes with its unwrapped form.

    Args:
        text: The text to analyse with the module-level ``nlp`` pipeline.

    Returns:
        dict[str, list[str]]: Unique entity strings per category.
    """
    label_to_bucket = {
        "ORG": "Organization",
        "GPE": "Location",
        "LAW": "Regulation",
    }
    # Plain dicts act as insertion-ordered sets here.
    seen = {"Organization": {}, "Location": {}, "Regulation": {}}

    for ent in nlp(text).ents:
        bucket = label_to_bucket.get(ent.label_)
        if bucket is None:
            continue
        cleaned = " ".join(ent.text.split())
        if cleaned:
            seen[bucket][cleaned] = None

    return {bucket: list(values) for bucket, values in seen.items()}

def main():
    """Extract ESG pages from the report, run NER, and write a CSV.

    Prints progress messages. If no ESG text is found, it reports that and
    writes nothing. Otherwise the entities are saved, one column per
    category, to ``Extracted_ESG_Entities.csv``.
    """
    pdf_path = "2024-Annual-Report.pdf"  # Change to your file path
    print("Extracting ESG-related content...")
    esg_text = extract_esg_text(pdf_path, esg_keywords)

    if esg_text:
        print("Applying Named Entity Recognition (NER)...")
        entities_by_type = extract_entities(esg_text)

        # Wrapping each list in a Series lets columns of unequal length
        # share one DataFrame (shorter columns are padded with NaN).
        columns = {label: pd.Series(values) for label, values in entities_by_type.items()}
        entity_table = pd.DataFrame(columns)

        output_csv = "Extracted_ESG_Entities.csv"
        entity_table.to_csv(output_csv, index=False)
        print(f"Extraction complete. Data saved to {output_csv}")
    else:
        print("No ESG-related content found in the document.")

# Run the script
if __name__ == "__main__":
    main()


Extracting ESG-related content...
Applying Named Entity Recognition (NER)...
Extraction complete. Data saved to Extracted_ESG_Entities.csv


In [21]:
import pdfplumber
import textwrap
import pandas as pd
from transformers import pipeline

# Load NLP summarization model once at module level; summarize_text() in
# this cell calls this shared `summarizer` pipeline for every chunk.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# ESG vocabulary used to select pages (matching ignores case)
esg_keywords = [
    # Core pillars
    "ESG", "Environmental", "Social", "Governance",
    # Environmental topics
    "Sustainability", "Climate", "Carbon",
    # Social topics
    "Diversity", "Equity", "Inclusion", "Social Impact",
    # Footprint and resources
    "Emissions", "Water Stewardship", "Waste Reduction", "Renewable",
    # Conduct
    "Human Rights", "Corporate Responsibility",
]

def extract_esg_text(pdf_path, keywords):
    """Concatenate the text of every page that mentions a keyword.

    Args:
        pdf_path: Path of the PDF, opened with ``pdfplumber.open``.
        keywords: Keyword strings; a page qualifies when any of them occurs
            in its text, ignoring case.

    Returns:
        str: Qualifying page texts joined by single spaces, or ``""`` when
        no page qualifies.
    """
    hits = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            content = page.extract_text()
            if not content:
                continue
            folded = content.lower()
            for term in keywords:
                if term.lower() in folded:
                    hits.append(content)
                    break  # one hit is enough; keeps each page unique
    return " ".join(hits)  # Combine all ESG-related text

def summarize_text(text):
    """Summarize ``text`` chunk by chunk with the module-level ``summarizer``.

    The text is split by ``textwrap.wrap`` into chunks of at most 1024
    characters, and each chunk is summarized separately. Length limits are
    derived from the chunk's word count: ``max_length`` is capped at 150 and
    ``min_length`` is at least 50.

    Anything shorter than 50 words is passed through unchanged, both the
    whole input and any individual chunk (typically the last one). Without
    the per-chunk guard, a short chunk would be requested with
    ``min_length=50`` but ``max_length`` below 50, which is an infeasible
    constraint for generation.

    Args:
        text: The text to summarize.

    Returns:
        str: One line per chunk (summary or pass-through text), joined with
        newlines; ``text`` itself if it has fewer than 50 words.
    """
    min_words = 50  # below this a summary would not be shorter than the input
    if len(text.split()) < min_words:
        return text

    pieces = []
    for chunk in textwrap.wrap(text, width=1024):  # Avoid input size issues
        word_count = len(chunk.split())
        if word_count < min_words:
            # Too short to summarize meaningfully; keep it verbatim so that
            # min_length can never exceed max_length below.
            pieces.append(chunk)
            continue

        max_len = min(word_count, 150)   # never ask for more than the input
        min_len = max(50, max_len // 2)  # <= max_len because max_len >= 50

        result = summarizer(chunk, max_length=max_len, min_length=min_len, do_sample=False)
        pieces.append(result[0]['summary_text'])

    return "\n".join(pieces)

def main():
    """Extract ESG pages from the report, summarize them, and save the result.

    Prints progress messages. If no ESG text is found, it reports that and
    writes nothing. Otherwise the summary is written to ``ESG_Summary.txt``
    and its first 1000 characters are printed as a preview.
    """
    pdf_path = "2024-Annual-Report.pdf"  # Update with your actual file path
    print("\nExtracting ESG-related content...\n")
    esg_text = extract_esg_text(pdf_path, esg_keywords)

    if esg_text:
        print("Summarizing key ESG insights...\n")
        summary = summarize_text(esg_text)

        # Persist the full summary before showing the preview
        output_txt = "ESG_Summary.txt"
        with open(output_txt, "w", encoding="utf-8") as out_file:
            out_file.write(summary)

        preview = summary[:1000]  # first 1000 characters only
        print("Summary Preview:\n")
        print(preview)
        print(f"\nSummary saved to: {output_txt}")
    else:
        print("No ESG-related content found.")

# Run the script
if __name__ == "__main__":
    main()


Device set to use mps:0



Extracting ESG-related content...

Summarizing key ESG insights...

Summary Preview:

The Walt Disney Company and SUBSIDIARIES report on its financial condition and results of operations for the year ended December 31, 2013. The report includes the following sections: Risk Factors, Legal Proceedings, Properties, Mine Safety Disclosures, Information About our Executive Officers and Corporate Governance. The table of contents of the report is divided into the following parts: Table of Contents.
The Walt Disney Company, together with the subsidiaries through which businesses are conducted (the Company), is a diversified worldwide entertainment company with operations in three segments: Entertainment, Sports and Experiences. The Entertainment segment generally encompasses the Company’s non-sports focused global film and episodic content production and distribution activities. The terms “Company”, ‘we’, “our” and “us” are used in this report to refer collectively to the parent company and 

In [29]:
import re
from collections import Counter

import pandas as pd
import pdfplumber
import spacy
from textblob import TextBlob
from transformers import pipeline

# Load NLP models once at module level so the helpers below can share them:
# `nlp` is called by extract_entities(), `summarizer` by summarize_esg().
nlp = spacy.load("en_core_web_sm")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Keywords that identify ESG-related pages
esg_keywords = [
    "ESG", "Environmental", "Social", "Governance", "Sustainability",
    "Climate", "Carbon", "Diversity", "Equity", "Inclusion",
    "Social Impact", "Emissions", "Water Stewardship", "Waste Reduction",
    "Renewable", "Human Rights", "Corporate Responsibility",
]

def extract_esg_text(pdf_path, keywords):
    """Gather the text of keyword-bearing pages into a single string.

    Args:
        pdf_path: Path of the PDF, opened with ``pdfplumber.open``.
        keywords: Keyword strings, looked up as case-insensitive substrings
            of each page's text.

    Returns:
        str: Matching page texts in page order, separated by one space;
        empty when nothing matches.
    """
    kept_pages = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            raw = page.extract_text()
            if not raw:
                continue  # no extractable text on this page
            lowered = raw.lower()
            if any(keyword.lower() in lowered for keyword in keywords):
                kept_pages.append(raw)
    return " ".join(kept_pages)  # Combine ESG text

def count_esg_terms(text, keywords=None, top_n=5):
    """Count how often each ESG keyword occurs in ``text``.

    Matching is case-insensitive and on whole words or phrases, so
    "climate," counts as "climate" but "acclimate" does not. The words of a
    multi-word keyword such as "Social Impact" may be separated by any
    whitespace, including a line break. A single-word keyword that is also
    part of a phrase ("Social" in "Social Impact") is counted for both.

    The earlier version compared lower-cased tokens against the keyword
    list in its original capitalisation, so it never matched anything.

    Args:
        text: Text to search.
        keywords: Keyword strings to count. Defaults to the module-level
            ``esg_keywords`` list.
        top_n: Number of most frequent terms to return (default 5).

    Returns:
        list[tuple[str, int]]: ``(lower-cased keyword, count)`` pairs for
        keywords that occur at least once, most frequent first.
    """
    if keywords is None:
        keywords = esg_keywords

    term_counts = Counter()
    for keyword in keywords:
        words = keyword.lower().split()
        if not words:
            continue  # ignore empty keywords
        # (?<!\w) / (?!\w) act as word boundaries that also work when a
        # keyword starts or ends with a non-word character.
        pattern = r"(?<!\w)" + r"\s+".join(re.escape(w) for w in words) + r"(?!\w)"
        occurrences = len(re.findall(pattern, text, flags=re.IGNORECASE))
        if occurrences:
            term_counts[" ".join(words)] += occurrences

    return term_counts.most_common(top_n)  # Top ESG terms

def extract_entities(text):
    """Return the five most frequently mentioned ORG/GPE/LAW entities.

    Args:
        text: Text to run through the module-level ``nlp`` pipeline.

    Returns:
        list[tuple[str, int]]: ``(entity text, mention count)`` pairs, most
        frequent first; entities with other labels are ignored.
    """
    wanted_labels = ["ORG", "GPE", "LAW"]
    mention_counts = Counter(
        ent.text for ent in nlp(text).ents if ent.label_ in wanted_labels
    )
    return mention_counts.most_common(5)  # Top 5 entities

def analyze_sentiment(text):
    """Label the overall sentiment of ``text`` from its TextBlob polarity.

    Args:
        text: Text to score.

    Returns:
        str: ``"Positive Sentiment"`` when the polarity is above 0.1,
        ``"Negative Sentiment"`` when it is below -0.1, and
        ``"Neutral Sentiment"`` otherwise (both thresholds count as neutral).
    """
    polarity = TextBlob(text).sentiment.polarity

    if polarity > 0.1:
        return "Positive Sentiment"
    if polarity < -0.1:
        return "Negative Sentiment"
    return "Neutral Sentiment"

def summarize_esg(text):
    """Produce one short summary of ``text`` with the module-level ``summarizer``.

    Text under 50 words is returned unchanged. Longer text is summarized in
    a single call with ``truncation=True``. The input here is the
    concatenation of every matching page, which can be far longer than the
    model accepts in one pass; truncation asks the pipeline to cut the input
    to the model's limit instead of passing it through as-is.

    NOTE(review): with truncation, only the leading portion of a long
    document informs the summary. Summarize in chunks first if full coverage
    is needed.

    Args:
        text: The ESG text to summarize.

    Returns:
        str: The summary text, or ``text`` itself when it is too short.
    """
    if len(text.split()) < 50:
        return text  # Return as-is if too short

    summary = summarizer(
        text,
        max_length=150,
        min_length=50,
        do_sample=False,
        truncation=True,  # keep over-long input within the model's limit
    )
    return summary[0]['summary_text']

# Generate structured output in Markdown format
def generate_output(pdf_path):
    print("Extracting ESG-related content...\n")
    esg_text = extract_esg_text(pdf_path, esg_keywords)

    if not esg_text:
        print("No ESG-related content found.")
        return

    # Compute results
    esg_terms = count_esg_terms(esg_text)
    named_entities = extract_entities(esg_text)
    sentiment = analyze_sentiment(esg_text)
    esg_summary = summarize_esg(esg_text)

     # Format output properly for Markdown
    output = f"""
### 1. Most Common ESG Terms Found
```diff
{"".join(f"- {term.capitalize()}: {count} occurrences\n" for term, count in esg_terms)}"""
    {"".join(f"- {entity} ({count} mentions)\n" for entity, count in named_entities)}


Device set to use mps:0
