<a href="https://colab.research.google.com/github/vdrakopoulou/vdrakopoulou/blob/main/esg_extract_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1) Install dependency
!pip install PyPDF2

# 2) Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# 3) Imports and path setup
from pathlib import Path
import csv
import re
from PyPDF2 import PdfReader

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Mounted at /content/drive


In [4]:
# >>> CHANGE THESE ONLY IF YOUR FOLDER NAMES ARE DIFFERENT <<<
REPORTS_DIR = Path("/content/drive/MyDrive/adx_ESG_reports")
OUTPUT_CSV = Path("/content/drive/MyDrive/data/raw/esg_messages_raw.csv")

# 4) ESG keyword lists
E_KEYWORDS = [
    "environment", "environmental", "emissions", "carbon", "co2", "ghg",
    "greenhouse gas", "climate", "renewable", "energy efficiency",
    "biodiversity", "waste", "recycling", "water usage", "water use",
    "pollution", "sustainable", "sustainability", "net zero", "decarbonization"
]

S_KEYWORDS = [
    "social", "community", "human rights", "labor", "labour", "workplace",
    "health and safety", "occupational safety", "safety incident",
    "diversity", "equity", "inclusion", "dei", "employee engagement",
    "training hours", "fair wage", "child labor", "forced labor",
    "local communities", "stakeholder", "philanthropy", "donation"
]

G_KEYWORDS = [
    "governance", "board of directors", "board", "audit committee",
    "remuneration committee", "compensation committee", "ethics",
    "code of conduct", "anti-corruption", "anticorruption", "anti bribery",
    "whistleblowing", "whistle-blowing", "compliance", "risk management",
    "internal control", "shareholder rights", "transparency"
]

ALL_KEYWORDS = [kw.lower() for kw in (E_KEYWORDS + S_KEYWORDS + G_KEYWORDS)]

In [3]:
from pathlib import Path

esg_reports_path = Path('/content/drive/MyDrive/adx_ESG_reports')

if esg_reports_path.exists() and esg_reports_path.is_dir():
    print(f"Listing contents of {esg_reports_path}:")
    for item in esg_reports_path.iterdir():
        print(f"- {item.name}{'/' if item.is_dir() else ''}")
else:
    print(f"Error: {esg_reports_path} does not exist or is not a directory.")

Listing contents of /content/drive/MyDrive/adx_ESG_reports:
- ADNOC.pdf
- ADNOC Distribution ESG Report 2024-English-F.pdf
- BILDCO.pdf
- ADNOC_LS Sustainability Report 2024 English.pdf
- ADPORTS_Sustainability-Report-2024.pdf
- KICO_SUSTAINABILITY_REPORT_2024.pdf
- APEX.pdf
- ADCB 2024.pdf
- ADIB ESG report 2023.pdf
- Abu Dhabi Aviation Sustainability-Report-2024-English.pdf
- TAQA Announces 2030 Emissions Reductions Targets as part of New ESG Strategy (English) 13.10.22.pdf
- ADNH 2024.pdf
- ADNIC 2022.pdf
- ADNOC DISTRIBUTION 2024.pdf
- ADIB Sustainability Report 2024 EN (1).pdf
- TKFL_Sustainability_Report_EN_2024.pdf
- ADSB_sustainability_Report_2023.pdf
- ALAIN.pdf
- ADNIC_sustainability-report-english-2023.pdf
- DAFRA_ADIC_Sustainability_ESG_Report_2024_Eng.pdf
- AFNIC 2021_Integrated Report.pdf
- AKIC_Al-Khayyat-Investments-Sustainability-Report-2024.pdf
- ASM_2024-esg-supplement-2.pdf
- AWNIC_2022.pdf
- YAHSAT_ESG Policy 23rd Aug 2023.pdf
- ALPHADHABI_sustainability_report_202

In [None]:
import pandas as pd

df_esg_sentences = pd.read_csv(OUTPUT_CSV)

print(f"Total ESG sentences loaded into DataFrame: {len(df_esg_sentences)}")
display(df_esg_sentences.head())

In [6]:
# 6) Main extraction logic
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
file_exists = OUTPUT_CSV.exists()
total_pdfs = 0
total_sentences = 0

with OUTPUT_CSV.open("a", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)

    # Write header only if file doesn't exist yet
    if not file_exists:
        writer.writerow(["source_file", "sentence"])

    pdf_files = sorted(REPORTS_DIR.glob("*.pdf"))
    print(f"Found {len(pdf_files)} PDF files in {REPORTS_DIR}")

    for pdf_path in pdf_files:
        total_pdfs += 1
        print(f"Processing: {pdf_path.name}")
        esg_sentences = extract_esg_sentences_from_pdf(pdf_path)
        print(f"  -> Found {len(esg_sentences)} ESG-related sentences")
        total_sentences += len(esg_sentences)
        for sent in esg_sentences:
            writer.writerow([pdf_path.name, sent])

print("\nDone!")
print(f"Processed PDFs: {total_pdfs}")
print(f"Total ESG sentences extracted: {total_sentences}")
print(f"CSV saved to: {OUTPUT_CSV}")

Found 98 PDF files in /content/drive/MyDrive/adx_ESG_reports
Processing: 2POINTZERO_ESG_Report_English_2024.pdf
  -> Found 407 ESG-related sentences
Processing: ADCB 2024.pdf
  -> Found 705 ESG-related sentences
Processing: ADIB ESG report 2023.pdf
  -> Found 503 ESG-related sentences
Processing: ADIB Sustainability Report 2024 EN (1).pdf
  -> Found 751 ESG-related sentences
Processing: ADNH 2024.pdf
  -> Found 186 ESG-related sentences
Processing: ADNIC 2022.pdf
  -> Found 163 ESG-related sentences
Processing: ADNIC_sustainability-report-english-2023.pdf
  -> Found 280 ESG-related sentences
Processing: ADNOC DISTRIBUTION 2024.pdf
  -> Found 612 ESG-related sentences
Processing: ADNOC Distribution ESG Report 2024-English-F.pdf
  -> Found 612 ESG-related sentences
Processing: ADNOC Drilling Annual Report 2024.pdf
  -> Found 763 ESG-related sentences
Processing: ADNOC.pdf
  -> Found 13 ESG-related sentences
Processing: ADNOCGAS_Sustainability_Report_2024.pdf
  -> Found 1524 ESG-related s

[0, IndirectObject(152, 0, 132523756148048)]
[0, IndirectObject(158, 0, 132523756148048)]
[0, IndirectObject(164, 0, 132523756148048)]
[0, IndirectObject(170, 0, 132523756148048)]


  -> Found 231 ESG-related sentences
Processing: AMR Corporate Sustainability.pdf
  -> Found 42 ESG-related sentences
Processing: APEX.pdf
  -> Found 0 ESG-related sentences
Processing: ASM_2024-esg-supplement-2.pdf
  -> Found 113 ESG-related sentences
Processing: AWNIC_2022.pdf
  -> Found 184 ESG-related sentences
Processing: Abu Dhabi Aviation Sustainability-Report-2024-English.pdf
  -> Found 230 ESG-related sentences
Processing: Agility-Global-Sustainability-Report_Restated_July-17-2025_EN.pdf
  -> Found 389 ESG-related sentences
Processing: Alef_Education_Integrated_Annual_Report_2024.pdf
  -> Found 255 ESG-related sentences
Processing: Alpha Data PJSC - Financials 12-2025.pdf
  -> Found 46 ESG-related sentences
Processing: BILDCO.pdf
  -> Found 132 ESG-related sentences
Processing: BOS-ESG-Report-2024.pdf
  -> Found 237 ESG-related sentences
Processing: Borouge Sustainability Report 2023.pdf
  -> Found 584 ESG-related sentences
Processing: DAFRA_ADIC_Sustainability_ESG_Report_2024

In [5]:
# 5) Helper functions
def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract all text from a PDF file."""
    reader = PdfReader(str(pdf_path))
    texts = []
    for page in reader.pages:
        try:
            page_text = page.extract_text() or ""
        except Exception:
            page_text = ""
        texts.append(page_text)
    return "\n".join(texts)


def split_into_sentences(text: str):
    """Rough sentence splitter using punctuation."""
    text = re.sub(r"\s+", " ", text)  # normalize whitespace
    sentences = re.split(r"(?<=[.!?])\s+", text)
    return [s.strip() for s in sentences if len(s.strip()) > 20]


def is_esg_sentence(sentence: str) -> bool:
    """Check if a sentence contains any ESG-related keyword."""
    s_lower = sentence.lower()
    return any(kw in s_lower for kw in ALL_KEYWORDS)


def extract_esg_sentences_from_pdf(pdf_path: Path):
    full_text = extract_text_from_pdf(pdf_path)
    sentences = split_into_sentences(full_text)
    return [s for s in sentences if is_esg_sentence(s)]

In [2]:
from pathlib import Path

drive_path = Path('/content/drive/MyDrive')

if drive_path.exists():
    print(f"Listing contents of {drive_path}:")
    for item in drive_path.iterdir():
        print(f"- {item.name}{'/' if item.is_dir() else ''}")
else:
    print(f"Error: {drive_path} does not exist. Please ensure Google Drive is mounted correctly.")

Listing contents of /content/drive/MyDrive:
- RESUME.zip
- Curriculum Vitae Drakopoulou 2023.docx
- RESUME2024.zip
- Violet's Publications.zip
- ACC4113 Case Study/
- Colab Notebooks/
- Copy of ORM_Book_Proposal.gdoc
- Copy of On-Demand Course Proposal Template (Make a Copy).gdoc
- RQ2 – AI Literacy and Metacognition: ChatGPT Survey  (File responses)/
- RQ2 – AI Literacy and Metacognition: ChatGPT Survey .gform
- RQ2-AI Literacy and Metacognition: ChatGPT Survey/
- adx_ESG_reports/
