In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell execution, so edits to the local project
# packages (models, agents, util) take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
from models import load_model
# Load the model registered under the key "llama_3_2_3b" (presumably a
# Llama 3.2 3B checkpoint — confirm in models.load_model). The resulting
# `model` object is handed to every agent constructed later in the notebook.
model = load_model("llama_3_2_3b")

In [4]:
from agents.search_agent import SearchAgent

# Research topic that drives every agent in this notebook.
topic = "ESG-based data for sentence-level classification"

# Build the agent first, then invoke it to obtain the search query string.
search_agent = SearchAgent(model, topic)
search_string = search_agent()
print(search_string)

(esg OR environmental social governance) AND (data sources OR dataset)


In [5]:
from util import WebSearch
from util import split_markdown
import os
from markitdown import MarkItDown

# Make sure the download folder exists before anything is fetched.
os.makedirs("tmp", exist_ok=True)

# Helper objects: the web search client and the document-to-markdown converter.
search = WebSearch()
md = MarkItDown()

# Resolve the generated search string into a list of candidate document URLs.
urls = search.get_urls(search_string)
print(f"fetched urls: {urls}")

fetched urls: ['https://www.dnb.com/content/dam/english/economic-and-industry-insight/DnB_eBook_A-Practical-Guide-to-ESG.pdf', 'https://www.spglobal.com/spdji/en/documents/additional-material/spdji-esg-metrics-reference-guide.pdf', 'https://edmcouncil.org/wp-content/uploads/2024/04/EDM-Council_Playbook_ESG-and-Sustainability-Data-Mgmt_April-2024.pdf']


In [16]:
import requests
import shutil

# Browser-like User-Agent (generated with
# https://phrasefix.com/tools/random-user-agent-generator/) sent with every
# request instead of the default python-requests one.
AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:97.0) Gecko/20100101 Firefox/97.0"
headers = {"User-Agent": AGENT}

# Seconds to wait for a server before giving up. Without a timeout,
# requests.get() can block forever on an unresponsive host and stall the
# whole notebook run.
REQUEST_TIMEOUT = 30

# Start from an empty download folder so files from earlier runs are not mixed
# into this one.
shutil.rmtree("tmp", ignore_errors=True)
os.makedirs("tmp", exist_ok=True)

for uid, url in enumerate(urls):
    # The file name is the URL's position in `urls`, so a failed download
    # leaves a gap in the numbering rather than shifting later files.
    out_path = os.path.join("tmp", f"{uid}.pdf")
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # Raise an error for bad status codes
    except requests.RequestException as e:
        # Best effort: report the failure (HTTP error, timeout, connection
        # problem) and carry on with the remaining URLs.
        print(f"Failed to download {url}: {e}")
        continue

    with open(out_path, "wb") as f:
        f.write(response.content)


Failed to download https://www.spglobal.com/spdji/en/documents/additional-material/spdji-esg-metrics-reference-guide.pdf: 403 Client Error: Forbidden for url: https://www.spglobal.com/spdji/en/documents/additional-material/spdji-esg-metrics-reference-guide.pdf


In [17]:
# Run util.split_markdown on the files in "tmp", writing its results to
# "tmp-markdown" (presumably one markdown chunk per output file — confirm in
# util.split_markdown).
# NOTE(review): the exact meaning of MAX_RESULTS is defined by split_markdown
# and is not visible from this notebook — confirm before tuning it.
split_markdown(
    pdf_folder="tmp",
    out_folder="tmp-markdown",
    MAX_RESULTS=100,
)

Extracting markdown:   0%|          | 0/2 [00:00<?, ?it/s]

Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data


In [18]:
from typing import List
from tqdm.notebook import tqdm
from agents.dataset_agent import DatasetAgent

language = "English"
dataset_agent = DatasetAgent(model, topic, language)

MARKDOWN_DIR = "tmp-markdown"

# Datasets the agent marked as relevant, collected across all markdown files.
filtered_datasets = []

# Every markdown file is sent to the agent exactly once. The previous version
# wrapped this in a loop over `urls` that never used the URL to select files,
# so each file was processed len(urls) times and every relevant dataset was
# appended repeatedly.
data_paths = sorted(os.listdir(MARKDOWN_DIR))
for data_path in tqdm(data_paths, desc="Processing markdown files"):
    md_file = os.path.join(MARKDOWN_DIR, data_path)
    datasets = dataset_agent(markdown_file_path=md_file)
    if not datasets:
        # The agent returned nothing for this file.
        continue
    # .get() tolerates model output that omits the "relevant" key; such
    # entries are treated as not relevant instead of raising KeyError.
    filtered_datasets.extend(
        dataset for dataset in datasets if dataset.get("relevant") == "yes"
    )

Processing URLs:   0%|          | 0/3 [00:00<?, ?it/s]

Processing URL 0 (https://www.dnb.com/content/dam/english/economic-and-industry-insight/DnB_eBook_A-Practical-…

Processing URL 1 (https://www.spglobal.com/spdji/en/documents/additional-material/spdji-esg-metrics-reference-…

Processing URL 2 (https://edmcouncil.org/wp-content/uploads/2024/04/EDM-Council_Playbook_ESG-and-Sustainabilit…

In [21]:
from typing import Literal
from agents.labeler_agent import LabelerAgent

# Configure the labeler with the topic and the datasets kept by the filtering
# step, then invoke it to obtain the label definitions.
labeler_agent = LabelerAgent(
    model,
    topic=topic,
    datasets=filtered_datasets,
    num_labels=4,
)
generated_labels = labeler_agent()
print(generated_labels)

[{'label_name': 'Climate Change', 'description': 'Sentences related to climate change mitigation or adaptation efforts.', 'possible_values': ['climate_change', 'renewable_energy', 'carbon_emissions']}, {'label_name': 'Sustainability Practices', 'description': "Sentences describing companies' sustainability practices, such as waste reduction or supply chain management.", 'possible_values': ['sustainable_sourcing', 'waste_reduction', 'energy_efficiency']}, {'label_name': 'Social Responsibility', 'description': "Sentences related to a company's social responsibility initiatives, including diversity and inclusion efforts.", 'possible_values': ['diversity_and_inclusion', 'human_rights', 'community_development']}]


A quick-and-dirty way to dynamically create a pydantic schema from the label definitions generated by the `LabelerAgent`.

In [22]:
from pydantic import BaseModel, Field
from typing import Dict, Any, List

def dynamic_label_schema(labels: List[Dict[str, Any]]) -> "type[BaseModel]":
    """Build a pydantic model class with one required field per label.

    Args:
        labels: Label definitions, each a mapping with the keys
            ``"label_name"`` (str), ``"description"`` (str) and
            ``"possible_values"`` (non-empty sequence of allowed values).

    Returns:
        A new ``BaseModel`` subclass named ``DynamicLabelModel``. Each label
        becomes a required field whose type is a ``Literal`` of its possible
        values and whose description is the label's description.

    Raises:
        ValueError: If a label has no possible values, or if two labels map
            to the same field name.
    """
    # Imported locally so the function does not depend on names that happen
    # to be imported by a different notebook cell.
    import keyword
    import re
    from typing import Literal

    from pydantic import create_model

    field_definitions = {}
    for label in labels:
        # Turn the free-text label name into a usable field name: every
        # non-word character (spaces, hyphens, slashes, ...) becomes "_".
        field_name = re.sub(r"\W", "_", label["label_name"].strip())
        # pydantic rejects field names with a leading underscore, and names
        # that are empty or start with a digit are not valid attribute names.
        if not field_name[:1].isalpha():
            field_name = f"label_{field_name}"
        if keyword.iskeyword(field_name):
            field_name = f"{field_name}_"
        if field_name in field_definitions:
            raise ValueError(
                f"Label name {label['label_name']!r} collides with another "
                f"label on field name {field_name!r}"
            )

        # The value type is unknown, so the field is restricted to exactly
        # the listed values via Literal (instead of a plain str).
        possible_values = tuple(label["possible_values"])
        if not possible_values:
            raise ValueError(
                f"Label {label['label_name']!r} has no possible values"
            )

        field_definitions[field_name] = (
            Literal[possible_values],
            Field(..., description=label["description"]),
        )

    return create_model("DynamicLabelModel", **field_definitions)

# Build the schema class from the generated label definitions and show its
# JSON schema as the cell output for a quick visual check.
label_schema = dynamic_label_schema(generated_labels)
label_schema.model_json_schema()

{'properties': {'Climate_Change': {'description': 'Sentences related to climate change mitigation or adaptation efforts.',
   'enum': ['climate_change', 'renewable_energy', 'carbon_emissions'],
   'title': 'Climate Change',
   'type': 'string'},
  'Sustainability_Practices': {'description': "Sentences describing companies' sustainability practices, such as waste reduction or supply chain management.",
   'enum': ['sustainable_sourcing', 'waste_reduction', 'energy_efficiency'],
   'title': 'Sustainability Practices',
   'type': 'string'},
  'Social_Responsibility': {'description': "Sentences related to a company's social responsibility initiatives, including diversity and inclusion efforts.",
   'enum': ['diversity_and_inclusion',
    'human_rights',
    'community_development'],
   'title': 'Social Responsibility',
   'type': 'string'}},
 'required': ['Climate_Change',
  'Sustainability_Practices',
  'Social_Responsibility'],
 'title': 'DynamicLabelModel',
 'type': 'object'}

In [24]:
import pandas as pd
from agents.dataset_generator_agent import DataGenerationAgent

# Settings for the synthetic-data run, gathered in one place.
# text_strategy accepts: sentences | paragraphs | documents
generation_settings = dict(
    model=model,
    topic=topic,
    label_schema=label_schema,
    num_samples=10,
    text_strategy="sentences",
)
data_generation_agent = DataGenerationAgent(**generation_settings)

# Run the agent and display the raw generated samples as the cell output.
generated = data_generation_agent()
generated

[{'text': 'Our company is committed to reducing greenhouse gas emissions by 50% within the next three years through a combination of energy-efficient upgrades and renewable energy investments.',
  'labels': [{'Climate_Change': 'climate_change',
    'Sustainability_Practices': 'energy_efficiency',
    'Social_Responsibility': 'diversity_and_inclusion'}]},
 {'text': 'We have implemented various initiatives to promote sustainable sourcing, including partnering with suppliers who share our commitment to environmentally responsible practices.',
  'labels': [{'Climate_Change': 'renewable_energy',
    'Sustainability_Practices': 'sustainable_sourcing',
    'Social_Responsibility': 'community_development'}]},
 {'text': 'By incorporating ESG-based data into their investment decisions, institutional investors can better align with long-term financial returns and contribute to a more sustainable future.',
  'labels': [{'Climate_Change': 'climate_change',
    'Sustainability_Practices': 'energy_ef

In [25]:
# Flatten the generated samples into one row per (sample, label assignment).
# Each row carries the sample's own fields (e.g. "text") plus the label
# values; the nested "labels" list itself is left out of the rows.
data = []
for sample in generated:
    shared_fields = {key: value for key, value in sample.items() if key != "labels"}
    # `or []` tolerates samples whose "labels" entry is missing or None.
    for label in sample.get("labels") or []:
        data.append({**shared_fields, **label})

# Derive the columns from every row (first-seen order) rather than from
# generated[0] only, so an empty result or a first sample without labels no
# longer raises IndexError, and keys that appear only in later rows are kept.
columns = list(dict.fromkeys(key for row in data for key in row))
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,text,Climate_Change,Sustainability_Practices,Social_Responsibility
0,Our company is committed to reducing greenhous...,climate_change,energy_efficiency,diversity_and_inclusion
1,We have implemented various initiatives to pro...,renewable_energy,sustainable_sourcing,community_development
2,By incorporating ESG-based data into their inv...,climate_change,energy_efficiency,human_rights
3,The use of ESG-based data for sentence-level c...,climate_change,waste_reduction,diversity_and_inclusion
4,As part of our corporate social responsibility...,climate_change,sustainable_sourcing,diversity_and_inclusion
5,Our company is committed to reducing carbon em...,carbon_emissions,energy_efficiency,community_development
6,The growing demand for ESG-based data has led ...,climate_change,waste_reduction,human_rights
7,By investing in ESG-based data analytics solut...,climate_change,sustainable_sourcing,diversity_and_inclusion
8,Our sustainability strategy focuses on promoti...,climate_change,energy_efficiency,community_development
9,The integration of ESG-based data into busines...,climate_change,waste_reduction,human_rights
