In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from models import load_model
model = load_model("llama_3_2_3b")

In [5]:
from agents.search_agent import SearchStringAgent

# ESG is Environmental, Social, and Governance, terms for a set of criteria used to evaluate a company's operations, typically for investment purposes

topic = "ESG-based data for sentence-level classification"
search_string = SearchStringAgent(model, topic)()
print(search_string)

(esg OR environmental social governance) AND (data sources OR dataset)


In [6]:
from util import WebSearch
from util import split_markdown
import os
from markitdown import MarkItDown

md = MarkItDown()
search = WebSearch()

os.makedirs("tmp", exist_ok=True)
urls = search.get_urls(search_string)
print("fetched urls:", urls)

fetched urls: ['https://www.esganalytics.io/insights/top-10-esg-data-providers', 'https://sustainablefinancedaily.com/sustainable-finance-insights/esg-data-and-analytics/10-free-esg-data-sources-and-scores/', 'https://www.pwc.com/us/en/services/esg/library/esg-data-collection-reporting.html']


In [7]:
import requests
import shutil

# https://phrasefix.com/tools/random-user-agent-generator/
AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:97.0) Gecko/20100101 Firefox/97.0"
headers = {"User-Agent": AGENT}

shutil.rmtree("tmp", ignore_errors=True)
os.makedirs("tmp", exist_ok=True)

for uid, url in enumerate(urls):
    out_path = os.path.join("tmp", f"{uid}.pdf")
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an error for bad status codes
        with open(out_path, 'wb') as f:
            f.write(response.content)
    except requests.RequestException as e:
        print(f"Failed to download {url}: {e}")


In [8]:
split_markdown(
    pdf_folder="tmp",
    out_folder="tmp-markdown",
    MAX_RESULTS=100,
)

Extracting markdown:   0%|          | 0/3 [00:00<?, ?it/s]

Error converting 2.pdf


In [9]:
from typing import List
from tqdm.notebook import tqdm
from agents.dataset_agent import DatasetAgent

language = "English"
dataset_agent = DatasetAgent(model, topic, language)

filtered_datasets = []

outer_tqdm = tqdm(urls, desc="Processing...", leave=False)
for pdf_id, pdf in enumerate(outer_tqdm):
    data_paths = sorted(os.listdir("tmp-markdown"))
    for data_path in tqdm(data_paths, desc=f"Processing document {pdf_id} ({pdf})"):
        md_file = os.path.join("tmp-markdown", data_path)
        datasets = dataset_agent(markdown_file_path=md_file)
        if not datasets:
            continue
        for dataset in datasets:
            if dataset["relevant"] == "yes":
                filtered_datasets.append(dataset)

Processing...:   0%|          | 0/3 [00:00<?, ?it/s]

Processing document 0 (https://www.esganalytics.io/insights/top-10-esg-data-providers):   0%|          | 0/57 …

Failed to parse response: {
  "datasets": [
    {
      "name": "",
      "language": "",
      "labels": [],
      "relevant": "no"
    },
    {
      "name": "",
      "language": "",
      "labels": [],
      "relevant": "yes"
    }
  ]
}
  


Processing document 1 (https://sustainablefinancedaily.com/sustainable-finance-insights/esg-data-and-analytics…

Failed to parse response: {
  "datasets": [
    {
      "name": "",
      "language": "",
      "labels": [],
      "relevant": "no"
    },
    {
      "name": "",
      "language": "",
      "labels": [],
      "relevant": "yes"
    }
  ]
}
  


Processing document 2 (https://www.pwc.com/us/en/services/esg/library/esg-data-collection-reporting.html):   0…

Failed to parse response: {
  "datasets": [
    {
      "name": "",
      "language": "",
      "labels": [],
      "relevant": "no"
    },
    {
      "name": "",
      "language": "",
      "labels": [],
      "relevant": "yes"
    }
  ]
}
  


In [10]:
from typing import Literal
from agents.labeler_agent import LabelerAgent

generated_labels = LabelerAgent(
    model,
    topic=topic,
    datasets=filtered_datasets,
    num_labels=3,
)()
print(generated_labels)

[{'label_name': 'ESG Topic', 'description': 'Topic related to ESG (Environmental, Social, Governance) data.', 'possible_values': ['Relevant ESG News Article', 'Governance Practices']}, {'label_name': 'Sentiment Category', 'description': 'Category indicating the sentiment of an article towards ESG-related topics.', 'possible_values': ['Positive ESG Sentiment', 'Negative ESG Sentiment']}, {'label_name': 'Contextual Relationship', 'description': 'Relationship between a sentence and other sentences in terms of relevance to ESG data or contrast with non-ESG content.', 'possible_values': ['Relevant/In Relevant Non-EGS News Article', 'Entailment (positive)', 'Contrast']}]


### Dynamic class-schemas
super hacky way to dynamically create pydantic schemas from the generated data from the `LabelerAgent`

In [11]:
from pydantic import BaseModel, Field
from typing import Dict, Any, List

def dynamic_label_schema(labels: List[Dict[str, Any]]):
    annotations = {}
    field_definitions = {}

    for label in labels:
        # workaround for python identifiers
        field_name = label["label_name"].replace(" ", "_")
        # we don't know the type, so we use literals for the possible values (instead of str...)
        field_type = Literal[tuple(label["possible_values"])]
        annotations[field_name] = field_type
        field_definitions[field_name] = Field(..., description=label["description"])

    DynamicLabelModel: BaseModel = type(
        "DynamicLabelModel",
        (BaseModel,),
        {"__annotations__": annotations, **field_definitions},
    )
    return DynamicLabelModel

label_schema = dynamic_label_schema(generated_labels)
label_schema.model_json_schema()

{'properties': {'ESG_Topic': {'description': 'Topic related to ESG (Environmental, Social, Governance) data.',
   'enum': ['Relevant ESG News Article', 'Governance Practices'],
   'title': 'Esg Topic',
   'type': 'string'},
  'Sentiment_Category': {'description': 'Category indicating the sentiment of an article towards ESG-related topics.',
   'enum': ['Positive ESG Sentiment', 'Negative ESG Sentiment'],
   'title': 'Sentiment Category',
   'type': 'string'},
  'Contextual_Relationship': {'description': 'Relationship between a sentence and other sentences in terms of relevance to ESG data or contrast with non-ESG content.',
   'enum': ['Relevant/In Relevant Non-EGS News Article',
    'Entailment (positive)',
    'Contrast'],
   'title': 'Contextual Relationship',
   'type': 'string'}},
 'required': ['ESG_Topic', 'Sentiment_Category', 'Contextual_Relationship'],
 'title': 'DynamicLabelModel',
 'type': 'object'}

In [12]:
import pandas as pd
from agents.dataset_generator_agent import DataGenerationAgent

data_generation_agent = DataGenerationAgent(
    model=model,
    topic=topic,
    label_schema=label_schema,
    num_samples=1,
    text_strategy="sentences", # sentences | paragraphs | documents
)

generated = data_generation_agent()
generated

[{'text': 'While the Sönchez Foundation prioritizes climate action for its own sustainability initiatives, it has faced criticism from environmental groups over \ud83d\udd0a recent land acquisition that may have implications on local biodiversity.',
  'labels': [{'ESG_Topic': 'Relevant ESG News Article',
    'Sentiment_Category': 'Negative ESG Sentiment',
    'Contextual_Relationship': 'Contrast'}]}]

In [13]:
columns = list(generated[0].keys()) + list(generated[0]["labels"][0].keys())
columns.remove("labels")
data = []
for sample in generated:
    # print(sample)
    for label in sample["labels"]:
        data.append({**sample, **label})
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,text,ESG_Topic,Sentiment_Category,Contextual_Relationship
0,While the Sönchez Foundation prioritizes clima...,Relevant ESG News Article,Negative ESG Sentiment,Contrast
