In [1]:
%load_ext autoreload
%autoreload 2

pseudo code for what we want (taken from <https://huggingface.co/docs/smolagents/v1.3.0/en/conceptual_guides/intro_agents>)

```
memory = [user_defined_task]
while llm_should_continue(memory): # this loop is the multi-step part
    action = llm_get_next_action(memory) # this is the tool-calling part
    observations = execute_action(action)
    memory += [action, observations]
```

In [2]:
from src.models import Models, get_models, load_model

model = load_model("llama_3_2_3b")

ModuleNotFoundError: No module named 'src'

## selenium web driver
we use this to control the agents' online presence :-)

In [11]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()

In [12]:
from abc import ABC, abstractmethod
from pydantic import BaseModel, Field, Json
from typing import List, Any


class Agent(ABC):
    @abstractmethod
    def system(self) -> str:
        pass

    @abstractmethod
    def prompt(self, *args, **kwargs) -> str:
        pass

    @abstractmethod
    def schema(self, *args, **kwargs) -> dict[str, Any]:
        pass

In [13]:
from src.generate import generate

# tools = [
#     "search_engine",
#     "dataset_generator"
# ]
# tools_str = ", ".join(f"{i+1}. {tool}" for i, tool in enumerate(tools))

# main_system = f"You are an intelligent assistant that helps the user in the necessary steps required to generate a dataset. Based on the users input, you should create tasks as described by the  {len(tools)} tools you have available: {tools_str}."
# print(main_system)


class SearchAgent(Agent):
    def system(self):
        return "You are tasked with reformulating a topic into a boolean AND/OR search string suitable for search engines. Maximum 3 operations. It MUST include 'dataset' or 'data sources'. Output in JSON according to the provided schema."
    
    def prompt(self, topic: str):
        return f"Generate a Google keyword search that will help the user find datasets on the topic of {topic}. Do not constrain the search to specific sites. Use your knowledge of the topic to generate a comprehensive search string."
    
    def schema(self):
        class SearchSchema(BaseModel):
            search: str
        return SearchSchema.model_json_schema()
    
    def __call__(self, topic: str):
        return generate(
            system_prompt=self.system(),
            prompt=self.prompt(topic),
            schema=self.schema(),
            model=model,
            num_ctx=1000,
            num_predict=200,
            temperature=0.0
        )["search"]


TOPIC = "ESG-based data for sentence-level classification"
search_agent = SearchAgent()

search_string = search_agent(topic=TOPIC)
print(search_string)

ESG data sentence classification AND (dataset OR data sources) AND (text OR natural language processing OR NLP) AND (machine learning OR deep learning) AND (environmental OR social OR governance) AND (dataset OR data sources OR data repository)


In [14]:
from duckduckgo_search import DDGS
from markitdown import MarkItDown

# ddgs = DDGS(proxy="tb", timeout=20) 
# this requires the TOR browser to be installed and running for a proxy
# should be used if you plan on querying a lot.
ddgs = DDGS(timeout=20) 
# prepare the markdown extractor
md = MarkItDown()

In [15]:
def get_urls(search_string: str, max_results: int = 3):
    res = ddgs.text(search_string, max_results=max_results, region="us-en", safesearch="on")
    hrefs = [r["href"] for r in res]
    return hrefs

urls = get_urls(search_string)
urls

['https://huggingface.co/nbroad/ESG-BERT',
 'https://www.sofi.com/learn/content/esg-score/',
 'https://open-research-europe.ec.europa.eu/articles/5-28']

In [23]:
driver.quit()

In [16]:
import os
import shutil

LANGUAGE = "English"
search_for_dataset_info_query = "From the provided markdown text:\n{MARKDOWN}\nFind direct links to datasets, descriptions of datasets, and other information that explains something about the data. The dataset should be in {LANGUAGE}."

def visit_site(url):
    driver.get(url)
    time.sleep(1)
    html = driver.page_source
    return html

TMP_FOLDER = "tmp-markdown"
shutil.rmtree(TMP_FOLDER, ignore_errors=True)
for uid, url in enumerate(urls):
    html = visit_site(url)

    TMP = "tmp.html"
    with open(TMP, "w") as f:
        f.write(html)
    result = md.convert(TMP)

    # to deal with large contexts, split by headlines
    hashed_results = result.text_content.split("\n#")
    hashed_results = [h.strip() for h in hashed_results if len(h.strip().split("\n")) > 1]

    MAX_RESULTS = 100
    if len(hashed_results) > MAX_RESULTS:
        hashed_results = hashed_results[:MAX_RESULTS]

    os.makedirs(TMP_FOLDER, exist_ok=True)

    # we store the markdown unique to each URL in the current session:
    for i, hashed_result in enumerate(hashed_results):
        with open(f"tmp-markdown/{uid}_{i}.md", "w") as f:
            f.write(hashed_result)


In [17]:
from pydantic.json_schema import GenerateJsonSchema, JsonSchemaValue
from typing import Literal
from tqdm.notebook import tqdm

LANGUAGE = "English"

class DatasetAgent(Agent):
    def system(self):
        return f"You are a helpful assistant that extracts information about datasets about {TOPIC}."

    def prompt(self, markdown_file_path):
        data = None
        with open(markdown_file_path) as f:
            data = f.read()
        if not data:
            return "No data provided."
        return f"From the provided information:\n{data}\nFind details about specific datasets (in {LANGUAGE} preferably), including information about its language, links, and labels. Determine if the dataset is relevant to the topic of {TOPIC} and is in the {LANGUAGE} language, specified by the 'relevant' field. Output in JSON."

    def schema(self):
        class DatasetLabel(BaseModel):
            label: str = Field(
                title="Label or category",
                description="The label of the dataset, such as the name of a category."
            )

        class DatasetSchema(BaseModel):
            name: str
            language: str
            labels: List[DatasetLabel]
            relevant: Literal["yes", "no"]

        class DatasetFinderSchema(BaseModel):
            datasets: List[DatasetSchema]

        return DatasetFinderSchema.model_json_schema()

    def __call__(self, markdown_file_path: str, output_key: str = "datasets"):
        output = generate(
            system_prompt=self.system(),
            prompt=self.prompt(markdown_file_path=markdown_file_path),
            schema=self.schema(),
            model=model,
            # num_ctx=1000,
            temperature=0.0,
        )
        if output:
            return output[output_key]
        return None

dataset_agent = DatasetAgent()

filtered_datasets = []

outer_iterator = tqdm(urls, desc="Processing URLs", leave=False)
for uid, url in enumerate(outer_iterator):
    data_path = os.path.join(TMP_FOLDER, f"{uid}_{i}.md")
    data_paths = sorted(os.listdir(TMP_FOLDER))
    
    iterator = tqdm(data_paths, desc=f"Processing URL {uid} ({url})")
    for data_path in iterator:
        md_file = os.path.join(TMP_FOLDER, data_path)
        datasets = dataset_agent(markdown_file_path=md_file)
        if not datasets:
            continue
        for dataset in datasets:
            if dataset["relevant"] == "yes":
                filtered_datasets.append(dataset)

Processing URLs:   0%|          | 0/3 [00:00<?, ?it/s]

Processing URL 0 (https://huggingface.co/nbroad/ESG-BERT):   0%|          | 0/34 [00:00<?, ?it/s]

Failed to parse response: {
  "datasets": []
}
  


Processing URL 1 (https://www.sofi.com/learn/content/esg-score/):   0%|          | 0/34 [00:00<?, ?it/s]

Failed to parse response: {
  "datasets": []
}
  


Processing URL 2 (https://open-research-europe.ec.europa.eu/articles/5-28):   0%|          | 0/34 [00:00<?, ?i…

Failed to parse response: {
  "datasets": []
}
  


In [24]:
filtered_labels = set()
for dataset in filtered_datasets:
    for label in dataset["labels"]:
        filtered_labels.add(label["label"].lower())

filtered_labels

{'contrast',
 'entailment',
 'esg rating: high',
 'esg rating: low',
 'esg rating: medium',
 'esg score: high',
 'esg score: low',
 'esg score: medium',
 'esg-related news',
 'high csr',
 'low csr',
 'masked language modelling',
 'negative',
 'negative esg',
 'negative esg sentiment',
 'negative esg statement',
 'neutral',
 'neutral esg sentiment',
 'neutral esg statement',
 'next sentence prediction',
 'non-esg related news',
 'non-sustainable',
 'positive',
 'positive esg',
 'positive esg sentiment',
 'positive esg statement',
 'relevant',
 'strong csr performance',
 'sustainable',
 'text classification',
 'very negative',
 'very positive',
 'weak csr performance'}

In [25]:
from typing import Literal

class LabelerAgent(Agent):
    def system(self):
        return f"You are an assistant that labels datasets based on the topic of {TOPIC}."

    def prompt(self, labels: List[str], num_labels: int):
        labels_str = ", ".join(labels)
        return f"From the provided list of previously found labels from datasets on {TOPIC}: {labels_str}\nCreate a list of {num_labels} label categories. Determine the name and type for each label. Output in JSON."

    def schema(self):
        class DatasetLabel(BaseModel):
            label_name: str
            description: str
            possible_values: List[str]

        class LabelerSchema(BaseModel):
            labels: List[DatasetLabel]

        return LabelerSchema.model_json_schema()
    
    def __call__(self, labels: List[str], num_labels: int, output_key: str = "labels"):
        output = generate(
            system_prompt=self.system(),
            prompt=self.prompt(labels, num_labels),
            schema=self.schema(),
            model=model,
            num_ctx=4000,
            temperature=0.2
        )
        if output:
            return output[output_key]

        
labeler_agent = LabelerAgent()
labels = list(filtered_labels)
labels = labeler_agent(labels=labels, num_labels=4)
labels

[{'label_name': 'ESG Sentiment',
  'description': 'Sentiment classification of ESG-related text',
  'possible_values': ['positive', 'negative', 'neutral']},
 {'label_name': 'CSR Performance',
  'description': 'Classification of CSR performance',
  'possible_values': ['strong', 'weak', 'low', 'high']},
 {'label_name': 'ESG Rating',
  'description': 'Classification of ESG rating',
  'possible_values': ['low', 'medium', 'high']},
 {'label_name': 'ESG Topic',
  'description': 'Classification of ESG topic',
  'possible_values': ['sustainable', 'non-sustainable']}]

In [26]:
def format_labels(labels: List[dict[str, str]]) -> str:
    sb = []
    for l in labels:
        sb.append(f"{l['label_name']}: {l['description']}")
        sb.append("Possible values:")
        for v in l["possible_values"]:
            sb.append(f"  - {v}")
        sb.append("\n")
    return "\n".join(sb)
# print(format_labels(labels))

super hacky way to dynamically create pydantic schemas from the generated data from the `LabelerAgent`

In [27]:
annotations = {}
field_definitions = {}

for label in labels:
    # workaround for python identifiers
    field_name = label['label_name'].replace(" ", "_")
    # we don't know the type, so we use literals for the possible values (instead of str...)
    field_type = Literal[tuple(label['possible_values'])]   
    annotations[field_name] = field_type
    field_definitions[field_name] = Field(..., description=label['description']) 

# Dynamically create the Pydantic model
DynamicLabelModel = type(
    "DynamicLabelModel", 
    (BaseModel,), 
    {"__annotations__": annotations, **field_definitions}
)
print(DynamicLabelModel.model_json_schema())

{'properties': {'ESG_Sentiment': {'description': 'Sentiment classification of ESG-related text', 'enum': ['positive', 'negative', 'neutral'], 'title': 'Esg Sentiment', 'type': 'string'}, 'CSR_Performance': {'description': 'Classification of CSR performance', 'enum': ['strong', 'weak', 'low', 'high'], 'title': 'Csr Performance', 'type': 'string'}, 'ESG_Rating': {'description': 'Classification of ESG rating', 'enum': ['low', 'medium', 'high'], 'title': 'Esg Rating', 'type': 'string'}, 'ESG_Topic': {'description': 'Classification of ESG topic', 'enum': ['sustainable', 'non-sustainable'], 'title': 'Esg Topic', 'type': 'string'}}, 'required': ['ESG_Sentiment', 'CSR_Performance', 'ESG_Rating', 'ESG_Topic'], 'title': 'DynamicLabelModel', 'type': 'object'}


In [61]:
import pandas as pd

class DataGenerationAgent(Agent):
    def system(self):
        pass

    def prompt(
        self,
        labels: List[dict],
        num_samples: int,
        text_strategy: str,
    ):
        labels_str = format_labels(labels)
        return f"Generate {num_samples} unique sample {text_strategy} as if retrieved from realistic sources, related to the of {TOPIC}. Use the labeling definitions below: {labels_str}. Think creatively, and avoid similar language. Output in JSON."

    def schema(self):
        class DataSample(BaseModel):
            text: str
            # text: str = Field(
            #     title="Text",
            #     description="The text of the data sample on the topic of {TOPIC}.",
            #     # min_length=200,
            #     # max_length=500,
            # )
            labels: List[DynamicLabelModel]

        class DataGenerationSchema(BaseModel):
            samples: List[DataSample]

        return DataGenerationSchema.model_json_schema()

    def __call__(
        self,
        labels: List[dict],
        num_samples: int,
        text_strategy: str,
        output_key: str = "samples",
    ):
        output = generate(
            system_prompt=self.system(),
            prompt=self.prompt(
                labels=labels, num_samples=num_samples, text_strategy=text_strategy
            ),
            schema=self.schema(),
            model=model,
            num_ctx=4000,
            num_predict=3000,
            temperature=1.0,  # we keep a high temp for more "creative" text generation
        )
        if output:
            return output[output_key]


data_generation_agent = DataGenerationAgent()

text_strategy = "sentences"  # sentences | paragraphs | documents
num_samples = 10
batch_size = 10

generated_data = []
for i in range(0, num_samples, batch_size):
    batch = data_generation_agent(
        labels=labels, num_samples=batch_size, text_strategy=text_strategy
    )
    generated_data.extend(batch)

In [62]:
generated_data

[{'text': "The company's new sustainable energy project will create hundreds of jobs in the community, promoting a positive ESG sentiment.",
  'labels': [{'ESG_Sentiment': 'positive',
    'CSR_Performance': 'high',
    'ESG_Rating': 'high',
    'ESG_Topic': 'sustainable'}]},
 {'text': "The city council voted to pass a weak CSR policy, sparking concerns about the city's ESG rating.",
  'labels': [{'ESG_Sentiment': 'negative',
    'CSR_Performance': 'weak',
    'ESG_Rating': 'medium',
    'ESG_Topic': 'non-sustainable'}]},
 {'text': "The company's annual report highlights its strong commitment to sustainable practices, resulting in a high ESG rating.",
  'labels': [{'ESG_Sentiment': 'positive',
    'CSR_Performance': 'high',
    'ESG_Rating': 'high',
    'ESG_Topic': 'sustainable'}]},
 {'text': 'A recent study found that the majority of consumers prioritize non-sustainable business practices, leading to a negative ESG sentiment.',
  'labels': [{'ESG_Sentiment': 'negative',
    'CSR_Perfo

In [63]:
columns = list(generated_data[0].keys()) + list(generated_data[0]["labels"][0].keys())
columns.remove("labels")
data = []
for sample in generated_data:
    # print(sample)
    for label in sample["labels"]:
        data.append({**sample, **label})
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,text,ESG_Sentiment,CSR_Performance,ESG_Rating,ESG_Topic
0,The company's new sustainable energy project w...,positive,high,high,sustainable
1,The city council voted to pass a weak CSR poli...,negative,weak,medium,non-sustainable
2,The company's annual report highlights its str...,positive,high,high,sustainable
3,A recent study found that the majority of cons...,negative,weak,low,non-sustainable
4,The company's CSR efforts have not been effect...,negative,low,low,non-sustainable
5,The new sustainable packaging design will redu...,positive,high,high,sustainable
6,A survey of customers found that they prioriti...,positive,strong,medium,sustainable
7,The company's ESG rating has been downgraded d...,negative,low,low,non-sustainable
8,The city's new sustainability initiative aims ...,positive,high,high,sustainable
9,The company's CSR efforts have been effective ...,positive,strong,high,sustainable
