In [36]:
%load_ext autoreload
%autoreload 2

Pseudocode for the multi-step agent loop we want to build (taken from <https://huggingface.co/docs/smolagents/v1.3.0/en/conceptual_guides/intro_agents>):

```
memory = [user_defined_task]
while llm_should_continue(memory): # this loop is the multi-step part
    action = llm_get_next_action(memory) # this is the tool-calling part
    observations = execute_action(action)
    memory += [action, observations]
```

### experimental!!!
Stop the Ollama service and restart it as follows (this assumes you have enough memory for three models; we are using lightweight ones):
```bash
OLLAMA_MAX_LOADED_MODELS=3 OLLAMA_NUM_PARALLEL=3 ollama serve
```

## model loading

In [37]:
from models import Models, show_models, load_model

# Show the model names offered by the project's `models` module (see the cell output).
show_models()
# `model` is the notebook-wide handle that the agents below pass to generate().
model = load_model("llama_3_1_8b")
# Alternative model from the list above:
# model = load_model("llama_3_2_3b")

['llama_3_2_1b', 'llama_3_2_3b', 'llama_3_1_8b', 'deepseek_qwen_32b']


## selenium web driver
we use this to control the agents' online presence :-)

In [38]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

# Shared browser handle; visit_site() calls driver.get() and reads driver.page_source.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the launch
# was aborted in the recorded run — re-run this cell before calling visit_site().
driver = webdriver.Chrome()

KeyboardInterrupt: 

In [12]:
from abc import ABC, abstractmethod
from pydantic import BaseModel, Field, Json
from typing import List, Any


class Agent(ABC):
    """Abstract interface shared by the notebook's LLM agents.

    A concrete agent supplies three pieces: a system prompt, a user prompt,
    and a JSON schema for the structured reply.
    """

    @abstractmethod
    def system(self) -> str:
        """Return the system prompt for this agent."""

    @abstractmethod
    def prompt(self, *args: Any, **kwargs: Any) -> str:
        """Return the user prompt for this agent.

        Concrete agents take their own inputs here (a topic, a file path, ...),
        so the abstract signature accepts arbitrary arguments instead of none.
        """

    @abstractmethod
    def schema(self) -> dict[str, Any]:
        """Return the JSON schema that the model's reply must follow."""

In [24]:
from generate import generate

# tools = [
#     "search_engine",
#     "dataset_generator"
# ]
# tools_str = ", ".join(f"{i+1}. {tool}" for i, tool in enumerate(tools))

# main_system = f"You are an intelligent assistant that helps the user in the necessary steps required to generate a dataset. Based on the users input, you should create tasks as described by the  {len(tools)} tools you have available: {tools_str}."
# print(main_system)


class SearchAgent(Agent):
    """Agent that rewrites a topic as a boolean search-engine query."""

    def system(self):
        """Return the system prompt describing the query-reformulation task."""
        instructions = "You are tasked with reformulating a topic into a boolean AND/OR search string suitable for search engines. Maximum 3 operations. It MUST include 'dataset' or 'data sources'. Output in JSON according to the provided schema."
        return instructions

    def prompt(self, topic: str):
        """Return the user prompt asking for a dataset-oriented search on `topic`."""
        request = f"Generate a Google keyword search that will help the user find datasets on the topic of {topic}. Do not constrain the search to specific sites. Use your knowledge of the topic to generate a comprehensive search string."
        return request

    def schema(self):
        """Return the JSON schema of the reply: an object with one `search` string."""

        class SearchSchema(BaseModel):
            search: str

        json_schema = SearchSchema.model_json_schema()
        return json_schema

    def __call__(self, topic: str):
        """Run generate() for `topic` and return the `search` entry of its result."""
        system_text = self.system()
        user_text = self.prompt(topic)
        reply_schema = self.schema()
        reply = generate(
            system_prompt=system_text,
            prompt=user_text,
            schema=reply_schema,
            model=model,
            num_ctx=1000,
            temperature=0.5,
        )
        return reply["search"]


# Topic used by the search prompt here; DatasetAgent.prompt reads TOPIC as well.
TOPIC = "hateful speech detection"
# NOTE(review): main_prompt is not referenced anywhere else in the visible notebook.
main_prompt = f"I want to create a dataset on the topic of {TOPIC}."
search_agent = SearchAgent()

# Ask the model for a search string on TOPIC and print it.
search_string = search_agent(topic=TOPIC)
print(search_string)

hateful speech detection dataset OR data sources OR hate speech detection OR online hate speech OR social media hate speech OR machine learning hate speech OR natural language processing hate speech OR deep learning hate speech OR text classification hate speech


In [25]:
from duckduckgo_search import DDGS
from markitdown import MarkItDown

# Alternative client that goes through a proxy; use it if you plan on querying a lot.
# It requires the Tor Browser to be installed and running.
# ddgs = DDGS(proxy="tb", timeout=20) 
# Search client used by get_urls().
ddgs = DDGS(timeout=20) 
# Markdown extractor; its convert() is applied to the saved HTML page later on.
md = MarkItDown()

In [None]:
def get_urls(search_string: str, max_results: int = 1):
    """Query the shared `ddgs` client and return the `href` of every hit.

    Args:
        search_string: Query text handed to `ddgs.text`.
        max_results: Forwarded to `ddgs.text` as `max_results` (default 1).

    Returns:
        A list with the `href` value of each result, in result order.
    """
    hits = ddgs.text(
        search_string,
        max_results=max_results,
        region="us-en",
        safesearch="on",
    )
    links = []
    for hit in hits:
        links.append(hit["href"])
    return links

# Run the search with the generated query (default max_results=1);
# the bare `urls` expression displays the resulting list as the cell output.
urls = get_urls(search_string)
urls

['https://github.com/aymeam/Datasets-for-Hate-Speech-Detection']

In [29]:
import os
import shutil

# Preferred dataset language; DatasetAgent.prompt reads LANGUAGE.
LANGUAGE = "English"
# Plain string (not an f-string): {MARKDOWN} and {LANGUAGE} stay literal placeholders.
# NOTE(review): this template is not referenced anywhere else in the visible notebook.
search_for_dataset_info_query = "From the provided markdown text:\n{MARKDOWN}\nFind direct links to datasets, descriptions of datasets, and other information that explains something about the data. The dataset should be in {LANGUAGE}."

def visit_site(url, wait_seconds=1):
    """Open `url` in the shared Selenium `driver` and return the page source.

    Args:
        url: Address to navigate to.
        wait_seconds: Pause between navigation and reading the source,
            presumably to let the page finish rendering. Defaults to 1,
            the value that used to be hard-coded.

    Returns:
        The value of `driver.page_source` after the pause.
    """
    driver.get(url)
    time.sleep(wait_seconds)
    return driver.page_source

# Scratch locations and limits, defined once (not per iteration) so later cells
# can rely on TMP_FOLDER.
TMP = "tmp.html"
TMP_FOLDER = "tmp-markdown"
# Upper bound on the number of markdown sections kept per page.
MAX_RESULTS = 10

# Reset the scratch folder once per run. Doing this inside the loop would delete
# the fragments written for earlier URLs.
shutil.rmtree(TMP_FOLDER, ignore_errors=True)
os.makedirs(TMP_FOLDER, exist_ok=True)

for uid, url in enumerate(urls):
    # Fetch the page for *this* URL (previously urls[0] was fetched every time).
    html = visit_site(url)

    # MarkItDown converts from a file path, so round-trip the HTML through disk.
    with open(TMP, "w") as f:
        f.write(html)
    result = md.convert(TMP)

    # to deal with large contexts, split by headlines
    hashed_results = result.text_content.split("\n#")
    # drop sections that consist of a single line only
    hashed_results = [h.strip() for h in hashed_results if len(h.strip().split("\n")) > 1]
    hashed_results = hashed_results[:MAX_RESULTS]

    # we store the markdown unique to each URL in the current session:
    # files are named "<url index>_<section index>.md"
    for i, hashed_result in enumerate(hashed_results):
        with open(os.path.join(TMP_FOLDER, f"{uid}_{i}.md"), "w") as f:
            f.write(hashed_result)

In [35]:
from pydantic.json_schema import GenerateJsonSchema, JsonSchemaValue
from typing import Literal

# Same value as the LANGUAGE assigned in the site-visiting cell; read by DatasetAgent.prompt.
LANGUAGE = "English"

class DatasetAgent(Agent):
    """Agent that extracts dataset descriptions from one markdown fragment."""

    def system(self):
        """Return the system prompt for the dataset-retrieval assistant."""
        role = "You are a helpful assistant that retrieves relevant information about datasets in a specific topic."
        return role

    def prompt(self, markdown_file_path):
        """Build the user prompt from the text stored at `markdown_file_path`.

        Returns the fixed string "No data provided." when the file is empty;
        otherwise the file content is embedded together with the module-level
        LANGUAGE and TOPIC values.
        """
        with open(markdown_file_path) as f:
            data = f.read()
        if data:
            return f"From the provided information:\n{data}\nFind details about specific datasets (in {LANGUAGE} preferably), including information about its language, links, and labels. Determine if the dataset is relevant to the topic of {TOPIC} and is in the {LANGUAGE} language, specified by the 'relevant' field. Output in JSON."
        return "No data provided."

    def schema(self):
        """Return the JSON schema of the reply: an object holding a `datasets` list."""

        # One label of a dataset.
        class DatasetLabel(BaseModel):
            name: str
            num_labels: int
            description: str

        # One dataset entry; `relevant` is restricted to "yes" or "no".
        class DatasetSchema(BaseModel):
            name: str
            language: str
            url: str
            labels: List[DatasetLabel]
            relevant: Literal["yes", "no"]

        # Top-level reply object.
        class DatasetFinderSchema(BaseModel):
            datasets: List[DatasetSchema]

        json_schema = DatasetFinderSchema.model_json_schema()
        return json_schema

    def __call__(self, markdown_file_path: str):
        """Run generate() on one markdown file and return the `datasets` entry of its result."""
        system_text = self.system()
        user_text = self.prompt(markdown_file_path=markdown_file_path)
        reply_schema = self.schema()
        reply = generate(
            system_prompt=system_text,
            prompt=user_text,
            schema=reply_schema,
            model=model,
            num_ctx=4096,
            temperature=0.0,
        )
        return reply["datasets"]

dataset_agent = DatasetAgent()


def _fragment_index(file_name: str) -> int:
    """Return the section index from a '<uid>_<index>.md' file name."""
    stem = os.path.splitext(file_name)[0]
    return int(stem.split("_", 1)[1])


for uid, url in enumerate(urls):
    # Only look at the fragments written for this URL; listing the whole folder
    # would re-process every URL's fragments on each iteration.
    prefix = f"{uid}_"
    data_paths = sorted(
        (
            name
            for name in os.listdir(TMP_FOLDER)
            if name.startswith(prefix) and name.endswith(".md")
        ),
        # Numeric order, so "0_10.md" comes after "0_2.md" (a plain string sort
        # would put it before).
        key=_fragment_index,
    )

    for data_path in data_paths:
        print(data_path)
        md_file = os.path.join(TMP_FOLDER, data_path)
        datasets = dataset_agent(markdown_file_path=md_file)
        print(datasets)
        print("___")

0_0.md
[]
___
0_1.md
[{'name': 'Hateful Speech Detection Dataset', 'language': 'English', 'url': 'https://github.com/your-username/hateful-speech-detection-dataset', 'labels': [{'name': 'hateful', 'num_labels': 1000, 'description': 'Hateful speech labels'}, {'name': 'not_hateful', 'num_labels': 1000, 'description': 'Not hateful speech labels'}], 'relevant': 'yes'}, {'name': 'English Hate Speech Dataset', 'language': 'English', 'url': 'https://github.com/your-username/english-hate-speech-dataset', 'labels': [{'name': 'hate', 'num_labels': 500, 'description': 'Hate speech labels'}, {'name': 'not_hate', 'num_labels': 500, 'description': 'Not hate speech labels'}], 'relevant': 'yes'}]
___
0_2.md
[{'name': 'Hate Speech Detection Dataset', 'language': 'English', 'url': 'https://example.com/hate-speech-dataset', 'labels': [{'name': 'Hate Speech', 'num_labels': 3, 'description': 'Text that is intended to incite hatred or violence against a particular group or individual.'}, {'name': 'Not Hate 

KeyboardInterrupt: 