In [1]:
%load_ext autoreload
%autoreload 2

pseudo code for what we want (taken from <https://huggingface.co/docs/smolagents/v1.3.0/en/conceptual_guides/intro_agents>)

```
memory = [user_defined_task]
while llm_should_continue(memory): # this loop is the multi-step part
    action = llm_get_next_action(memory) # this is the tool-calling part
    observations = execute_action(action)
    memory += [action, observations]
```

### experimental!!!
stop the ollama service and start it as follows (assume you have memory for 3 models, we're using lightweight ones!)
```bash
OLLAMA_MAX_LOADED_MODELS=3 OLLAMA_NUM_PARALLEL=3 ollama serve
```

## model loading

In [2]:
# select a model
from util import load_model, show_models
print(show_models())
model = load_model(model_id="llama-3.1-8b")
model

llama-3.2-1b
llama-3.2-3b
llama-3.1-8b
deepseek-qwen-32b
gemma-2-9b


'hf.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:IQ4_NL'

## selenium web driver
we use this to control the agents' online presence :-)

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()

WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://chromedriver.chromium.org/home


In [4]:
from abc import ABC, abstractmethod
from pydantic import BaseModel, Field, Json
from typing import List, Any


class Agent(ABC):
    @abstractmethod
    def system(self) -> str:
        pass

    @abstractmethod
    def prompt(self) -> str:
        pass

    @abstractmethod
    def schema(self) -> dict[str, Any]:
        pass

In [5]:
from generate import generate

# tools = [
#     "search_engine",
#     "dataset_generator"
# ]
# tools_str = ", ".join(f"{i+1}. {tool}" for i, tool in enumerate(tools))

# main_system = f"You are an intelligent assistant that helps the user in the necessary steps required to generate a dataset. Based on the users input, you should create tasks as described by the  {len(tools)} tools you have available: {tools_str}."
# print(main_system)


class SearchAgent(Agent):
    def system(self):
        return "You are tasked with reformulating a topic into a search string. You will aid the user to find more information about the datasets and resources available for said topic. Answer in JSON."
    
    def prompt(self, topic: str):
        return f"Generate a Google keyword search that will help the user find datasets on the topic of {topic}. Do not constrain the search to specific sites. Use your knowledge of the topic to generate a comprehensive search string."
    
    def schema(self):
        class SearchSchema(BaseModel):
            search: str
        return SearchSchema.model_json_schema()
    
    def __call__(self, topic: str):
        return generate(
            system_prompt=self.system(),
            prompt=self.prompt(topic),
            schema=self.schema(),
            model=model,
            num_ctx=1000,
            temperature=0.5
        )["search"]


TOPIC = "hateful speech detection"
main_prompt = f"I want to create a dataset on the topic of {TOPIC}."
search_agent = SearchAgent()

search_string = search_agent(topic=TOPIC)
print(search_string)

hate speech detection datasets OR hateful speech datasets OR toxic language datasets OR abusive language datasets OR offensive language datasets OR hate speech classification datasets OR hate speech detection models OR hate speech datasets for machine learning OR datasets for hate speech detection OR resources for hate speech detection research


In [6]:
from duckduckgo_search import DDGS
# ddgs = DDGS(proxy="tb", timeout=20) 
# this requires the TOR browser to be installed and running for a proxy
# should be used if you plan on querying a lot.
ddgs = DDGS(timeout=20) 

In [7]:
def get_urls(search_string: str, max_results: int = 1):
    res = ddgs.text(search_string, max_results=max_results, region="us-en", safesearch="on")
    hrefs = [r["href"] for r in res]
    return hrefs

urls = get_urls(search_string)
urls

['https://hatespeechdata.com/']

In [8]:
from markitdown import MarkItDown

md = MarkItDown()

In [16]:
import os
import shutil

LANGUAGE = "English"
search_for_dataset_info_query = "From the provided markdown text:\n{MARKDOWN}\nFind direct links to datasets, descriptions of datasets, and other information that explains something about the data. The dataset should be in {LANGUAGE}."

def visit_site(url):
    driver.get(url)
    time.sleep(1)
    html = driver.page_source
    return html

html = visit_site(urls[0])

TMP = "tmp.html"
with open(TMP, "w") as f:
    f.write(html)
result = md.convert(TMP)

# to deal with large contexts, split by headlines
hashed_results = result.text_content.split("\n#")
hashed_results = [h.strip() for h in hashed_results if len(h.strip().split("\n")) > 1]

TMP_FOLDER = "tmp-markdown"
shutil.rmtree(TMP_FOLDER, ignore_errors=True)
os.makedirs(TMP_FOLDER, exist_ok=True)

for i, hashed_result in enumerate(hashed_results):
    with open(f"tmp-markdown/{i}.md", "w") as f:
        f.write(hashed_result)

In [21]:
from pydantic.json_schema import GenerateJsonSchema, JsonSchemaValue
from typing import Literal

LANGUAGE = "English"


class DatasetAgent(Agent):
    def system(self):
        return "You are a helpful assistant that retrieves relevant information about datasets in a specific topic."

    def prompt(self, markdown_file_path):
        data = None
        with open(markdown_file_path) as f:
            data = f.read()
        if not data:
            return "No data provided."
        return f"From the provided information:\n{data}\nFind details about specific datasets (in {LANGUAGE} preferably), including information about its language, links, and labels. Determine if the dataset is relevant to the topic of {TOPIC} and is in the {LANGUAGE} language, specified by the 'relevant' field. Output in JSON."

    def schema(self):
        class DatasetLabel(BaseModel):
            name: str
            num_labels: int
            description: str

        class DatasetSchema(BaseModel):
            name: str
            language: str
            url: str
            labels: List[DatasetLabel]
            relevant: Literal["yes", "no"]

        class DatasetFinderSchema(BaseModel):
            datasets: List[DatasetSchema]

        return DatasetFinderSchema.model_json_schema()

    def __call__(self, markdown_file_path: str):
        return generate(
            system_prompt=self.system(),
            prompt=self.prompt(markdown_file_path=markdown_file_path),
            schema=self.schema(),
            model=model,
            num_ctx=4096,
            temperature=0.1,
        )["datasets"]

dataset_agent = DatasetAgent()

data_paths = [f"{TMP_FOLDER}/{i}.md" for i in range(len(hashed_results))]
for data_path in data_paths:
    print(data_path)
    datasets = dataset_agent(markdown_file_path=data_path)
    print(datasets)
    print("___")

tmp-markdown/0.md
[{'name': 'Hate Speech Dataset Catalogue', 'language': 'English', 'url': 'https://hatespeechdata.github.io/', 'labels': [], 'relevant': 'yes'}, {'name': 'IMDB Dataset', 'language': 'English', 'url': 'https://www.imdb.com/', 'labels': [], 'relevant': 'yes'}, {'name': 'Twitter Hateful Sentiment Dataset', 'language': 'English', 'url': 'https://www.kaggle.com/datasnaek/twitter-sentiment', 'labels': [], 'relevant': 'yes'}, {'name': 'OffensEval Dataset', 'language': 'English', 'url': 'https://competitions.codalab.org/competitions/22120', 'labels': [], 'relevant': 'yes'}, {'name': 'SemEval-2019 Task 6 Dataset', 'language': 'English', 'url': 'https://competitions.codalab.org/competitions/21200', 'labels': [], 'relevant': 'yes'}, {'name': 'Google Hate Speech Dataset', 'language': 'English', 'url': 'https://www.kaggle.com/competitions/google-hate-speech', 'labels': [], 'relevant': 'yes'}, {'name': 'Toxic Comment Classification Dataset', 'language': 'English', 'url': 'https://ww

KeyboardInterrupt: 