<a href="https://colab.research.google.com/github/sutoa/learn-crewai/blob/main/crewAI_agents-dependency_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Compare a client's ownership structure between its ADV document and crds using crewAI agents

In [1]:
!pip install crewai==0.28.8 crewai_tools==0.1.6 langchain_community==0.0.29 chromadb==0.4.24 "numpy<2.0" networkx==2.8.8



In [5]:
import os
# from utils import get_openai_api_key
from google.colab import userdata

# Copy the API credentials from the Colab secret store (google.colab.userdata)
# into process environment variables, and pin the model name.
# NOTE(review): presumably crewAI / SerperDevTool read these variables — confirm.
os.environ.update(OPENAI_API_KEY=userdata.get('OPENAI_API_KEY'))
os.environ.update(OPENAI_MODEL_NAME='gpt-4o-mini')
os.environ.update(SERPER_API_KEY=userdata.get('SERPER_API_KEY'))

In [6]:
from crewai_tools import tool
import requests
import os

@tool("DownloadFileTool")
def download_file(url: str, destination_folder: str = ".") -> str:
    """
    Downloads a file from the given URL and saves it locally.

    Args:
        url (str): The URL of the file to download.
        destination_folder (str, optional): Where to save the file. Defaults to current directory.

    Returns:
        str: Full path to the downloaded file or an error message.
    """
    # NOTE(review): the @tool decorator presumably surfaces the docstring above
    # to the agent as the tool description, so its wording is left unchanged.

    # Imported locally so this cell does not depend on edits to other cells.
    from urllib.parse import unquote, urlparse

    # (connect, read) timeouts in seconds; without a timeout requests can block
    # forever on an unresponsive server and stall the whole crew run.
    request_timeout = (10, 60)

    try:
        os.makedirs(destination_folder, exist_ok=True)

        # Derive the filename from the URL *path* only, so a query string or
        # fragment never ends up in the local name; basename() additionally
        # drops any directory components left after percent-decoding.
        original_filename = os.path.basename(unquote(urlparse(url).path))
        if not original_filename:
            # URL ended with "/" (or had no path): fall back to a generic name.
            original_filename = "downloaded_file"
        base_name, ext = os.path.splitext(original_filename)
        destination_path = os.path.join(destination_folder, original_filename)

        # Handle filename conflict by appending a number
        counter = 1
        while os.path.exists(destination_path):
            destination_path = os.path.join(destination_folder, f"{base_name}_{counter}{ext}")
            counter += 1

        # Download fully before opening the target file, so a failed request
        # never leaves an empty or partial file behind.
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
        with open(destination_path, 'wb') as f:
            f.write(response.content)

        return f"File downloaded successfully: {destination_path}"
    except Exception as e:
        # Errors are reported as text (not raised) so the calling agent can
        # read the failure and react to it.
        return f"Failed to download file from {url}. Error: {str(e)}"


In [7]:
import requests
from crewai_tools import BaseTool
from typing import Literal

class FileDownloaderTool(BaseTool):
    """Tool that streams a remote file to a caller-chosen local path."""

    # Tool identity fields declared with type annotations, as BaseTool fields.
    name: Literal["FileDownloaderTool"] = "FileDownloaderTool"
    description: str = "Downloads a file from a URL and saves it to a local path."

    def _run(self, file_url: str, save_path: str) -> str:
        """Download ``file_url`` and write it to ``save_path``.

        Args:
            file_url: URL of the file to fetch.
            save_path: Local file path to write; opened in binary write mode,
                so an existing file at that path is overwritten.

        Returns:
            A success message containing ``save_path``, or an error message
            describing the failure. Exceptions are not propagated.
        """
        try:
            # The context manager guarantees the streamed connection is
            # released even if writing fails part-way; the (connect, read)
            # timeout prevents hanging forever on an unresponsive server.
            with requests.get(file_url, stream=True, timeout=(10, 60)) as response:
                response.raise_for_status()  # Check for errors
                with open(save_path, "wb") as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
            return f"File downloaded successfully: {save_path}"
        except Exception as e:
            return f"Error downloading file: {str(e)}"

In [10]:
import networkx as nx
import matplotlib.pyplot as plt
from crewai import Task, Agent, Crew
from crewai_tools import SerperDevTool, PDFSearchTool

# Define agents
# Agent that locates and downloads the client's Form ADV.
# The SEC host is spelled "adviserinfo" (with an "e"); the prompt text below
# must use that spelling, otherwise the agent is pointed at a non-matching
# domain and example URL.
researcher = Agent(
    role="KYC Researcher",
    goal="Download the ADV form for {client} from government website adviserinfo.sec.gov",
    backstory="""As a KYC researcher supporting the onboarding process for client at a major European bank,
        Your responsibilities are to collect the most up-to-date information about the client, {client},
        from internal and external data sources to be analyzed by KYC analyst on the case.
        For client ownership information, you need to download the ADV form for {client} to a local folder.
        The ADV form is usually available at the sec.gov website. For instance, the download link for Goldman Sachs is https://reports.adviserinfo.sec.gov/reports/ADV/361/PDF/361.pdf.
    """,
    verbose=True,
    allow_delegation=False,
    # download_file saves the PDF locally; SerperDevTool performs the web search.
    tools=[download_file, SerperDevTool()]
    )


# Settings handed to PDFSearchTool as keyword arguments.
# NOTE(review): the intended meanings below come from the original inline
# comments; confirm that this PDFSearchTool version actually honours them.
_rag_tool_settings = {
    "top_k": 5,                  # number of search results to return
    "chunk_size": 1000,          # token or character length per chunk
    "chunk_overlap": 200,        # overlap between chunks
    "embedder": "openai",        # or "huggingface", etc.
    "vector_store": "chromadb",  # currently default and local
}
rag_tool = PDFSearchTool(**_rag_tool_settings)

# Agent that extracts Schedule A (direct owners / executive officers) from the
# downloaded ADV document via the PDF search tool.
# The prompt spells the section title "Executive Officers" exactly as the
# analyze task does, so the agent's retrieval queries match the document.
analyst = Agent(
    role="KYC Analyst",
    goal="Extract the ownership structure information from document or data collected by the researcher for client {client}",
    backstory="""As a KYC analyst for the bank, one of your jobs is to collect the list of DIRECT Owners and Executive Officers for client {client} from its ADV document.
     The information can be found from the Schedule A section in the form. The result should meet the following criteria:
      - if no list is found, then just return an empty list. Do NOT return any fake names.
      - include the name, title and ownership code and description for each person.
      - Note that for ownership, The description for each code can be found also in 'Schedule A'. For instance, for ownership code 'B', the description is '10% but less than 25%'. For 'C', the description is  '25% but less than 50%'. Do NOT guess or give your own ownership description.
     """,
    verbose=True,
    allow_delegation=False,
    tools=[rag_tool]
)
# reviewer = Agent(
#     role="KYC Reviewer",
#     goal="To ensure that differences in ownership structure data from various sources for the same client are reported accurately and thoroughly, listed in bullets with values that are different",
#     backstory="reviewer's job is to ensure the quality of the report by analyst, making sure they are accurate, complete and easily to read.")

# Define tasks
# Step 1: the researcher fetches the ADV PDF and reports where it was saved.
research_task = Task(
    agent=researcher,
    description="Gather ADV form for {client}",
    expected_output="The downloaded copy of the ADV file for client {client} on the local drive and its complete file path",
)

# Step 2: the analyst pulls the owner / officer list out of that document.
analyze_task = Task(
    agent=analyst,
    description="Retrieve the list of Direct Owners and Executive Officers for {client} from its ADV document",
    expected_output="A json file with the list of direct owners and executive officers of {client}",
)
# review_task = Task(description="Review delta report by analyst. if unsatisfied,  ask analyst to improve with detailed feedback. ", agent=reviewer, expected_output="feedback on the delta report")

# Create a task graph
# task_graph = nx.DiGraph()
# task_graph.add_edge(research_task, analyze_task)
# task_graph.add_edge(analyze_task, review_task)

# Visualize the task graph
# nx.draw(task_graph, with_labels=True, node_size=2000, node_color="lightblue", font_size=7)
# plt.show()

# Create crew
# Assemble the two agents and their tasks; tasks are listed research first,
# analysis second.
crew = Crew(
    tasks=[research_task, analyze_task],
    agents=[researcher, analyst],
    verbose=2,
)

# Execute crew
# The "client" value fills the {client} placeholders used in the agent and
# task prompts. NOTE(review): no visible prompt references {inquiry} — confirm
# whether that key is still needed.
inputs = dict(
    client="Boston Partners Global Investors, Inc.",
    inquiry="Please download the ADV form for the client",
)
result = crew.kickoff(inputs)
print(result)



[1m[95m [DEBUG]: == Working Agent: KYC Researcher[00m
[1m[95m [INFO]: == Starting Task: Gather ADV form for Boston Partners Global Investors, Inc.[00m


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mI need to gather the ADV form for Boston Partners Global Investors, Inc. from the SEC website. To do this, I will first search for the specific URL where the ADV form is located.

Action: Search the internet  
Action Input: {"search_query": "Boston Partners Global Investors, Inc. ADV form site:sec.gov"}  
[0m[95m 


Search results: Title: BOSTON PARTNERS - Investment Adviser Firm
Link: https://adviserinfo.sec.gov/firm/summary/124982
Snippet: An ERA is required to file a report using Form ADV, but does not complete all items contained in Form ADV that a registered adviser must complete. Other state ...
---
Title: FORM ADV
Link: https://reports.adviserinfo.sec.gov/reports/ADV/124982/PDF/124982.pdf
Snippet: BOSTON PARTNERS GLOBAL INVESTORS, INC. B. (1) Name under which

Inserting batches in chromadb: 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]


[95m 

Relevant Content:
application or report. Schedule B asks for information about your indirect owners; you must first complete Schedule A, which asks for information about your direct owners. Use Schedule C to amend this information. 2.Indirect Owners. With respect to each owner listed on Schedule A (except individual owners), list below: (a)in the case of an owner that is a corporation, each of its shareholders that beneficially owns, has the right to vote, or has the power to sell or direct the sale of, 25% or more of a class of a voting security of that corporation; For purposes of this Schedule, a person beneficially owns any securities: (i) owned by his/her child, stepchild, grandchild, parent, stepparent, grandparent, spouse, sibling, mother-in-law, father-in-law, son-in-law, daughter-in-law, brother-in-law, or sister-in-law, sharing the same residence; or (ii) that he/she has the right to acquire, within 60 days, through the exercise of any option, warrant, or right to pur

### Below: try layout-aware chunking for better search results

In [None]:
!pip install unstructured sentence-transformers pdfminer


Collecting pdfminer
  Downloading pdfminer-20191125.tar.gz (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycryptodome (from pdfminer)
  Downloading pycryptodome-3.22.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading pycryptodome-3.22.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pdfminer
  Building wheel for pdfminer (setup.py) ... [?25l[?25hdone
  Created wheel for pdfminer: filename=pdfminer-20191125-py3-none-any.whl size=6140087 sha256=9c5f4549fc0cc25d2a0d48e9463c8b3e57d998bae47a8ce20584bd5db00b4fc5
  Stored in directory: /root/.cache/pip/wheels/56/24/93/05316c6df89ff210a9a705060277e3acbfd2d1bd3a5853ee19
Success

In [None]:
!pip install "pdfminer.six==20221105" --force-reinstall

Collecting pdfminer.six==20221105
  Downloading pdfminer.six-20221105-py3-none-any.whl.metadata (4.0 kB)
Collecting charset-normalizer>=2.0.0 (from pdfminer.six==20221105)
  Downloading charset_normalizer-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (35 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six==20221105)
  Downloading cryptography-44.0.2-cp39-abi3-manylinux_2_34_x86_64.whl.metadata (5.7 kB)
Collecting cffi>=1.12 (from cryptography>=36.0.0->pdfminer.six==20221105)
  Downloading cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting pycparser (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20221105)
  Downloading pycparser-2.22-py3-none-any.whl.metadata (943 bytes)
Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading charset_normalizer-3.4.1-cp311-cp311-ma

In [None]:
from crewai_tools import tool
from unstructured.partition.pdf import partition_pdf
from sentence_transformers import SentenceTransformer
from chromadb import Client
from chromadb.config import Settings
import os

@tool("SmartPDFSearchTool")
def smart_pdf_search(params: dict) -> str:
    """
    Perform layout-aware RAG search across multiple PDFs using a query.

    Args (params dict):
        pdf_paths (list): List of local PDF file paths.
        query (str): Natural language question.
        top_k (int): Number of top results to return (default: 5).

    Returns:
        str: Top relevant content from the PDFs.
    """
    # chromadb is already a dependency of this notebook; the client class is
    # imported locally so the cell-level imports stay untouched.
    from chromadb import PersistentClient

    try:
        pdf_paths = params.get("pdf_paths", [])
        query = params.get("query", "")
        # Coerce so a value supplied as a string (e.g. "5") still works.
        top_k = int(params.get("top_k", 5))

        if not pdf_paths or not query:
            return "Missing 'pdf_paths' or 'query'."

        # Load embedder
        embedder = SentenceTransformer("all-MiniLM-L6-v2")

        # On-disk Chroma store. The notebook pins chromadb 0.4.x, which
        # replaced the legacy Settings(chroma_db_impl="duckdb+parquet", ...)
        # configuration with PersistentClient.
        # Because the store persists, results may also include chunks indexed
        # by earlier calls.
        chroma_client = PersistentClient(path=".chroma_smartpdf")
        collection = chroma_client.get_or_create_collection(name="smart_pdf_chunks")

        # Process all PDFs
        indexed_any = False
        for pdf_path in pdf_paths:
            if not os.path.exists(pdf_path):
                continue

            # Use unstructured to parse layout-aware chunks; very short
            # fragments (20 characters or fewer once stripped) are dropped.
            elements = partition_pdf(filename=pdf_path)
            texts = [el.text for el in elements if el.text and len(el.text.strip()) > 20]
            if not texts:
                # Nothing usable in this PDF; adding an empty batch would
                # fail and abort the whole search.
                continue

            vectors = embedder.encode(texts, convert_to_tensor=True).tolist()
            doc_ids = [f"{os.path.basename(pdf_path)}_chunk_{i}" for i in range(len(texts))]

            # upsert (not add) so re-indexing the same PDF on a later call
            # overwrites its chunks instead of colliding on existing ids.
            collection.upsert(documents=texts, embeddings=vectors, ids=doc_ids)
            indexed_any = True

        if not indexed_any:
            return "No valid content extracted from PDFs."

        # Embed the query with the same model used for the documents, rather
        # than relying on the collection's own embedding function.
        query_vector = embedder.encode([query], convert_to_tensor=True).tolist()
        results = collection.query(query_embeddings=query_vector, n_results=top_k)
        top_chunks = results["documents"][0]

        return "\n\n---\n\n".join(top_chunks)

    except Exception as e:
        # Failures are returned as text so the calling agent can read them.
        return f"❌ Error: {str(e)}"


ImportError: cannot import name 'open_filename' from 'pdfminer.utils' (/usr/local/lib/python3.11/dist-packages/pdfminer/utils.py)

In [None]:
from crewai_tools import tool
from unstructured.partition.pdf import partition_pdf
from sentence_transformers import SentenceTransformer
from chromadb import Client
from chromadb.config import Settings
import os
from pdfminer.pdfparser import PDFParser # This line is updated
from pdfminer.pdfdocument import PDFDocument # This line is updated

@tool("SmartPDFSearchTool")
def smart_pdf_search(params: dict) -> str:
    """
    Perform layout-aware RAG search across multiple PDFs using a query.

    Args (params dict):
        pdf_paths (list): List of local PDF file paths.
        query (str): Natural language question.
        top_k (int): Number of top results to return (default: 5).

    Returns:
        str: Top relevant content from the PDFs.
    """
    # chromadb is already a dependency of this notebook; the client class is
    # imported locally so the cell-level imports stay untouched.
    from chromadb import PersistentClient

    try:
        pdf_paths = params.get("pdf_paths", [])
        query = params.get("query", "")
        # Coerce so a value supplied as a string (e.g. "5") still works.
        top_k = int(params.get("top_k", 5))

        if not pdf_paths or not query:
            return "Missing 'pdf_paths' or 'query'."

        # Load embedder
        embedder = SentenceTransformer("all-MiniLM-L6-v2")

        # On-disk Chroma store. The notebook pins chromadb 0.4.x, which
        # replaced the legacy Settings(chroma_db_impl="duckdb+parquet", ...)
        # configuration with PersistentClient.
        # Because the store persists, results may also include chunks indexed
        # by earlier calls.
        chroma_client = PersistentClient(path=".chroma_smartpdf")
        collection = chroma_client.get_or_create_collection(name="smart_pdf_chunks")

        # Process all PDFs
        indexed_any = False
        for pdf_path in pdf_paths:
            if not os.path.exists(pdf_path):
                continue

            # Use unstructured to parse layout-aware chunks; very short
            # fragments (20 characters or fewer once stripped) are dropped.
            elements = partition_pdf(filename=pdf_path)
            texts = [el.text for el in elements if el.text and len(el.text.strip()) > 20]
            if not texts:
                # Nothing usable in this PDF; adding an empty batch would
                # fail and abort the whole search.
                continue

            vectors = embedder.encode(texts, convert_to_tensor=True).tolist()
            doc_ids = [f"{os.path.basename(pdf_path)}_chunk_{i}" for i in range(len(texts))]

            # upsert (not add) so re-indexing the same PDF on a later call
            # overwrites its chunks instead of colliding on existing ids.
            collection.upsert(documents=texts, embeddings=vectors, ids=doc_ids)
            indexed_any = True

        if not indexed_any:
            return "No valid content extracted from PDFs."

        # Embed the query with the same model used for the documents, rather
        # than relying on the collection's own embedding function.
        query_vector = embedder.encode([query], convert_to_tensor=True).tolist()
        results = collection.query(query_embeddings=query_vector, n_results=top_k)
        top_chunks = results["documents"][0]

        return "\n\n---\n\n".join(top_chunks)

    except Exception as e:
        # Failures are returned as text so the calling agent can read them.
        return f"❌ Error: {str(e)}"

ImportError: cannot import name 'open_filename' from 'pdfminer.utils' (/usr/local/lib/python3.11/dist-packages/pdfminer/utils.py)