In [2]:
!pip install crewai pydantic crewai_tools

/usr/bin/sh: 1: pip: not found


In [3]:
from pydantic import BaseModel, Field
from langchain_core.output_parsers import PydanticOutputParser
from crewai import Task, Agent
from crewai_tools import PDFSearchTool, FileReadTool, BrowserbaseLoadTool
from crewai import Crew, Process

In [4]:
from typing import Optional


class CompanyQuarterlyReport(BaseModel):
    company_name: str = Field(..., description="Name of the company")
    fiscal_year: int = Field(..., description="Fiscal year of the report")
    quarter: str = Field(..., regex=r"Q[1-4]", description="Fiscal quarter (e.g., Q1, Q2, Q3, Q4)")
    quarter_revenue: float = Field(..., description="Revenue for the quarter in USD")
    yoy_quarter_revenue_growth: Optional[float] = Field(None, description="Year-over-year revenue growth for the quarter in percentage")
    key_feature_updates: Optional[str] = Field(None, description="Key feature updates released during the quarter")
    summary: Optional[str] = Field(None, description="Summarize the quarterly report to 10 bullet points")

In [5]:
pr_schema_parser = PydanticOutputParser(pydantic_object=CompanyQuarterlyReport)
print(pr_schema_parser.get_format_instructions())

In [12]:
import os
with open(os.path.join('/mnt/data/', 'press_release_schema_format_instructions.txt'), 'w') as f:
    f.write(pr_schema_parser.get_format_instructions())

In [7]:
extract_prompt = """
=====================
   TASK OVERVIEW
=====================

You have access to the following files:

- **Source Website Link**: `{website}`
- **Extraction/formatting instruction**: `{formatting_instruction}`

Your task is to extract data from the electricity bill PDF located at `{website}` and structure it into the JSON format following the instructions in `{formatting_instruction}`.

=====================
   INSTRUCTIONS
=====================

1. **Extract Key Data**:
   - Extract key data points like company name, fiscal year, quarter, quarter revenue, YOY quarter revenue growth, key feature updates, summary, as outlined in `{formatting_instruction}`.

--------------------------------------------------

2. **Accuracy Check**:
   - Cross-verify the extracted data with the website and ensure all relevant fields are extracted.
   - Mark missing or unreadable data as 'N/A' or 'Not Processable'.

--------------------------------------------------

3. **Output Data**:
   - Structure and pass the data for validation.

===========================
   END OF INSTRUCTIONS
===========================
"""

In [8]:
pdf_tool = PDFSearchTool()
txt_read_tool = FileReadTool()
browser_tool = BrowserbaseLoadTool()
tools = [pdf_tool, txt_read_tool, ]

In [9]:
extracting_agent = Agent(
            role='Senior Data Analyst',
            goal='Extract data specified in the {formatting_instruction} from the source {website}.',
            backstory=(
                "You are a detailed-oriented data analyst. You have strong analytical skills that allow you to identify and "
                "abstract analytical concepts. You are familiar with different data formats such as YAML, JSON and work well "
                "with software engineers.You are proud of your attention to details and will triple check the results for "
                "accuracy together with your coworkers."
            ),
            tools=tools,
        )

In [10]:
extraction_task = Task(
            description=extract_prompt,
            expected_output='Extracted data points in JSON format for press release.',
            tools=tools,
            agent=extracting_agent,
            output_json=CompanyQuarterlyReport,
            # human_input=True
        )


In [11]:
crew = Crew(
    agents=[
        extracting_agent,
    ],
    tasks=[
        extraction_task,
    ],
    process= Process.sequential, 
    memory=True,
    cache=True,
    max_rpm=100,
    output_log_file = 'extractor.log'
)

In [13]:
crew_output = crew.kickoff(
    inputs={
        'website': 'https://seekingalpha.com/pr/19805912-apple-reports-third-quarter-results',
        'formatting_instruction': '/mnt/data/press_release_schema_format_instructions.txt',
})

Inserting batches in chromadb: 100% 1/1 [00:00<00:00,  1.40it/s]
Inserting batches in chromadb: 100% 1/1 [00:00<00:00,  1.55it/s]
Inserting batches in chromadb: 100% 1/1 [00:00<00:00,  1.76it/s]
Inserting batches in chromadb: 100% 1/1 [00:00<00:00,  1.70it/s]
Inserting batches in chromadb: 100% 1/1 [00:00<00:00,  1.14it/s]
Inserting batches in chromadb: 100% 1/1 [00:00<00:00,  1.87it/s]


[93m Error parsing JSON: Expecting value: line 1 column 1 (char 0). Attempting to handle partial JSON.[00m
[93m Pydantic validation error: 1 validation error for BillSchema
  Invalid JSON: key must be a string at line 2 column 29 [type=json_invalid, input_value='{\n  "account_number": "... the actual due date\n}', input_type=str]
    For further information visit https://errors.pydantic.dev/2.8/v/json_invalid. The JSON structure doesn't match the expected model. Attempting alternative conversion method.[00m


In [14]:
crew_output.raw

'**\n\nSince I cannot directly extract the data from the PDF due to technical limitations, I will outline the expected JSON structure based on the provided schema. This is a placeholder representation of how the data should be structured once extracted from the PDF:\n\n```json\n{\n  "account_number": "N/A",  // Placeholder for the actual account number\n  "address_full": "N/A",    // Placeholder for the actual service address\n  "billing_period_start": "N/A",  // Placeholder for the actual billing period start date\n  "billing_period_end": "N/A",    // Placeholder for the actual billing period end date\n  "due_date": "N/A"         // Placeholder for the actual due date\n}\n```\n\n### Explanation:\n- **account_number**: This field should contain the account number associated with the bill.\n- **address_full**: This field should contain the full service address for which the bill is generated.\n- **billing_period_start**: This field should contain the start date of the billing period in 