Create a 100% GenAI-first, agentic workflow that parses an electric bill, extracts the relevant details, and makes a recommendation regarding
installing a rooftop solar system

General workflow will:

1. Parse the utility bill and extract key fields (cost per kWh, average kWh used per month, last 12 months kWh usage, avg cost per day, service address, electric provider) using LlamaParse or another parser
2. Load/index solar radiance and resource data sheets based on the service address location using Pinecone/pgvector; if a fact sheet is not available, load it from a specific URL, translate the HTML into markdown, and store it on the file system (FS)
3. Feed the electric usage data and the geographic solar radiance fact sheet into an LLM with a solar analysis prompt to compute recommendations
4. Look up and make suggestions on solar panels, solar inverters, and battery storage (unit pricing is needed to make estimates)
5. Look up the solar installers index to suggest a local installer and produce the final structured output

**LLMs to be used**

1. OpenAI o1/o3-mini, DeepSeek R1

In [24]:
# special jupyter-notebook setup
import nest_asyncio
nest_asyncio.apply()

from typing import List, Optional
from pydantic import BaseModel, Field
import os
import json
from llama_index.core.workflow import (
    Event,
    StartEvent,
    StopEvent,
    Context,
    Workflow,
    step,
)
# Parse Electric Bill PDF and Setup
from llama_index.core import VectorStoreIndex, Document
from llama_index.core.llms import LLM
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.prompts import ChatPromptTemplate
from llama_parse import LlamaParse
from llama_index.llms.openai import OpenAI
# %pip install numpy==1.26.4
# %pip install llama-index-llms-deepseek
from llama_index.llms.deepseek import DeepSeek
from llama_index.core.llms import ChatMessage


from dotenv import load_dotenv
load_dotenv()

# TODO make sure to scrub secret keys before committing to source control
# API keys are read from the environment (populated by load_dotenv() above).
# NOTE(review): lc_api_key and cai_api_key are not referenced again in the
# visible code — presumably the clients pick up the same env vars themselves; confirm.
lc_api_key = os.getenv('LLAMA_CLOUD_API_KEY')
cai_api_key = os.getenv('OPENAI_API_KEY')

# Module-level LLM clients: one OpenAI model and one DeepSeek reasoning model.
gpt4o = OpenAI('gpt-4o')
deepseekr1 = DeepSeek(model="deepseek-reasoner", api_key=os.getenv('DEEPSEEK_API_KEY'))

#### Define Schemas
We define the schemas for client electric usage and solar radiance facts.
ParsedElectricBill: captures the relevant details from the electric bill needed to perform the solar analysis.

In [57]:

class ElectricCharges(BaseModel):
    # Charge amounts pulled from an electric bill; nested inside
    # ParsedElectricBill.charges. Both fields are required.
    # (Documented with comments rather than a docstring so the model's
    # generated schema is left exactly as it was.)
    customer_charge: float = Field(..., description='the first customer charge amount')
    total_current_charge: float = Field(..., description='the total electric charge amount')

class ParsedElectricBill(BaseModel):
    '''Extracted electric bill information'''
    # Target schema for the structured extraction of a markdown electric bill.
    # The Field descriptions double as extraction instructions for the LLM, so
    # they must be complete sentences of intent.

    # --- account and service address ---
    account_number: str = Field(..., description='the account number of the billing account - any spaces should be replaced with `-`')
    street_address: str = Field(..., description='the service street address where the electricity is being used; the apartment number or letter should be prepended with `#`')
    # city/state are optional because a bill may not spell them out.
    city: Optional[str] = Field(None, description='the city of the service address if available')
    state: Optional[str] = Field(None, description='the state of the service address if available; use full name, no abbreviations')
    zip_code: str = Field(..., description='the service zip or postal code')

    # --- provider ---
    electric_provider: str = Field(..., description='the name of the electric utility provider')
    provider_website: str = Field(..., description='the website of the electric utility provider')

    # --- usage and cost metrics ---
    cost_per_kwh: float = Field(..., description='the cost per kWh hour of electricity consumed')
    # Fixed: the description used to end mid-sentence ("for the last several").
    avg_month_consumption_kwh: float = Field(..., description='average monthly consumption in kWh for the last several months')
    avg_kwh_per_day: float = Field(..., description='average kWh used per day')
    avg_cost_per_day: float = Field(..., description='average cost of electric per day')
    service_date_range: str = Field(..., description='the service date range for the utility charge in YYYY/MM/DD format')

    # --- itemized charges ---
    charges: ElectricCharges

# Hand-written example bill used to exercise the prompts without parsing a PDF.
# The charges are given as a plain dict and handed to the `charges` field below.
electric_charges_ex1 = {
    'customer_charge': 14.0,
    'total_current_charge': 88.0,
}

parsed_electric_bill_ex1 = ParsedElectricBill(
    account_number = '12345-67899',
    street_address = '1311 NORWALK LN',
    city = 'Austin',
    state = None,
    zip_code = '78703',
    electric_provider = 'Austin Energy',
    provider_website = 'www.coautilities.com',
    cost_per_kwh = 0.05115,
    avg_month_consumption_kwh = 625,
    # NOTE(review): the two per-day averages are 0 here — presumably placeholders; confirm.
    avg_kwh_per_day = 0,
    avg_cost_per_day = 0,
    service_date_range = '2024/08/21 - 2024/09/23',
    charges = electric_charges_ex1,
)

### ^ TODO: other fields: get last 12-13 months kwh detailed month by month usage (Jan, Feb, Mar)

class ParsedStateUrl(BaseModel):
    # LLM prediction of the state/city for a service address.
    # `state` is interpolated into the fact-sheet URL and cache file name by
    # fetch_solar_radiance, which is why it is requested lowercase and hyphenated.
    state: str = Field(..., description='one of the 50 US states in lowercase, separate spaces with a hyphen')
    city: str = Field(..., description='the predicted city, separate spaces with a hyphen')
    confidence: float = Field(..., description='level of confidence that the state prediction was correct from 0 to 1')
    reasoning: str = Field(..., description='explain short reasoning on why this state was chosen')

class SolarRadiance(BaseModel):
    # Solar radiance figures for one city (one row of a state's fact sheet).
    # All five fields are required.
    #
    # Fixed: every field used to end in a trailing comma, which turned each
    # default into a one-element tuple `(FieldInfo,)`. That made the fields
    # silently non-required and dropped their descriptions from the schema.
    city: str = Field(..., description='the city column')
    latitude: float = Field(..., description='the latitude column')
    fixed_tilt: float = Field(..., description='fixed tilt sun hours per day')
    one_axis_tilt: float = Field(..., description='1-axis tilt sun hours per day')
    two_axis_tilt: float = Field(..., description='2-axis tilt sun hours per day')

# Example solar radiance record for Austin, paired with parsed_electric_bill_ex1
# when calling analyze_solar_project by hand.
ex1_solar_radiance = SolarRadiance(
    city='Austin',
    latitude=30.30,
    fixed_tilt=5.3,
    one_axis_tilt=6.7,
    two_axis_tilt=7.0,
)

class SolarProviders(BaseModel):
    # Contact details for a single solar provider; used as the item type of
    # GatherRecommendations.solar_installation_providers.
    # TODO: prepare a markdown document of solar providers in the US
    business_name: str = Field(..., description='name of solar provider')
    website_url: str = Field(..., description='website url of solar provider')
    email_address: str = Field(..., description='email address of solar provider')

class GatherRecommendations(BaseModel):
    # Extra product/service recommendations gathered after the solar analysis.
    datetime_created: str = Field(..., description='the current datetime which this model was created')
    # Declared Optional because the default is None; this matches how the other
    # nullable fields in this file (e.g. ParsedElectricBill.city) are annotated.
    solar_installation_providers: Optional[List[SolarProviders]] = Field(None, description='Recommendations list from solar providers')

class SolarRadianceItemsPerState(BaseModel):
    # Container holding a list of SolarRadiance records (one per city).
    # NOTE(review): not referenced elsewhere in the visible code.
    items: List[SolarRadiance]

class SolarAnalysis(BaseModel):
    '''Analysis from LLM regarding solar install investment'''
    # Free-form analysis text.
    analysis_output: str
    # NOTE(review): the scale and meaning of `rating` are not defined anywhere
    # in the visible code — confirm before relying on it.
    rating: float

class InstallationPartnerRecommendation(BaseModel):
    '''Installation partner recommendations based on service address'''
    # Placeholder: no fields have been defined yet.
    pass


# dict(parsed_electric_bill_ex1)

In [6]:
# prepare solar geo factsheets and example workflow

### Engineer Prompts
Build a prompt that helps customers analyze the solar investment decision step by step and provides valuation feedback


In [102]:
## create a prompt for City of Austin, Austin Energy first, then generalize for other providers
## as we get more example template data
# User prompt for structured extraction; expects the `electric_bill_markdown`
# template variable (the full markdown text of the bill).
ELECTRIC_BILL_EXTRACT_PROMPT_V1 = '''
Extract the relevant electric usage data from the provided markdown file into a structured format.
You may need to look through the entire document to extract the relevant information

markdown electric bill contents:
{electric_bill_markdown}

Extract all the information according to the provided schema.
Ensure dates are in YYYY/MM/DD format and all numbers are properly formatted.
'''

# Prompt for deriving the effective kWh price; expects `electric_line_items`.
# NOTE(review): not referenced elsewhere in the visible code.
CALC_TRUE_KWH_COST = '''
We need to calculate the true cost of each kilo watt per hour, give this parsed electric bill charge information

context:
{electric_line_items}
'''

# System message sent ahead of SOLAR_ANALYSIS_PROMPT; takes no template variables.
SOLAR_ANALYSIS_SYSTEM_PROMPT = '''
You are an AI assistant tasked with analyzing and determining residential and commercial solar system installation investments.
You play the role of an assistant working at a solar installation business consulting customers with solar investment decisions.
'''

# TODO: fill with dynamic variables from electric bill and solar radiance
# User prompt for the investment analysis. Template variables (str.format):
#   cost_per_kwh, kwh_used_month, fixed_axis, one_axis, two_axis, tax_credit_percent
# Fixes relative to the previous revision (placeholders are unchanged):
#   - monthly usage is an energy figure, so it is no longer prefixed with `$`
#   - the radiance header no longer hard-codes Austin TX (any state can be analyzed)
#   - the "Solar Installation Company Costs" header has its closing `**`
#   - the ancillary-equipment system size is given in kW, not kWh
SOLAR_ANALYSIS_PROMPT = '''
**Instructions:**
1. Given the customers electric usage information, the solar radiance of the address location, the solar equipment information provided below,
    calculate and analyze the cost and investment case for installing a solar system for the customer.
2. Give an recommendation whether or not to install solar panels based on this analysis.

Be precise, honest, and logical at every step.

**Data for Analysis**

Cost per kWh: ${cost_per_kwh}
Average kWh used per month: {kwh_used_month} kWh

**Solar Radiance for the Service Address Location**
Calculate for all 3 options (fixed, 1-axis, 2-axis)

Fixed tilted sunlight hours: {fixed_axis} hours per day (average)
1-axis tilted sunlight hours: {one_axis} hours per day (average)
2-axis tilted sunlight hours: {two_axis} hours per day (average)

**Solar Installation Company Costs**

Assume solar install company will cost between $2.50 and $3.00 per watt installed.

**Equipment Information and Costs**

Solar Panel Choices:
- Trina Solar 450W panels at $0.23 per watt (from includes freight, duties, and taxes)
- JA Solar 405W Solar Panel 108 Cells JA-JAM54-S31-405MR Clearance at $0.35 per watt (US based retailer)

Solar Inverter Choices:
- Growatt 5kW Stackable Off-Grid Inverter | SPF 5000 ES at $0.17 per watt or $851.00 (quality, economical Chinese brand)
- Victron Quattro 48/5000 | 48V Input | 5000VA Output 120V | 70A Charger | Transfer Switch at $0.34 per watt or $1,695 (premium European brand)

Batteries:
- assume LFP battery storage to be $67 per kWh if ordered from manufacturers and $130 per kWh if purchase at US retail prices

Other equipment:
- assume all other equipment like panel fixture mounts and wires to be under $1000 for a 5kW system

**Additional Information**
US Government subsidies will pay for {tax_credit_percent}% of this cost via tax credit. Take this into account if the address is in the United States
'''


# Placeholder: the installer-recommendation prompt has not been written yet.
PROVIDER_RECOMMENDATION_PROMPT = '''
TODO...
'''


# User prompt for llm_predict_state_from_address; expects `address_zip`
# (street address plus zip/postal code as one string).
GET_STATE_PROMPT = '''\
Given the street address and zip/postal code: {address_zip}
Return JSON object matching the ParsedStateUrl schema
'''


# Disclaimer text intended to accompany LLM-generated analysis.
# NOTE(review): not referenced elsewhere in the visible code.
LLM_DISCLOSER_PROMPT = '''
The following is generated by a experimental technology using LLMs and can potentially hallucinate and create unintended inaccuracies.
The analysis is intended to be use as a additional reference and users should also do their own research before large investment decisions.
'''


In [103]:
# TODO calculate the TRUE cost per kwh from a variety of electric bills

# create structured json model from markdown electric usage (the current prompt is not able to extract the true cost of the kwh
# based on available information from parsed electric bill)
async def build_structured_electric_bill(
    llm_model, md_file_name: str = '202407.md') -> ParsedElectricBill:
    """Turn a markdown electric bill under ./data into a ParsedElectricBill.

    The result is cached next to the markdown file as
    ``<stem>-structured.json``; when that file already exists it is loaded and
    validated instead of calling the LLM again.

    Args:
        llm_model: LLM exposing ``astructured_predict``.
        md_file_name: File name (relative to ./data) of the markdown bill.

    Returns:
        The structured bill, either from the cache or freshly extracted.
    """
    # Derive the cache name from the file stem. The previous
    # `replace('.md', ...)` rewrote every '.md' substring and, for a name
    # without '.md', produced the markdown path itself, so the JSON dump would
    # have overwritten the source document.
    stem, _ext = os.path.splitext(md_file_name)
    json_file_path = f'{stem}-structured.json'
    structured_json_path = f'./data/{json_file_path}'
    if os.path.exists(structured_json_path):
        print(f'{json_file_path} already exists... returning cached json')
        # Read with the same encoding the cache is written with below.
        with open(structured_json_path, encoding='utf-8') as file:
            cached_payload = json.load(file)
        return ParsedElectricBill.model_validate(cached_payload)

    with open(f'./data/{md_file_name}', "r", encoding="utf-8") as file:
        content = file.read()

    # Create extraction prompt
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an assistant that extracts structured data from markdown formatted electric bill."),
        ("user", ELECTRIC_BILL_EXTRACT_PROMPT_V1)
    ])

    # Extract structured data
    structured_data = await llm_model.astructured_predict(
        ParsedElectricBill,
        prompt,
        electric_bill_markdown=content,
    )
    print('Structured Model: ', structured_data)
    json_output = structured_data.model_dump_json()
    print(json.dumps(json.loads(json_output), indent=2))

    # Persist the extraction so later runs hit the cache branch above.
    with open(structured_json_path, 'w', encoding='utf-8') as f:
        f.write(json_output)

    return structured_data

# electric_model = await build_structured_electric_bill(lmodel, '202407.md')
# print('HERE >> ', electric_model)


# step 1: parse the electric bill (pdf) to markdown text format
async def parse_electric_pdf(llm_model: LLM, parser: LlamaParse, file_name: str):
    """Convert an electric bill PDF under ./data to markdown, then to a structured model.

    The markdown is written next to the PDF as ``<stem>.md``. If that file
    already exists the PDF is not parsed again.

    Args:
        llm_model: LLM passed through to build_structured_electric_bill.
        parser: LlamaParse instance used to read the PDF.
        file_name: PDF file name relative to ./data.

    Returns:
        Whatever build_structured_electric_bill returns for the markdown file.

    Raises:
        ValueError: If ``file_name`` is empty.
    """
    if not file_name:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError('file_name str needs to be passed')

    pdf_path = f'./data/{file_name}'
    # Swap only the extension. The previous `replace('pdf', 'md')` also rewrote
    # any "pdf" appearing inside the name itself.
    stem, _ext = os.path.splitext(file_name)
    md_file_name = f'{stem}.md'
    md_file_path = f'./data/{md_file_name}'
    if os.path.exists(md_file_path):
        print(f'{md_file_path} already exists; just load the structure json file instead')
    else:
        print(f'using LlamaParse to extract pdf contents and creating markdown file: {md_file_path}')
        docs = await parser.aload_data(pdf_path)
        electric_data = "\n".join(d.get_content(metadata_mode="all") for d in docs)
        print('electric markdown >> ', electric_data)

        with open(md_file_path, 'w', encoding='utf-8') as f:
            f.write(electric_data)

    return await build_structured_electric_bill(llm_model, md_file_name)

# parser = LlamaParse(result_type='markdown')
# llm_model = OpenAI('gpt-4o')
# results = await parse_electric_pdf(llm_model, parser, file_name='electric-bill-202409-redacted.pdf')
# print('GOT RESULTS >> ', results)


In [39]:

def print_json_indented(structured_data):
    """Print a model's repr, then its JSON dump re-indented by two spaces."""
    print('Structured Model: ', structured_data)
    # Round-trip through json so the compact dump can be pretty-printed.
    parsed = json.loads(structured_data.model_dump_json())
    pretty = json.dumps(parsed, indent=2)
    print(pretty)


#### Get solar peak hours as per location

1. Build the state/city solar radiance fact sheet, if it doesn't exist, from `https://www.turbinegenerator.org/solar/texas/texas-all-cities/` and store it in the file `{state}-solar-factsheet.md` using Bs4 or the Firecrawl API
2. If `{state}-solar-factsheet.md` does exist, then use an LLM or pandas to extract the relevant fields
3. Over multiple runs, the solar fact sheets for all states should be created


In [37]:
# from firecrawl import FirecrawlApp
import requests
from bs4 import BeautifulSoup

def create_solar_factsheets():
    """Placeholder for building the solar fact sheets; not implemented yet."""
    return None

async def fetch_solar_radiance(eb: ParsedElectricBill, parsed_state: ParsedStateUrl) -> dict:
    """Return the solar radiance record for the bill's city.

    Records for a whole state are cached in
    ``./data/cities-solar/{state}-all-cities.json``. When the cache is missing,
    the state's "all cities" table is scraped from turbinegenerator.org and the
    cache is written.

    Args:
        eb: Parsed bill; ``eb.city`` is tried first when it is set.
        parsed_state: LLM state/city prediction; ``state`` selects the fact
            sheet and ``city`` is the fallback lookup key.

    Returns:
        A dict with keys ``city_name``, ``latitude``, ``fixed_tilt``,
        ``one_axis`` and ``two_axis`` (values are the scraped strings), or
        ``None`` when neither city is found in the state's table.

    Raises:
        requests.HTTPError: If the fact-sheet page cannot be fetched.
        ValueError: If the page holds no usable city table.
    """
    state = parsed_state.state
    fetch_url = f'https://www.turbinegenerator.org/solar/{state}/{state}-all-cities'
    print(f'url to fetch: {fetch_url}')

    def normalize(name):
        # Compare names case-insensitively and treat hyphens as spaces:
        # ParsedStateUrl.city is hyphenated ("san-antonio") while the table
        # uses display names ("San Antonio").
        return ' '.join(name.replace('-', ' ').split()).casefold()

    def get_city_solar(cities_solar):
        by_name = {normalize(name): info for name, info in cities_solar.items()}
        # eb.city is Optional, so skip empty candidates instead of crashing.
        for candidate in (eb.city, parsed_state.city):
            if not candidate:
                continue
            city_solar_info = by_name.get(normalize(candidate))
            if city_solar_info:
                return city_solar_info
        return None

    solar_file = f'./data/cities-solar/{state}-all-cities.json'
    if os.path.exists(solar_file):
        # get existing file
        with open(solar_file, "r", encoding="utf-8") as f:
            cities_solar = json.load(f)
        return get_city_solar(cities_solar)

    # Bound the wait and fail loudly on 4xx/5xx rather than parsing an error page.
    resp = requests.get(fetch_url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise ValueError(f'no solar radiance table found at {fetch_url}')

    cities_dict = {}
    for row in table.find_all('tr')[1:]:  # first row is the header
        all_tds = row.find_all('td')
        if len(all_tds) < 5:
            # not a data row (e.g. spacer or nested header)
            continue
        heading = all_tds[0].find('h5')
        name_cell = heading if heading is not None else all_tds[0]
        city_name = name_cell.text.strip()
        cities_dict[city_name] = {
            'city_name': city_name,
            'latitude': all_tds[1].text.strip(),
            'fixed_tilt': all_tds[2].text.strip(),
            'one_axis': all_tds[3].text.strip(),
            'two_axis': all_tds[4].text.strip(),
        }
    print('cities_dict length >> ', len(cities_dict.keys()))
    if not cities_dict:
        # Do not cache an empty sheet: it would mask the problem on every later run.
        raise ValueError(f'solar radiance table at {fetch_url} contained no city rows')

    os.makedirs(os.path.dirname(solar_file), exist_ok=True)
    with open(solar_file, "w", encoding="utf-8") as json_file:
        json.dump(cities_dict, json_file, indent=4)
    print(f'finished writing {state}-all-cities.json')

    return get_city_solar(cities_dict)


async def llm_predict_state_from_address(
    lmodel: LLM,
    address_zip: str) -> ParsedStateUrl:
    """Ask the LLM which US state (and city) a street address + zip belongs to.

    Raises an Exception when ``address_zip`` is empty; otherwise prints and
    returns the structured ParsedStateUrl prediction.
    """
    if address_zip:
        system_msg = ("system", "You are an assistant that returns the US state given the street address and zip.")
        user_msg = ("user", GET_STATE_PROMPT)
        state_prompt = ChatPromptTemplate.from_messages([system_msg, user_msg])

        prediction = await lmodel.astructured_predict(
            ParsedStateUrl,
            state_prompt,
            address_zip=address_zip,
        )
        print(prediction)
        return prediction

    raise Exception('address_zip is needed to predict state')


# llm_model = OpenAI('gpt-4o')
# parsed_state = await llm_predict_state_from_address(
#     llm_model,
#     f'{parsed_electric_bill_ex1.street_address}, {parsed_electric_bill_ex1.zip_code}')
# await fetch_solar_radiance(eb=parsed_electric_bill_ex1, parsed_state=parsed_state)
# await llm_predict_state_from_address('1311 NORWALK LANE #F')


#### Create Workflow
TODO

In [38]:
# Define events
class ParseElectricBillEvent(Event):
    """
    Workflow event that carries the parsed electric bill metrics and solar radiance data
    Triggered after successful electric bill parsing step

    Returned by the workflow's parse_electric_bill step and consumed by its
    analyze_solar_project step.
    """
    electric_bill_data: ParsedElectricBill = Field(
        description="parsed electric bill containing all relevant electric bill fields"
    )
    solar_radiance: SolarRadiance = Field(
        description="parsed solar radiance data for particular city"
    )

class AnalyzedSolarProjectEvent(Event):
    """
    Workflow event that carries the solar analysis results produced by the LLM.
    Consumed by the workflow's output_final step.
    """
    # No default is given, so a value must be supplied when the event is built.
    solar_analysis_results: SolarAnalysis = Field(
        description="final solar analysis results from LLM"
    )

class GatherRecommendationEvent(Event):
    """
    Workflow event that carries the additional recommendations after solar analysis
    """
    # NOTE(review): no active workflow step emits or consumes this event in the
    # visible code (the gather_recommendations step is commented out).
    gathered_recommendations: GatherRecommendations = Field(
        description="additional product and service recommendation"
    )

class LogEvent(Event):
    """
    Workflow event for logging messages and progress updates.
    Used throughout the workflow to provide status information.
    """
    # NOTE(review): the visible workflow steps use print() and never emit
    # LogEvent — confirm whether it is still needed.
    msg: str = Field(
        description="Log message content"
    )
    # Defaults to False, i.e. a complete message rather than a partial update.
    delta: bool = Field(
        False,
        description="Flag indicating if this is a partial update to previous message"
    )


In [100]:
def analyze_solar_project(llmodel: LLM, ebm: ParsedElectricBill, sr: SolarRadiance,
                          tax_credit_percent: str = '28'):
    """Run the solar investment analysis prompt against an LLM.

    Args:
        llmodel: Chat-capable LLM (``.chat(messages)``).
        ebm: Parsed electric bill; supplies cost per kWh and monthly usage.
        sr: Solar radiance for the service city; supplies the three tilt figures.
        tax_credit_percent: Percentage inserted into the tax-credit sentence of
            the prompt. Defaults to the previously hard-coded '28'.

    Returns:
        The chat response from ``llmodel.chat``. Previously nothing was
        returned, so the analysis could only be read from stdout.
    """
    print('analyzing solar project requirements with LLM')
    prep_prompt = SOLAR_ANALYSIS_PROMPT.format(
        cost_per_kwh=ebm.cost_per_kwh,
        kwh_used_month=ebm.avg_month_consumption_kwh,
        fixed_axis=sr.fixed_tilt,
        one_axis=sr.one_axis_tilt,
        two_axis=sr.two_axis_tilt,
        tax_credit_percent=tax_credit_percent)
    messages = [
        ChatMessage(role="system", content=SOLAR_ANALYSIS_SYSTEM_PROMPT),
        ChatMessage(role="user", content=prep_prompt),
    ]
    resp = llmodel.chat(messages)
    print('response >> ', resp.message)
    return resp

# Manual smoke test: run the analysis prompt on the hand-written example bill
# and Austin radiance record (performs a live LLM call when the cell executes).
llmodel = OpenAI('gpt-4o')
# TODO use R1
analyze_solar_project(llmodel, parsed_electric_bill_ex1, ex1_solar_radiance)


analyzing solar project requirements with LLM
response >>  assistant: To determine whether installing a solar system is a good investment for the customer, we need to calculate the total cost of the system, the potential energy production, and the savings over time. Let's break down the analysis step by step.

### Step 1: Determine Energy Needs

The customer uses an average of 625 kWh per month. To offset this usage entirely with solar, we need to calculate the size of the solar system required.

### Step 2: Calculate Required System Size

1. **Daily Energy Usage**:  
   \[
   \text{Daily Energy Usage} = \frac{625 \text{ kWh/month}}{30 \text{ days/month}} \approx 20.83 \text{ kWh/day}
   \]

2. **System Size Calculation**:  
   To find the required system size in kW, divide the daily energy usage by the average sunlight hours for each configuration:

   - **Fixed Tilt**:  
     \[
     \text{System Size (Fixed)} = \frac{20.83 \text{ kWh/day}}{5.3 \text{ hours/day}} \approx 3.93 \text{ 

In [59]:

class SolarAnalysisAgenticWorkflow(Workflow):
    '''End-to-end solar investment analysis from electric bill

    Steps: parse_electric_bill -> analyze_solar_project -> output_final.
    The workflow result is the SolarAnalysis produced by the LLM.
    '''

    # Bill parsed when the StartEvent does not carry `electric_bill_path`.
    DEFAULT_ELECTRIC_BILL_FILE = 'electric-bill-202409-redacted.pdf'

    def __init__(
        self,
        parser: LlamaParse,
        llm_model: LLM,
        **kwargs,
    ):
        """Store the PDF parser and the LLM shared by every step.

        Args:
            parser: LlamaParse instance used to read the bill PDF.
            llm_model: LLM used for extraction, state prediction and analysis.
            **kwargs: Forwarded to ``Workflow.__init__`` (e.g. timeout, verbose).
        """
        super().__init__(**kwargs)
        self.parser = parser
        # Previously never stored, so every `self.llm_model` access failed.
        self.llm_model = llm_model

    @step
    async def parse_electric_bill(
        self,
        ctx: Context,
        ev: StartEvent) -> ParseElectricBillEvent:
        """Parse the bill PDF and look up solar radiance for its city.

        Raises:
            ValueError: If no radiance record is found for the bill's city.
        """
        print('start event... 1) parsing electric bill')
        # Allow `wf.run(electric_bill_path=...)`; fall back to the sample bill.
        electric_bill_path = (
            getattr(ev, 'electric_bill_path', None) or self.DEFAULT_ELECTRIC_BILL_FILE
        )

        # step 1 parse and build electric bill
        electric_bill_model = await parse_electric_pdf(
            self.llm_model, self.parser, file_name=electric_bill_path)
        print('agent: got electric bill model >> ', electric_bill_model)

        # step 2 fetch the solar radiance data for city
        parsed_state = await llm_predict_state_from_address(
            self.llm_model,
            f'{electric_bill_model.street_address}, {electric_bill_model.zip_code}')
        # Use the bill that was just parsed (not the hard-coded example) and
        # keep the lookup result, which was previously discarded.
        city_solar = await fetch_solar_radiance(
            eb=electric_bill_model, parsed_state=parsed_state)
        if not city_solar:
            raise ValueError(
                f'no solar radiance data found for {electric_bill_model.city!r} '
                f'/ {parsed_state.city!r} in state {parsed_state.state!r}')
        sr = SolarRadiance(
            city=city_solar['city_name'],
            latitude=city_solar['latitude'],
            fixed_tilt=city_solar['fixed_tilt'],
            one_axis_tilt=city_solar['one_axis'],
            two_axis_tilt=city_solar['two_axis'])

        return ParseElectricBillEvent(
            electric_bill_data=electric_bill_model,
            solar_radiance=sr)

    @step
    async def analyze_solar_project(
        self,
        ctx: Context,
        ev: ParseElectricBillEvent,
    ) -> AnalyzedSolarProjectEvent:
        '''
        Prepare llm prompt and data to do full solar installation and investment analysis
        '''
        bill = ev.electric_bill_data
        radiance = ev.solar_radiance
        print('analyzing solar project with electric bill and solar radiance data')
        print('electric bill data >> ', bill)
        print('solar radiance >> ', radiance)

        # Same structured-prediction pattern as the bill extraction: the prompt
        # variables are passed as keyword arguments and the LLM fills SolarAnalysis.
        prompt = ChatPromptTemplate.from_messages([
            ("system", SOLAR_ANALYSIS_SYSTEM_PROMPT),
            ("user", SOLAR_ANALYSIS_PROMPT),
        ])
        analysis = await self.llm_model.astructured_predict(
            SolarAnalysis,
            prompt,
            cost_per_kwh=bill.cost_per_kwh,
            kwh_used_month=bill.avg_month_consumption_kwh,
            fixed_axis=radiance.fixed_tilt,
            one_axis=radiance.one_axis_tilt,
            two_axis=radiance.two_axis_tilt,
            tax_credit_percent='28',
        )

        # The event's field is required; constructing it empty fails validation.
        return AnalyzedSolarProjectEvent(solar_analysis_results=analysis)

    # TODO: re-enable once GatherRecommendationEvent is wired in
    # @step
    # async def gather_recommendations(
    #     self,
    #     ctx: Context,
    #     ev: AnalyzedSolarProjectEvent
    # ):
    #     print('gather additional service and product recommendations...')

    @step
    async def output_final(
        self,
        ctx: Context,
        ev: AnalyzedSolarProjectEvent,
    ) -> StopEvent:
        """Finish the workflow, returning the SolarAnalysis as its result."""
        print('creating final output...')
        return StopEvent(result=ev.solar_analysis_results)




In [72]:

# activate the workflow
## Initialize the LLM, parser, prep needed documents, and create the Workflow
parser = LlamaParse(result_type='markdown')
llm_model = OpenAI('gpt-4o')

# NOTE(review): `wf` is bound to the return value of `.run()`, not to the
# workflow instance, and that value is never awaited here — so the final result
# (and any exception raised inside a step) is not surfaced in this cell.
# Presumably this should be `await ...run()` inside the notebook; confirm.
wf = SolarAnalysisAgenticWorkflow(
    parser=parser,
    llm_model=llm_model,
    verbose=True,
    timeout=300,
).run()

Running step parse_electric_bill
start event... 1) parsing electric bill


In [55]:
# visualize the workflow
# from llama_index.utils.workflow import draw_all_possible_flows
# draw_all_possible_flows(SolarAnalysisAgenticWorkflow, filename="solar_analysis.wf.html")