# Setup

In [1]:
!pip install -q gradio firecrawl-py google-generativeai

In [2]:

import gradio as gr
from firecrawl import FirecrawlApp
import google.generativeai as genai
import os
import getpass
import re
import json
from google.colab import userdata


In [3]:
# Load API keys from Colab's `userdata` secrets ('firecrawl' and 'GOOGLE_API_KEY').
FIRECRAWL_API_KEY = userdata.get('firecrawl')
# Reuse the value fetched above instead of looking the same secret up a second time.
os.environ['FIRECRAWL_API_KEY'] = FIRECRAWL_API_KEY
# Configure the Gemini client used later for summarization.
genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))

In [None]:
class CFG:
    """Notebook-wide configuration values."""

    # Model name handed to genai.GenerativeModel when generating the summary.
    model = "gemini-1.5-flash"

# Functions

In [4]:
def parse_price(price_str):
    """
    Parse a price into a float.

    Accepts numbers (e.g. 629000) as well as strings such as "€500,000",
    "$650k", "£400.000", "1.500.000", "629 000 zł" or "1,5 mln zł".

    Args:
        price_str: The raw price; an int/float or a string.

    Returns:
        The price as a float, or None when the value is not a number/string
        or nothing numeric is left after cleaning.

    Separator rules for strings:
        - When both '.' and ',' occur, the one appearing last is the decimal
          separator and the other is a thousands separator.
        - When only one kind occurs, it is a decimal separator if exactly one
          or two characters follow its last occurrence ("12,50" -> 12.5);
          otherwise every occurrence is a thousands separator
          ("500.000" -> 500000, "1.500.000" -> 1500000).
    """
    # bool is a subclass of int, but True/False are never meaningful prices.
    if isinstance(price_str, bool):
        return None
    # Already-numeric prices pass straight through.
    if isinstance(price_str, (int, float)):
        return float(price_str)
    if not isinstance(price_str, str):
        return None
    try:
        # Remove currency symbols/codes and common non-numeric phrases.
        # The dots in "p.m." are escaped so they only match literal dots.
        cleaned_price = re.sub(
            r"[€$£]|zł|pln|approx|pcm|per month|p\.m\.|vraagprijs|price on application|contact us",
            "",
            price_str,
            flags=re.IGNORECASE,
        )
        # Drop all whitespace (including non-breaking spaces used as thousands
        # separators) and lowercase so suffix detection is case-insensitive.
        cleaned_price = re.sub(r"\s+", "", cleaned_price).lower()

        # Trailing magnitude suffix: k / tys. = thousands, m / mln = millions.
        multiplier = 1
        suffix = re.search(r"(mln|tys\.?|k|m)$", cleaned_price)
        if suffix:
            multiplier = 1_000_000 if suffix.group(1) in ("mln", "m") else 1_000
            cleaned_price = cleaned_price[:suffix.start()]

        has_dot = '.' in cleaned_price
        has_comma = ',' in cleaned_price
        if has_dot and has_comma:
            if cleaned_price.rfind('.') < cleaned_price.rfind(','):
                # "1.234,56": '.' groups thousands, ',' is the decimal separator.
                cleaned_price = cleaned_price.replace('.', '').replace(',', '.')
            else:
                # "1,234.56": ',' groups thousands, '.' is already the decimal separator.
                cleaned_price = cleaned_price.replace(',', '')
        elif has_dot or has_comma:
            separator = '.' if has_dot else ','
            groups = cleaned_price.split(separator)
            if 1 <= len(groups[-1]) <= 2:
                # One or two trailing characters: the last separator is a decimal point.
                cleaned_price = "".join(groups[:-1]) + "." + groups[-1]
            else:
                # Three or more (or zero) trailing characters: thousands separators only.
                cleaned_price = "".join(groups)

        return float(cleaned_price) * multiplier
    except (ValueError, TypeError):
        # Empty string or non-numeric residue after cleaning.
        return None

In [5]:
def extract_data_with_prompt(location, max_price, currency="PLN"):
    """
    Use Firecrawl's `app.extract` with a natural-language prompt to collect listings.

    The prompt asks the extraction LLM to return properties in `location`
    priced below `max_price` (in `currency`), so filtering is delegated to
    the prompt; no additional price/location filtering is done here.

    Args:
        location: City name; interpolated into the prompt and the target URLs.
        max_price: Maximum price; interpolated into the prompt as-is.
        currency: Currency code shown next to the price in the prompt.

    Returns:
        A list of listing dicts taken from `response['data']['properties']`,
        deduplicated by their 'url' value. Listings with no URL (or 'N/A')
        are kept. Returns [] when the Firecrawl call raises or the response
        does not have the expected shape.
    """

    # The currency is interpolated (it was previously hard-coded as "PLN"), so the
    # `currency` argument actually reaches the extraction prompt. With the default
    # value the prompt text is unchanged.
    dynamic_prompt = f"""
    Extract 50 different properties in city {location} that cost less than {max_price} {currency}

        Requirements:
        - Location: {location}
        - Maximum Price: {max_price} {currency}
        - Include complete property details with exact location
        - Format as a list of properties with their respective details
        - provide the url for each listing

    """

    # NOTE(review): `location` is placed in the URL path unmodified; confirm the
    # sites accept names with diacritics/spaces (e.g. "Kraków") in this position.
    urls_to_scrape = [
        f"https://www.otodom.pl/pl/oferty/sprzedaz/mieszkanie/{location}/*",
        f"https://gratka.pl/nieruchomosci/mieszkania/{location}/sprzedaz/*",
    ]

    extracted_results = []
    response_data = None
    try:
        app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)
        print(f"  - Calling Firecrawl app.extract...")
        response_data = app.extract(
            urls=urls_to_scrape,
            params={
                'prompt': dynamic_prompt,
            },
        )

        print(response_data)

    except Exception as e:
        # Best effort: any failure of the extraction call yields an empty result.
        print(f"Tool 1: Error during Firecrawl app.extract call: {e}")
        return []

    # --- Process the response (expected: {'success': ..., 'data': {'properties': [...]}}) ---
    if isinstance(response_data, dict) and response_data.get('success'):
        print(f"  - Processing successful response dictionary...")
        data_payload = response_data.get('data')

        if isinstance(data_payload, dict):
            properties_list = data_payload.get('properties')

            if isinstance(properties_list, list):
                print(f"    - Found {len(properties_list)} listings in 'data.properties'.")
                for listing in properties_list:
                    if isinstance(listing, dict):
                        extracted_results.append(listing)
                    else:
                        print(f"  - Warning: Found non-dictionary item within 'properties' list: {listing}")
            else:
                print(f"  - Warning: Expected a list under 'data.properties', but found {type(properties_list)}.")
        else:
            print(f"  - Warning: Expected a dictionary under 'data', but found {type(data_payload)}.")
    elif isinstance(response_data, dict) and not response_data.get('success'):
        print(f"  - Warning: Firecrawl response indicates failure: {response_data}")
    elif response_data is not None:
        # Something was returned, but not a dict.
        print(f"  - Warning: Expected dict from app.extract based on new info, but received {type(response_data)}. Response: {response_data}")
    else:
        print("  - Info: Firecrawl app.extract returned None or the call failed.")

    print(f"Tool 1: Extraction processing finished. Total potential listings gathered: {len(extracted_results)}")

    # --- Basic deduplication on the listing's 'url' field ---
    seen_urls = set()
    deduplicated_results = []
    for listing in extracted_results:
        if isinstance(listing, dict):
            url = listing.get('url')
            if url and url != 'N/A' and url not in seen_urls:
                deduplicated_results.append(listing)
                seen_urls.add(url)
            elif not url or url == 'N/A':
                # Listings without a usable URL cannot be deduplicated; keep them.
                print(f"  - Info: Keeping listing without unique URL: {listing.get('location') or listing.get('details')}")
                deduplicated_results.append(listing)
            # Otherwise the URL was already seen: drop the duplicate silently.
        else:
            print(f"  - Warning: Found non-dictionary item during deduplication phase: {listing}")

    print(f"Tool 1: Deduplicated listings count: {len(deduplicated_results)}")
    return deduplicated_results

In [6]:
def _listing_title(listing):
    """Return the listing's 'address', falling back to 'location', else 'N/A'."""
    return listing.get('address') or listing.get('location') or 'N/A'


def summarize_results_with_llm(extracted_listings, location, max_price, currency="PLN"):
    """
    Use Gemini to generate a Markdown summary of the extracted listings.

    The listings are assumed to be already filtered by the extraction prompt;
    no filtering happens here. At most the first 15 listings are sent to the model.

    Args:
        extracted_listings: List of listing dicts (non-dict items are reported
            to the model as invalid entries).
        location: The searched location, echoed in the prompt.
        max_price: Numeric maximum price, echoed in the prompt.
        currency: Currency code shown next to prices.

    Returns:
        The model's summary text; an error string when the response is empty
        or blocked; or, if the Gemini call raises, a plain Markdown list of up
        to 10 raw listings. For an empty input list a fixed
        "no listings" message is returned without calling the model.
    """
    print(f"Tool 2: Summarizing {len(extracted_listings)} extracted listings with Gemini...")
    if not extracted_listings:
        return "No listings were extracted based on the prompt criteria. The websites might not contain matching properties, or the extraction failed."

    # Limit the number of listings sent to the LLM.
    listings_to_send = extracted_listings[:15]
    print(f"  - Preparing summary for {len(listings_to_send)} listings...")

    prompt_listings_text = []
    for i, listing in enumerate(listings_to_send):
        if isinstance(listing, dict):
            # Show the raw price, plus a normalized value when it can be parsed.
            price_str = listing.get('price', 'N/A')
            parsed_p = parse_price(price_str)
            display_price = f"{price_str}"
            if parsed_p is not None:
                display_price += f" (~{parsed_p:,.0f} {currency})"

            # Extracted listings may carry the place under 'location' rather than
            # 'address'; _listing_title covers both so the title is not always "N/A".
            listing_info = f"  - Property {i+1}:\n"
            listing_info += f"    Address/Title: {_listing_title(listing)}\n"
            listing_info += f"    Price: {display_price}\n"
            listing_info += f"    Details: {listing.get('details', 'N/A')}\n"
            listing_info += f"    URL: {listing.get('url', 'N/A')}"
            prompt_listings_text.append(listing_info)
        else:
            prompt_listings_text.append(f"  - Item {i+1}: Invalid data format received - {listing}")

    prompt = f"""You are a helpful real estate assistant summarizing search results.

    User's Search Criteria (used for extraction):
    Location: {location}
    Maximum Price: {max_price:,.0f} {currency}

    The following list contains property listings that were extracted from websites based on a prompt asking for properties matching the above criteria. **Assume these listings generally meet the criteria**, although the extraction process might not be perfect.

    Please provide a brief summary of these findings for the user in Markdown format.
    - Start with a short sentence confirming the search criteria.
    - Briefly mention the number of properties found (up to the limit provided below).
    - Highlight 10 potentially interesting options, mentioning their address/title, price, and url. Use the extracted details for highlighting. Format in a human readable way.
    - Keep the summary concise and easy to read.
    - Do not add properties not present in the list below.

    Extracted Listings (limit {len(listings_to_send)} shown):
    {chr(10).join(prompt_listings_text)}

    """

    try:
        model = genai.GenerativeModel(CFG.model)
        response = model.generate_content(prompt)
        if response.parts:
            return response.text
        elif response.prompt_feedback.block_reason:
            print(f"Tool 2: Gemini response blocked due to: {response.prompt_feedback.block_reason}")
            return f"Error: The AI summary response was blocked ({response.prompt_feedback.block_reason})."
        else:
            return "Error: Received an empty summary response from the AI assistant."

    except Exception as e:
        print(f"Tool 2: Error calling Gemini API: {e}")
        # Fallback: a plain list of raw listings when the LLM call fails.
        fallback_output = f"## Found {len(extracted_listings)} Listings (AI Summary Failed)\n\n"
        fallback_output += f"An error occurred generating the summary: {e}\n\nHere are the raw extracted listings (up to 10 shown):\n\n"
        for listing in extracted_listings[:10]:
            if isinstance(listing, dict):
                fallback_output += f"- **{_listing_title(listing)}**\n"
                fallback_output += f"  Price: {listing.get('price', 'N/A')}\n"
                fallback_output += f"  Details: {listing.get('details', 'N/A')}\n"
                fallback_output += f"  URL: {listing.get('url', 'N/A')}\n\n"
            else:
                fallback_output += f"- Invalid listing data: {listing}\n\n"
        return fallback_output

In [7]:
def find_properties(location, max_price_input):
    """
    Main entry point: validate the inputs, extract listings, and summarize them.

    Args:
        location: City name typed by the user.
        max_price_input: Maximum price; anything `float()` accepts. May be
            None when the input field is left empty.

    Returns:
        A string for display: a validation message when the inputs are
        unusable, a "could not extract" message when no listings come back,
        or the summary produced by `summarize_results_with_llm`.
    """
    print(f"\n--- New Request ---")
    # Prices are assumed to be in PLN; adjust if needed.
    currency = "PLN"

    # Validate up front so bad input yields a readable message instead of an
    # exception from float() or a wasted extraction call.
    location = str(location or "").strip()
    if not location:
        return "Please provide a location (for example: Warszawa)."
    try:
        max_price = float(max_price_input)
    except (TypeError, ValueError):
        return "Please provide a valid maximum price greater than zero."
    # Written as a chained comparison so NaN and infinity are rejected as well.
    if not (0 < max_price < float("inf")):
        return "Please provide a valid maximum price greater than zero."

    print(f"Received request: Location='{location}', Max Price='{max_price:,.0f} {currency}'")

    # --- Orchestration ---
    # 1. Extract data (the prompt handles both extraction and filtering).
    extracted_listings = extract_data_with_prompt(location, max_price, currency)

    if not extracted_listings:
        print("Orchestrator: No listings extracted by the prompt. Returning message.")
        return f"Could not extract any property listings matching your criteria (Location: {location}, Max Price: {max_price:,.0f} {currency}) from the target websites using the extraction prompt. \n\nPossible reasons:\n- No listings matched the criteria on the pages.\n- The websites' structure might be incompatible with automated extraction.\n- The extraction process may have been blocked or timed out."

    # 2. Summarize the (supposedly pre-filtered) listings with the LLM.
    summary = summarize_results_with_llm(extracted_listings, location, max_price, currency)

    print("Orchestrator: Request processed. Returning summary.")
    return summary

# Gradio app

In [8]:
# Gradio UI: a location textbox and a max-price number feed find_properties;
# the returned string is rendered as Markdown.
iface = gr.Interface(
    fn=find_properties,
    inputs=[
        # Placeholder uses the correct Polish abbreviation "np." (na przykład).
        gr.Textbox(label="Lokalizacja", placeholder="np. Warszawa, Kraków, Gdańsk", value="Warszawa"),
        gr.Number(label="Maksymalna cena (PLN)", value=800000),
    ],
    outputs=gr.Markdown(label="Property Summary"),
)

In [None]:
# Start the app. debug=True keeps this cell running so errors and logs are shown
# here (set debug=False to release the cell). share=True requests a public share
# URL; NOTE(review): anyone with that link can trigger calls made with the
# configured API keys — confirm public sharing is intended.
iface.launch(debug=True, share=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://dcb206489cf6e1c2c8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)



--- New Request ---
Received request: Location='Warszawa', Max Price='800,000 PLN'
  - Calling Firecrawl app.extract...
{'success': True, 'data': {'properties': [{'url': 'https://www.otodom.pl/pl/oferta/3-pokoje-anin-ID4tFiR', 'price': 629000, 'details': '3 Pokoje Anin, Powierzchnia 46.3m², Cena za metr kwadratowy 13 585 zł/m², Piętro parter', 'location': 'ul. Zorzy, Anin, Wawer, Warszawa, mazowieckie'}, {'url': 'https://www.otodom.pl/pl/oferta/mieszkanie-2p-rodzinne-osiedle-las-cisza-ID4w4q1', 'price': 620000, 'details': 'Mieszkanie 2P / Rodzinne Osiedle / Las Cisza, Powierzchnia 50.7m², Cena za metr kwadratowy 12 229 zł/m², Piętro 2 piętro', 'location': 'ul. Romana Pazińskiego, Anin, Wawer, Warszawa, mazowieckie'}, {'url': 'https://www.otodom.pl/pl/oferta/2-3-pokoje-w-sercu-zielonego-anina-ID4tX3Y', 'price': 590000, 'details': '2/3 Pokoje W Sercu Zielonego Anina, Powierzchnia 48.3m², Cena za metr kwadratowy 12 215 zł/m², Piętro 1 piętro', 'location': 'ul. Romana Pazińskiego, Anin, W