In [1]:
import pandas as pd
import re
import json
import os
import asyncio

import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# Read the Gemini API key from the environment instead of hard-coding it in the
# notebook, so the secret never ends up in version control or shared exports.
# An empty value is still allowed for runtimes that inject the key themselves
# (e.g. Gemini in Canvas).
# SECURITY: the key that used to be written literally on this line must be
# treated as leaked -- revoke it and issue a new one.
apiKey = os.environ.get("GOOGLE_API_KEY", "")
if not apiKey:
    print("⚠️ GOOGLE_API_KEY is not set; Gemini requests will fail unless the runtime injects a key.")

# Configure Gemini API
genai.configure(api_key=apiKey)
    

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# A number optionally followed by an area unit. The number is captured on its
# own (group 1) so that the digit in "m2" can never leak into the parsed value.
_LAND_SIZE_RE = re.compile(
    r'(\d[\d,.]*)(?:\s*sqm|\s*m2|\s*sq\.m|\s*square meters)?\b',
    re.IGNORECASE,
)
# A standalone numeric token (thousands separators and one decimal part allowed).
_NUMBER_TOKEN_RE = re.compile(r'\d[\d,]*(?:\.\d+)?')


def _parse_land_size(ai_output):
    """Turn the model's raw reply into a float, or None when it holds no usable number."""
    if ai_output.lower() == 'n/a' or not re.search(r'\d', ai_output):
        return None

    match = _LAND_SIZE_RE.search(ai_output)
    if match:
        try:
            # Commas are thousands separators ("1,200.5" -> 1200.5).
            return float(match.group(1).replace(',', ''))
        except ValueError:
            # e.g. "1.2.3" -- fall through to the stricter token scan below.
            pass

    # Fallback for replies such as "200m", where the number is glued to an
    # unrecognised suffix. Only accept the reply when it contains exactly one
    # number; gluing several numbers together ("10m x 20m" -> 1020) would
    # silently invent a land size.
    tokens = _NUMBER_TOKEN_RE.findall(ai_output)
    if len(tokens) == 1:
        try:
            return float(tokens[0].replace(',', ''))
        except ValueError:
            return None
    return None


async def get_land_size_from_ai(info_text):
    """Ask Gemini for the land size (in square metres) described in a listing text.

    Parameters
    ----------
    info_text : str or missing value
        Free-text property description. Anything that is not a non-blank
        string (None, NaN, other types, whitespace only) is skipped without
        calling the API.

    Returns
    -------
    float or None
        The land size reported by the model, or None when the input is
        unusable, the model answers 'N/A', the reply holds no unambiguous
        number, or the request fails. Request failures are printed and
        reported as None (best effort); they are not raised.
    """
    # isinstance comes first so that non-string inputs (NaN, None, lists, ...)
    # are rejected without relying on pd.isna, which is ambiguous for array-likes.
    if not isinstance(info_text, str) or info_text.strip() == "":
        return None

    prompt = (
        f"From the following property information, extract *only* the land size in square meters (sqm) as a numerical value. "
        f"If the land size is mentioned with dimensions (e.g., '10m x 20m'), calculate the area. "
        f"If no land size is explicitly mentioned or it's not applicable (e.g., for an apartment unit within a building), "
        f"respond with 'N/A'. Do not include units in the numerical response.\n\n"
        f"Information: {info_text}\n\n"
        f"Land Size (sqm):"
    )

    try:
        model = genai.GenerativeModel('gemini-2.0-flash')

        # Listing text is user-generated; the safety filters are set to
        # BLOCK_NONE for every category passed here.
        safety_settings = {
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        }

        response = await model.generate_content_async(
            prompt,
            safety_settings=safety_settings,
            generation_config={
                "temperature": 0.0,  # deterministic extraction
                "max_output_tokens": 25  # the answer is a single number or 'N/A'
            }
        )

        return _parse_land_size(response.text.strip())

    except Exception as e:
        print(f"Error for info (first 50 chars): '{info_text[:50]}...': {e}")
        return None


In [None]:
# Location of the raw listings export, relative to this notebook.
csv_file_path = '../../../data/raw/realestates_kh_v2_1.csv'

# Prefer UTF-8; if the file is not valid UTF-8, re-read it as latin1.
try:
    df = pd.read_csv(csv_file_path, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(csv_file_path, encoding='latin1')
    print(f"Loaded '{csv_file_path}' with latin1 encoding.")
else:
    print(f"Loaded '{csv_file_path}' with utf-8 encoding.")
    
# --- Main script execution ---
async def main():
    """Fill missing or zero `land_area` values in the global `df` using the AI extractor.

    Rows whose `land_area` is NaN (after numeric coercion) or 0 are sent, one
    at a time, to `get_land_size_from_ai` together with their `information`
    text. Successful extractions are written back into `df` in place.

    The (possibly partially) updated frame is always written to
    '../../../data/processed/realestates_kh_filled.csv', even when the run is
    interrupted or cancelled, so completed API calls are not lost.
    """
    df['land_area'] = pd.to_numeric(df['land_area'], errors='coerce')
    rows_to_process_indices = df[(df['land_area'].isna()) | (df['land_area'] == 0)].index
    total_rows = len(rows_to_process_indices)

    print(f"\n🔍 Found {total_rows} rows with missing or 0 land_area.")

    output_file = '../../../data/processed/realestates_kh_filled.csv'
    updated_count = 0
    try:
        for i, index in enumerate(rows_to_process_indices):
            information = df.loc[index, 'information']

            print(f"\n🔄 Processing row {i + 1}/{total_rows} (index: {index})...")

            try:
                extracted_land_size = await get_land_size_from_ai(information)

                if extracted_land_size is not None:
                    df.loc[index, 'land_area'] = extracted_land_size
                    updated_count += 1
                    print(f"✅ Updated land_area: {extracted_land_size}")
                else:
                    print("⚠️ No land size found or AI returned N/A.")
            except Exception as e:
                print(f"❌ Error on row {index}: {e}")

            # Pause between requests to stay under the API rate limit; there
            # is nothing to wait for after the final row.
            if i + 1 < total_rows:
                await asyncio.sleep(4.5)

        print(f"\n✅ Finished. Total updated rows: {updated_count}")
    finally:
        # Runs on normal completion, on errors and on cancellation/interrupt,
        # so whatever was filled in so far is persisted.
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        df.to_csv(output_file, index=False)
        print(f"📁 Updated CSV saved to: {output_file}")


Loaded '../../../data/raw/realestates_kh_v2_1.csv' with latin1 encoding.


In [4]:
# Run the fill-in job. Top-level `await` is used directly in the cell, which
# presumably relies on the notebook kernel's already-running event loop; in a
# plain script this would need asyncio.run(main()) instead.
await main()



🔍 Found 2994 rows with missing or 0 land_area.

🔄 Processing row 1/2994 (index: 3228)...
Error for info (first 50 chars): 'I have a business house, I want to sell a house si...': 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 1000
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 47
}
]
⚠️ No land size found or AI returned N/A.

🔄 Processing row 2/2994 (index: 3229)...
Error for info (first 50 chars): 'I have a flat, I want to sell meat, house 5

CancelledError: 

In [5]:
# Count missing values per column of df (land_area included) to inspect how
# many gaps remain.
df.isnull().sum()

Column1                   2
id                        1
headline                  1
price_display            16
rent_display           5687
bedrooms               2293
bathrooms              2340
house_area             6265
land_area              2990
information              52
garages                5008
location                  2
address_subdivision       2
address_locality          2
address_line_2           13
address_line_1          202
category_name             2
is_parent                 3
type                      2
source_url                2
language                  2
latitude                  2
longitude                 3
dtype: int64