In [None]:

import pandas as pd
import time
import os
from llama_cpp import Llama

In [None]:
llm = Llama(
    model_path="D:/Ollama/models/mistral/mistral-7b-instruct-v0.2.Q2_K.gguf",
    n_ctx=2048,
    n_threads=4,
    verbose=True,
)

In [None]:
# csv_file_path = '../../../data/raw/realestates_kh_v2_1.csv'
csv_file_path = '../../../data/processed/realestates_kh_filled_test.csv'

output_file = '../../../data/processed/realestates_kh_filled_test.csv'

In [None]:
# === Cell 2: Load Data and Prepare ===
try:
    df = pd.read_csv(csv_file_path, encoding='utf-8')
    print(f"Loaded '{csv_file_path}' with utf-8 encoding.")
except UnicodeDecodeError:
    df = pd.read_csv(csv_file_path, encoding='latin1')
    print(f"Loaded '{csv_file_path}' with latin1 encoding.")

# Make sure land_area column is numeric
df['land_area'] = pd.to_numeric(df['land_area'], errors='coerce')

# Filter rows where land_area is missing or zero
rows_to_process = df[(df['land_area'].isna()) | (df['land_area'] == 0)]
print(f"Found {len(rows_to_process)} rows with missing or zero land_area.")


In [None]:
import re

def extract_land_area_from_text(text):
    prompt = f"""Extract only the land size in square meters (sqm) from the text below. 
If it mentions dimensions (e.g., "20m x 30m", "8.4 x 18 meters"), multiply to get sqm. 
If it says "98,514 sqm", return that number.
If no land size is present, respond only with: N/A

Text:
{text}
Answer:"""

    try:
        response = llm(prompt=prompt, max_tokens=32, stop=["\n"])
        result = response["choices"][0]["text"].strip()

        if result.lower() == 'n/a':
            return None

        # Use regex to find all valid numbers (ignore malformed)
        numbers = re.findall(r"\d+(?:\.\d+)?", result)
        if not numbers:
            return None

        # Pick the largest plausible value as land area (heuristic)
        land_area = max(float(n) for n in numbers)
        return land_area

    except Exception as e:
        print(f"❌ AI extraction failed: {e}")
        return None


In [6]:

# --- Process Rows ---
updated_count = 0

for i, (idx, row) in enumerate(rows_to_process.iterrows(), 1):
    print(f"\n🔄 Processing [{i}/{len(rows_to_process)}] index {idx}...")

    info_text = row['information']
    land_area = extract_land_area_from_text(info_text)

    if land_area is not None:
        df.at[idx, 'land_area'] = land_area
        updated_count += 1
        print(f"✅ Extracted land_area: {land_area}")
    else:
        print("⚠️ No land area found.")

    # Rewrite full output file with updated df
    df.to_csv(output_file, index=False)
    print(f"💾 File '{output_file}' saved with updated index {idx}.")

    time.sleep(0.05)

print(f"\n✅ Finished! Total updated rows this run: {updated_count}")

✅ Extracted land_area: 92.25
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5692.

🔄 Processing [1581/2114] index 5693...


Llama.generate: prefix-match hit


✅ Extracted land_area: 317.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5693.

🔄 Processing [1582/2114] index 5694...


Llama.generate: prefix-match hit


✅ Extracted land_area: 105.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5694.

🔄 Processing [1583/2114] index 5695...


Llama.generate: prefix-match hit


✅ Extracted land_area: 80.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5695.

🔄 Processing [1584/2114] index 5696...


Llama.generate: prefix-match hit


⚠️ No land area found.
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5696.

🔄 Processing [1585/2114] index 5697...


Llama.generate: prefix-match hit


✅ Extracted land_area: 790.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5697.

🔄 Processing [1586/2114] index 5698...


Llama.generate: prefix-match hit


✅ Extracted land_area: 575.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5698.

🔄 Processing [1587/2114] index 5699...


Llama.generate: prefix-match hit


⚠️ No land area found.
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5699.

🔄 Processing [1588/2114] index 5700...


Llama.generate: prefix-match hit


✅ Extracted land_area: 105.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5700.

🔄 Processing [1589/2114] index 5701...


Llama.generate: prefix-match hit


✅ Extracted land_area: 150.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5701.

🔄 Processing [1590/2114] index 5702...


Llama.generate: prefix-match hit


✅ Extracted land_area: 83.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5702.

🔄 Processing [1591/2114] index 5703...


Llama.generate: prefix-match hit


✅ Extracted land_area: 100.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5703.

🔄 Processing [1592/2114] index 5704...


Llama.generate: prefix-match hit


✅ Extracted land_area: 237.04
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5704.

🔄 Processing [1593/2114] index 5705...


Llama.generate: prefix-match hit


✅ Extracted land_area: 560.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5705.

🔄 Processing [1594/2114] index 5706...


Llama.generate: prefix-match hit


✅ Extracted land_area: 261.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5706.

🔄 Processing [1595/2114] index 5707...


Llama.generate: prefix-match hit


✅ Extracted land_area: 206.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5707.

🔄 Processing [1596/2114] index 5708...


Llama.generate: prefix-match hit


✅ Extracted land_area: 282.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5708.

🔄 Processing [1597/2114] index 5709...


Llama.generate: prefix-match hit


✅ Extracted land_area: 142.8
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5709.

🔄 Processing [1598/2114] index 5710...


Llama.generate: prefix-match hit


✅ Extracted land_area: 86.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5710.

🔄 Processing [1599/2114] index 5711...


Llama.generate: prefix-match hit


✅ Extracted land_area: 201.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5711.

🔄 Processing [1600/2114] index 5712...


Llama.generate: prefix-match hit


✅ Extracted land_area: 176.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5712.

🔄 Processing [1601/2114] index 5713...


Llama.generate: prefix-match hit


✅ Extracted land_area: 92.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5713.

🔄 Processing [1602/2114] index 5714...


Llama.generate: prefix-match hit


✅ Extracted land_area: 739.21
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5714.

🔄 Processing [1603/2114] index 5715...


Llama.generate: prefix-match hit


✅ Extracted land_area: 834.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5715.

🔄 Processing [1604/2114] index 5716...


Llama.generate: prefix-match hit


⚠️ No land area found.
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5716.

🔄 Processing [1605/2114] index 5717...


Llama.generate: prefix-match hit


⚠️ No land area found.
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5717.

🔄 Processing [1606/2114] index 5718...


Llama.generate: prefix-match hit


✅ Extracted land_area: 373.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5718.

🔄 Processing [1607/2114] index 5719...


Llama.generate: prefix-match hit


✅ Extracted land_area: 613.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5719.

🔄 Processing [1608/2114] index 5720...


Llama.generate: prefix-match hit


✅ Extracted land_area: 716.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5720.

🔄 Processing [1609/2114] index 5721...


Llama.generate: prefix-match hit


✅ Extracted land_area: 522.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5721.

🔄 Processing [1610/2114] index 5722...


Llama.generate: prefix-match hit


✅ Extracted land_area: 120.75
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5722.

🔄 Processing [1611/2114] index 5723...


Llama.generate: prefix-match hit


⚠️ No land area found.
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5723.

🔄 Processing [1612/2114] index 5724...


Llama.generate: prefix-match hit


✅ Extracted land_area: 619.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5724.

🔄 Processing [1613/2114] index 5725...


Llama.generate: prefix-match hit


✅ Extracted land_area: 2.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5725.

🔄 Processing [1614/2114] index 5726...


Llama.generate: prefix-match hit


✅ Extracted land_area: 912.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5726.

🔄 Processing [1615/2114] index 5727...


Llama.generate: prefix-match hit


✅ Extracted land_area: 37500.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5727.

🔄 Processing [1616/2114] index 5728...


Llama.generate: prefix-match hit


✅ Extracted land_area: 273.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5728.

🔄 Processing [1617/2114] index 5729...


Llama.generate: prefix-match hit


✅ Extracted land_area: 578.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5729.

🔄 Processing [1618/2114] index 5730...


Llama.generate: prefix-match hit


✅ Extracted land_area: 502.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5730.

🔄 Processing [1619/2114] index 5731...


Llama.generate: prefix-match hit


✅ Extracted land_area: 1980.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5731.

🔄 Processing [1620/2114] index 5732...


Llama.generate: prefix-match hit


✅ Extracted land_area: 120.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5732.

🔄 Processing [1621/2114] index 5733...


Llama.generate: prefix-match hit


✅ Extracted land_area: 150.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5733.

🔄 Processing [1622/2114] index 5734...


Llama.generate: prefix-match hit


✅ Extracted land_area: 3000.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5734.

🔄 Processing [1623/2114] index 5735...


Llama.generate: prefix-match hit


✅ Extracted land_area: 363.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5735.

🔄 Processing [1624/2114] index 5736...


Llama.generate: prefix-match hit


✅ Extracted land_area: 1286.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5736.

🔄 Processing [1625/2114] index 5737...


Llama.generate: prefix-match hit


✅ Extracted land_area: 3490.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5737.

🔄 Processing [1626/2114] index 5738...


Llama.generate: prefix-match hit


⚠️ No land area found.
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5738.

🔄 Processing [1627/2114] index 5739...


Llama.generate: prefix-match hit


✅ Extracted land_area: 936.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5739.

🔄 Processing [1628/2114] index 5740...


Llama.generate: prefix-match hit


✅ Extracted land_area: 149.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5740.

🔄 Processing [1629/2114] index 5741...


Llama.generate: prefix-match hit


⚠️ No land area found.
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5741.

🔄 Processing [1630/2114] index 5742...


Llama.generate: prefix-match hit


✅ Extracted land_area: 696.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5742.

🔄 Processing [1631/2114] index 5743...


Llama.generate: prefix-match hit


✅ Extracted land_area: 403.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5743.

🔄 Processing [1632/2114] index 5744...


Llama.generate: prefix-match hit


✅ Extracted land_area: 3019.2
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5744.

🔄 Processing [1633/2114] index 5745...


Llama.generate: prefix-match hit


✅ Extracted land_area: 2745.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5745.

🔄 Processing [1634/2114] index 5746...


Llama.generate: prefix-match hit


✅ Extracted land_area: 130.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5746.

🔄 Processing [1635/2114] index 5747...


Llama.generate: prefix-match hit


✅ Extracted land_area: 200.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5747.

🔄 Processing [1636/2114] index 5748...


Llama.generate: prefix-match hit


✅ Extracted land_area: 13.5
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5748.

🔄 Processing [1637/2114] index 5749...


Llama.generate: prefix-match hit


✅ Extracted land_area: 67.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5749.

🔄 Processing [1638/2114] index 5750...


Llama.generate: prefix-match hit


✅ Extracted land_area: 288.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5750.

🔄 Processing [1639/2114] index 5751...


Llama.generate: prefix-match hit


⚠️ No land area found.
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5751.

🔄 Processing [1640/2114] index 5752...


Llama.generate: prefix-match hit


✅ Extracted land_area: 600.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5752.

🔄 Processing [1641/2114] index 5753...


Llama.generate: prefix-match hit


✅ Extracted land_area: 409.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5753.

🔄 Processing [1642/2114] index 5754...


Llama.generate: prefix-match hit


⚠️ No land area found.
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5754.

🔄 Processing [1643/2114] index 5755...


Llama.generate: prefix-match hit


⚠️ No land area found.
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5755.

🔄 Processing [1644/2114] index 5756...


Llama.generate: prefix-match hit


✅ Extracted land_area: 500.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5756.

🔄 Processing [1645/2114] index 5757...


Llama.generate: prefix-match hit


✅ Extracted land_area: 8000.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5757.

🔄 Processing [1646/2114] index 5758...


Llama.generate: prefix-match hit


✅ Extracted land_area: 691.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5758.

🔄 Processing [1647/2114] index 5759...


Llama.generate: prefix-match hit


✅ Extracted land_area: 396.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5759.

🔄 Processing [1648/2114] index 5760...


Llama.generate: prefix-match hit


✅ Extracted land_area: 293.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5760.

🔄 Processing [1649/2114] index 5761...


Llama.generate: prefix-match hit


✅ Extracted land_area: 105.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5761.

🔄 Processing [1650/2114] index 5762...


Llama.generate: prefix-match hit


✅ Extracted land_area: 632.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5762.

🔄 Processing [1651/2114] index 5763...


Llama.generate: prefix-match hit


✅ Extracted land_area: 176.4
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5763.

🔄 Processing [1652/2114] index 5764...


Llama.generate: prefix-match hit


✅ Extracted land_area: 8750.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5764.

🔄 Processing [1653/2114] index 5765...


Llama.generate: prefix-match hit


✅ Extracted land_area: 125.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5765.

🔄 Processing [1654/2114] index 5766...


Llama.generate: prefix-match hit


✅ Extracted land_area: 450.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5766.

🔄 Processing [1655/2114] index 5767...


Llama.generate: prefix-match hit


✅ Extracted land_area: 800.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5767.

🔄 Processing [1656/2114] index 5768...


Llama.generate: prefix-match hit


✅ Extracted land_area: 3801.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5768.

🔄 Processing [1657/2114] index 5769...


Llama.generate: prefix-match hit


✅ Extracted land_area: 132.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5769.

🔄 Processing [1658/2114] index 5770...


Llama.generate: prefix-match hit


✅ Extracted land_area: 1708.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5770.

🔄 Processing [1659/2114] index 5771...


Llama.generate: prefix-match hit


✅ Extracted land_area: 79.2
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5771.

🔄 Processing [1660/2114] index 5772...


Llama.generate: prefix-match hit


✅ Extracted land_area: 1902.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5772.

🔄 Processing [1661/2114] index 5773...


Llama.generate: prefix-match hit


✅ Extracted land_area: 716.0
💾 File '../../../data/processed/realestates_kh_filled_test.csv' saved with updated index 5773.

🔄 Processing [1662/2114] index 5774...


Llama.generate: prefix-match hit


KeyboardInterrupt: 