In [1]:
import pandas as pd
import time
import os
from llama_cpp import Llama

In [2]:
llm = Llama(
    model_path="D:/Ollama/models/mistral/mistral-7b-instruct-v0.2.Q2_K.gguf",
    n_ctx=2048,
    n_threads=4,
    verbose=True,
)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [3]:
csv_file_path = '../../../data/raw/realestates_kh_v2_1.csv'
output_file = '../../../data/processed/realestates_kh_filled.csv'

# Batch size for saving
batch_size = 100

In [4]:
# === Cell 2: Load Data and Prepare ===
try:
    df = pd.read_csv(csv_file_path, encoding='utf-8')
    print(f"Loaded '{csv_file_path}' with utf-8 encoding.")
except UnicodeDecodeError:
    df = pd.read_csv(csv_file_path, encoding='latin1')
    print(f"Loaded '{csv_file_path}' with latin1 encoding.")

# Make sure land_area column is numeric
df['land_area'] = pd.to_numeric(df['land_area'], errors='coerce')

# Filter rows where land_area is missing or zero
rows_to_process = df[(df['land_area'].isna()) | (df['land_area'] == 0)]
print(f"Found {len(rows_to_process)} rows with missing or zero land_area.")


Loaded '../../../data/raw/realestates_kh_v2_1.csv' with latin1 encoding.
Found 2994 rows with missing or zero land_area.


In [5]:
import re

def extract_land_area_from_text(text):
    prompt = f"""Extract only the land size in square meters (sqm) from the text below. 
If it mentions dimensions (e.g., "20m x 30m", "8.4 x 18 meters"), multiply to get sqm. 
If it says "98,514 sqm", return that number.
If no land size is present, respond only with: N/A

Text:
{text}
Answer:"""

    try:
        response = llm(prompt=prompt, max_tokens=32, stop=["\n"])
        result = response["choices"][0]["text"].strip()

        if result.lower() == 'n/a':
            return None

        # Use regex to find all valid numbers (ignore malformed)
        numbers = re.findall(r"\d+(?:\.\d+)?", result)
        if not numbers:
            return None

        # Pick the largest plausible value as land area (heuristic)
        land_area = max(float(n) for n in numbers)
        return land_area

    except Exception as e:
        print(f"❌ AI extraction failed: {e}")
        return None


In [6]:
# === Cell 4: Process rows with checkpointing and batch saving ===
batch = []
updated_count = 0

# Load processed indices if file exists
if os.path.exists(output_file):
    processed_df = pd.read_csv(output_file)
    # Assuming original index preserved - if not, adjust accordingly
    processed_indices = set(processed_df.index)
    print(f"Loaded {len(processed_indices)} processed rows from output file.")
else:
    processed_indices = set()

for i, (idx, row) in enumerate(rows_to_process.iterrows(), 1):
    if idx in processed_indices:
        print(f"⏭ Skipping already processed index {idx}.")
        continue

    print(f"\n🔄 Processing [{i}/{len(rows_to_process)}] index {idx}...")

    info_text = row['information']
    land_area = extract_land_area_from_text(info_text)

    if land_area is not None:
        df.at[idx, 'land_area'] = land_area
        updated_count += 1
        print(f"✅ Extracted land_area: {land_area}")
    else:
        print("⚠️ No land area found.")

    batch.append(df.loc[idx])

    if i % batch_size == 0 or i == len(rows_to_process):
        pd.DataFrame(batch).to_csv(
            output_file,
            mode='a',
            header=not os.path.exists(output_file),
            index=False
        )
        print(f"📥 Appended batch of {len(batch)} rows to '{output_file}'")
        batch = []

    time.sleep(0.05)  # Optional small delay to avoid overload

print(f"\n✅ Done! Total new rows updated this session: {updated_count}")


Loaded 1 processed rows from output file.

🔄 Processing [1/2994] index 3228...
⚠️ No land area found.

🔄 Processing [2/2994] index 3229...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [3/2994] index 3230...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [4/2994] index 3231...


Llama.generate: prefix-match hit


✅ Extracted land_area: 153.2

🔄 Processing [5/2994] index 3232...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [6/2994] index 3233...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [7/2994] index 3234...


Llama.generate: prefix-match hit


✅ Extracted land_area: 116.0

🔄 Processing [8/2994] index 3235...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [9/2994] index 3236...


Llama.generate: prefix-match hit


✅ Extracted land_area: 90.96

🔄 Processing [10/2994] index 3237...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [11/2994] index 3238...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [12/2994] index 3239...


Llama.generate: prefix-match hit


✅ Extracted land_area: 70.0

🔄 Processing [13/2994] index 3240...


Llama.generate: prefix-match hit


✅ Extracted land_area: 65.8

🔄 Processing [14/2994] index 3241...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [15/2994] index 3242...


Llama.generate: prefix-match hit


✅ Extracted land_area: 58.0

🔄 Processing [16/2994] index 3243...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [17/2994] index 3244...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [18/2994] index 3245...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [19/2994] index 3246...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [20/2994] index 3247...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [21/2994] index 3248...


Llama.generate: prefix-match hit


✅ Extracted land_area: 45.0

🔄 Processing [22/2994] index 3249...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [23/2994] index 3250...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [24/2994] index 3251...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [25/2994] index 3252...


Llama.generate: prefix-match hit


✅ Extracted land_area: 40.0

🔄 Processing [26/2994] index 3253...


Llama.generate: prefix-match hit


✅ Extracted land_area: 38.0

🔄 Processing [27/2994] index 3254...


Llama.generate: prefix-match hit


✅ Extracted land_area: 30.33

🔄 Processing [28/2994] index 3255...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [29/2994] index 3256...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [30/2994] index 3257...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [31/2994] index 3258...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [32/2994] index 3259...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [33/2994] index 3260...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [34/2994] index 3266...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [35/2994] index 3267...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [36/2994] index 3268...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [37/2994] index 3269...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [38/2994] index 3270...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [39/2994] index 3271...


Llama.generate: prefix-match hit


⚠️ No land area found.

🔄 Processing [40/2994] index 3272...


Llama.generate: prefix-match hit


KeyboardInterrupt: 