In [None]:
import pandas as pd
import json
import os
from llama_cpp import Llama

In [None]:
llm = Llama(
    model_path="D:/Ollama/models/mistral/mistral-7b-instruct-v0.2.Q2_K.gguf",
    n_ctx=2048,
    n_threads=4,
    verbose=True,
) 

In [None]:
csv_path = "../../data/raw/realestates_kh_v2.csv"  # ✅ Replace with your actual file path
progress_path = "../../data/processed/cleaned_structured_realestate.csv"

In [None]:
df = pd.read_csv(csv_path)

In [None]:
if os.path.exists(progress_path):
    df_progress = pd.read_csv(progress_path)

    def is_filled(row):
        for col in ["land_size", "house_size", "bedrooms", "bathrooms"]:
            val = str(row.get(col)).strip().lower()
            if val in ["", "na", "n/a", "null"] or pd.isna(val):
                return False
        return True

    processed_rows = [i for i, row in df_progress.iterrows() if is_filled(row)]
    print(f"🔁 Resuming from previous progress. Skipping {len(processed_rows)} rows.")
else:
    df_progress = df.copy()
    processed_rows = []

In [None]:

# === AI Extraction Function ===
def extract_with_ai(text):
    prompt = f"""You are a data extractor. From the text below, extract and clean:
- land_size (in sqm)
- house_size (in sqm)
- bedrooms
- bathrooms
If any value is not mentioned, return \"N/A\".

Format the output as JSON like this:
{{
  \"land_size\": \"...\",
  \"house_size\": \"...\",
  \"bedrooms\": \"...\",
  \"bathrooms\": \"...\"
}}

Text:
{text}

Answer:"""
    try:
        response = llm(prompt=prompt, max_tokens=150, stop=["}"])
        raw = response['choices'][0]['text'].strip() + "}"
        return json.loads(raw)
    except Exception as e:
        print(f"❌ Failed to extract from text: {e}")
        return {
            "land_size": "N/A",
            "house_size": "N/A",
            "bedrooms": "N/A",
            "bathrooms": "N/A"
        }

In [18]:
for i, row in df.iterrows():
    if i in processed_rows:
        continue

    info_text = row.get("information", "")
    if isinstance(info_text, str) and info_text.strip():
        print(f"\n🔄 Cleaning row {i+1}/{len(df)}...")
        cleaned = extract_with_ai(info_text)
        for key in ["land_size", "house_size", "bedrooms", "bathrooms"]:
            if pd.isna(row.get(key)) or str(row.get(key)).strip().lower() in ["", "na", "n/a", "null"]:
                if cleaned[key] != "N/A":
                    df_progress.at[i, key] = cleaned[key]

    # Save progress after each row
    df_progress.to_csv(progress_path, index=False)
    print(f"✅ Saved progress to '{progress_path}'")

print(f"\n✅ Cleaning complete. Final data saved to '{progress_path}'")

✅ Saved progress to '../../data/processed/cleaned_structured_realestate.csv'

🔄 Cleaning row 334/6369...


Llama.generate: prefix-match hit


✅ Saved progress to '../../data/processed/cleaned_structured_realestate.csv'

🔄 Cleaning row 335/6369...


Llama.generate: prefix-match hit


❌ Failed to extract from text: Expecting value: line 1 column 1 (char 0)
✅ Saved progress to '../../data/processed/cleaned_structured_realestate.csv'

🔄 Cleaning row 336/6369...


Llama.generate: prefix-match hit


✅ Saved progress to '../../data/processed/cleaned_structured_realestate.csv'

🔄 Cleaning row 337/6369...


Llama.generate: prefix-match hit


❌ Failed to extract from text: Expecting value: line 1 column 1 (char 0)
✅ Saved progress to '../../data/processed/cleaned_structured_realestate.csv'

🔄 Cleaning row 338/6369...


Llama.generate: prefix-match hit


✅ Saved progress to '../../data/processed/cleaned_structured_realestate.csv'

🔄 Cleaning row 339/6369...


Llama.generate: prefix-match hit


✅ Saved progress to '../../data/processed/cleaned_structured_realestate.csv'

🔄 Cleaning row 340/6369...


Llama.generate: prefix-match hit


✅ Saved progress to '../../data/processed/cleaned_structured_realestate.csv'

🔄 Cleaning row 341/6369...


Llama.generate: prefix-match hit


✅ Saved progress to '../../data/processed/cleaned_structured_realestate.csv'

🔄 Cleaning row 342/6369...


Llama.generate: prefix-match hit


✅ Saved progress to '../../data/processed/cleaned_structured_realestate.csv'

🔄 Cleaning row 343/6369...


Llama.generate: prefix-match hit


✅ Saved progress to '../../data/processed/cleaned_structured_realestate.csv'

🔄 Cleaning row 344/6369...


Llama.generate: prefix-match hit


❌ Failed to extract from text: Expecting value: line 1 column 1 (char 0)
✅ Saved progress to '../../data/processed/cleaned_structured_realestate.csv'

🔄 Cleaning row 345/6369...


Llama.generate: prefix-match hit


❌ Failed to extract from text: Expecting value: line 1 column 1 (char 0)
✅ Saved progress to '../../data/processed/cleaned_structured_realestate.csv'

🔄 Cleaning row 346/6369...


Llama.generate: prefix-match hit


✅ Saved progress to '../../data/processed/cleaned_structured_realestate.csv'

🔄 Cleaning row 347/6369...


Llama.generate: prefix-match hit


✅ Saved progress to '../../data/processed/cleaned_structured_realestate.csv'

🔄 Cleaning row 348/6369...


Llama.generate: prefix-match hit


✅ Saved progress to '../../data/processed/cleaned_structured_realestate.csv'

🔄 Cleaning row 349/6369...


Llama.generate: prefix-match hit


❌ Failed to extract from text: Expecting ',' delimiter: line 5 column 18 (char 79)
✅ Saved progress to '../../data/processed/cleaned_structured_realestate.csv'

🔄 Cleaning row 350/6369...


Llama.generate: prefix-match hit


KeyboardInterrupt: 