# HousingMind: Instruction Dataset Preview & Sample Fine-Tuning Prep

This notebook demonstrates how to load, inspect, and prepare HousingMind instruction data for use in training or fine-tuning a language model for housing policy applications.

In [None]:
import json
import pandas as pd
from pathlib import Path

## Load Instruction Data (JSONL)

In [None]:
instruction_path = Path("../instruction_data/housingmind_instructions.jsonl")

# Read JSONL into a DataFrame: each non-empty line is parsed as one JSON record.
# The encoding is set explicitly so decoding does not depend on the platform's
# default locale encoding.
records = []
with open(instruction_path, "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing empty line) are not records;
            # json.loads("") would raise JSONDecodeError.
            continue
        records.append(json.loads(line))

df = pd.DataFrame(records)
df.head()

## Data Schema Check

In [None]:
# Structural summary of the loaded frame: row count, column names,
# and the number of null values in each column.
n_records = len(df)
column_names = df.columns.tolist()
null_counts = df.isnull().sum()

print("Total Records:", n_records)
print("Columns:", column_names)
print(null_counts)

## Sample Prompt-Response Pair

In [None]:
# Draw one row for a quick visual check of a prompt/response pair.
# A fixed seed keeps the previewed pair identical across Restart & Run All;
# change SAMPLE_SEED to browse a different record.
SAMPLE_SEED = 42
sample = df.sample(1, random_state=SAMPLE_SEED).iloc[0]
print("🧠 Prompt:", sample["prompt"])
print("✅ Response:", sample["response"])

## Prepare Data for Alpaca-Style Fine-Tuning

In [None]:
def to_alpaca_format(df):
    """Convert prompt/response rows into Alpaca-style instruction records.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "prompt" column and a "response" column. Any other
        columns are ignored.

    Returns
    -------
    list of dict
        One dict per row, in row order, with keys "instruction" (the row's
        prompt), "input" (always the empty string) and "output" (the row's
        response).

    Raises
    ------
    KeyError
        If the frame has rows but lacks a "prompt" or "response" column.
    """
    if len(df) == 0:
        # No rows means no records, even when the columns are absent
        # (e.g. a frame built from an empty record list).
        return []
    # Walk the two columns in lockstep instead of using iterrows(), which
    # builds a full Series for every row just to read two fields.
    return [
        {
            "instruction": prompt,
            "input": "",
            "output": response,
        }
        for prompt, response in zip(df["prompt"], df["response"])
    ]

# Convert the whole instruction frame to Alpaca-style records.
alpaca_data = to_alpaca_format(df)

# Persist the records as indented JSON in the current working directory.
output_path = "formatted_housingmind_alpaca.json"
with open(output_path, "w") as out_file:
    json.dump(alpaca_data, out_file, indent=2)

## Next Steps
- Upload to Hugging Face Datasets
- Integrate with LangChain / RAG
- Expand dataset from raw_documents or lookup_tables