# 💰 Step 4: Extract Housing Price Mentions
Use regular expressions to find price-related mentions in the article text, such as:
- Dollar amounts (e.g., $850,000, CAD 1.2M)
- Percent changes (e.g., -5%, +12%)

In [None]:
import pandas as pd
import re

# Read the cleaned articles table from disk
df = pd.read_csv("../data/processed/cleaned_articles.csv")

# Merge every non-null "cleaned_content" value into one space-separated string
text = " ".join(df.loc[df["cleaned_content"].notna(), "cleaned_content"])

## 🧠 Define Regex Patterns

In [None]:
# Price mentions such as $850,000, CAD 1.2M or 900K.
# Every group is non-capturing so that re.findall() yields the complete
# matched text; with a capturing group it would return only that group.
price_pattern = (
    r"(?:\$|CAD\s?)?"              # optional currency marker
    r"(?:\d{1,3}(?:,\d{3})+|\d+)"  # comma-grouped digits, or a plain digit run
    r"(?:\.\d+)?"                  # optional decimal fraction
    r"(?:\s?[MK]\b)?"              # optional M/K suffix; must end a word
)

In [None]:
# Percentage figures such as -5%, +12% or 3.5%
percent_pattern = (
    r"[+-]?"       # optional explicit sign
    r"\d{1,3}"     # one to three integer digits
    r"(?:\.\d+)?"  # optional decimal fraction
    r"%"           # literal percent sign
)

## 📥 Extract Price Mentions from Text

In [None]:
# Collect the complete matched text of every hit. m.group(0) is used rather
# than re.findall() because findall() returns the capture-group contents
# (not the whole match) whenever the pattern contains a capturing group.
# strip() drops the optional whitespace the price pattern may match at the end.
prices = [m.group(0).strip() for m in re.finditer(price_pattern, text)]
percents = [m.group(0) for m in re.finditer(percent_pattern, text)]

# Quick sanity check: total counts plus the first few matches of each kind
print(f"Found {len(prices)} price mentions and {len(percents)} percent changes.")
print("Sample prices:", prices[:10])
print("Sample percents:", percents[:10])

## 📊 Optional: Save Extracted Data

In [None]:
# Write each list of matches to its own single-column CSV (no index column)
for column, values, out_path in (
    ("price_mentions", prices, "../data/processed/price_mentions.csv"),
    ("percent_changes", percents, "../data/processed/percent_changes.csv"),
):
    pd.DataFrame({column: values}).to_csv(out_path, index=False)