# 📊 Step 3: Exploratory Data Analysis (EDA)
Explore word frequency, sentiment, and patterns in cleaned housing articles.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud
from textblob import TextBlob
import seaborn as sns
import re

# Read the cleaned articles produced by the earlier cleaning step
CLEANED_ARTICLES_CSV = "../data/processed/cleaned_articles.csv"
df = pd.read_csv(CLEANED_ARTICLES_CSV)

## 🔠 Word Frequency (Top 30 Words)

In [None]:
# Merge every non-null article into a single lowercase string
text = " ".join(df["cleaned_content"].dropna()).lower()
# Remove every character that is not an ASCII letter or whitespace
text = re.sub(r"[^a-zA-Z\s]", "", text)

# Split on whitespace and tally each token; keep the 30 most frequent
words = text.split()
word_freq = Counter(words)
common_words = word_freq.most_common(30)

# Bar chart with counts on the x axis and the words on the y axis
top_counts = [count for _, count in common_words]
top_words = [word for word, _ in common_words]
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_counts, y=top_words, ax=ax)
ax.set_title("Top 30 Most Common Words")
ax.set_xlabel("Frequency")
ax.set_ylabel("Word")
fig.tight_layout()
plt.show()

## ☁️ Word Cloud

In [None]:
# Build the word cloud from the combined article text prepared above
cloud_builder = WordCloud(width=800, height=400, background_color="white")
wordcloud = cloud_builder.generate(text)

# Show the cloud as an image with the axes hidden
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
ax.set_title("Word Cloud of Housing Articles")
plt.show()

## 😐 Sentiment Analysis (Polarity)

In [None]:
# Sentiment polarity per article (-1 negative to +1 positive).
# Rows with missing content are left as NaN instead of being scored:
# str(NaN) is the literal text "nan", so scoring it would count an article
# with no content as if it had real text. This matches the word-frequency
# step, which also drops missing content before analysis.
df["sentiment"] = (
    df["cleaned_content"]
    .map(lambda x: TextBlob(str(x)).sentiment.polarity, na_action="ignore")
    .astype(float)  # keep a numeric dtype even if every row is missing
)

# Plot sentiment for the articles that were actually scored
scored_sentiment = df["sentiment"].dropna()
plt.figure(figsize=(10, 4))
sns.histplot(scored_sentiment, bins=20, kde=True, color="skyblue")
plt.title("Distribution of Sentiment in Articles")
plt.xlabel("Polarity (-1 negative to +1 positive)")
plt.ylabel("Article Count")
plt.grid(True)
plt.tight_layout()
plt.show()