# 04 — NLP on Housing Texts

In [None]:

import pandas as pd, re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.decomposition import LatentDirichletAllocation

# Load the corpus; the code below reads a 'text' column from this file.
texts = pd.read_csv('data/policy_texts.csv')
# Weak keyword label: 1 when the text mentions any of the listed terms
# (case-insensitive substring match), 0 otherwise.
# na=False makes a missing 'text' cell count as "no match"; without it
# str.contains can yield NaN for that row and .astype(int) raises.
# NOTE(review): the label is derived from the same text that is later used as
# model features, so a classifier can recover it from the keywords alone --
# confirm this is the intended setup.
texts['label'] = texts['text'].str.contains(
    r'eviction|shelter|voucher|rehousing|legal', case=False, regex=True, na=False
).astype(int)

def clean(t):
    """Normalize raw text into lowercase, space-separated ASCII alphanumerics.

    Lowercases the input, replaces every character outside ``a-z``, ``0-9``
    and whitespace with a space, then collapses each whitespace run to a
    single space and trims both ends.

    Args:
        t: Text to normalize. Non-string values (for example ``None`` or the
            NaN pandas uses for an empty CSV cell) are treated as missing
            text.

    Returns:
        The normalized string, or ``''`` for missing / non-string input.
    """
    if not isinstance(t, str):
        # A missing cell would otherwise raise AttributeError on .lower()
        # and abort the whole Series.map call.
        return ''
    t = t.lower()
    # Lowercasing first lets a single character class cover both cases.
    t = re.sub(r'[^a-z0-9\s]', ' ', t)
    return re.sub(r'\s+', ' ', t).strip()

# Normalized copy of every document; the raw 'text' column is left untouched.
texts['clean'] = texts['text'].map(clean)

y = texts['label']
# Split the documents BEFORE vectorizing so the vocabulary and IDF weights are
# learned from the training portion only. Fitting the vectorizer on the full
# corpus lets test documents influence the features (data leakage) and makes
# the reported scores optimistic. The split keeps the original test_size,
# random_state and stratification labels.
docs_tr, docs_te, ytr, yte = train_test_split(
    texts['clean'], y, test_size=0.3, random_state=13, stratify=y
)

# Unigram + bigram TF-IDF features, fitted on the training documents.
tfidf = TfidfVectorizer(min_df=1, ngram_range=(1, 2))
Xtr = tfidf.fit_transform(docs_tr)
# Test documents are only transformed; n-grams unseen in training are dropped.
Xte = tfidf.transform(docs_te)
# Whole corpus in the train-fitted feature space, kept so the name X remains
# available to any later code that uses it.
X = tfidf.transform(texts['clean'])

clf = LogisticRegression(max_iter=500).fit(Xtr, ytr)
pred = clf.predict(Xte)
# Per-class precision / recall / F1 on the held-out 30%.
print(classification_report(yte, pred))

# Inspect the fitted classifier: list the features with the largest and the
# smallest coefficients in the first row of the coefficient matrix.
coefs = clf.coef_[0]
fn = tfidf.get_feature_names_out()
# Rank once (ascending by coefficient) and read both ends off the same order.
ranking = coefs.argsort()
top_neg = [fn[idx] for idx in ranking[:10]]
top_pos = [fn[idx] for idx in ranking[::-1][:10]]
print('Top positive indicators:', top_pos)
print('Top negative indicators:', top_neg)

# Topic modelling: fit a 3-component LDA on raw term counts of the cleaned
# texts (English stop words removed), then print the 8 highest-weighted terms
# of each component.
cv = CountVectorizer(stop_words='english', min_df=1)
Xc = cv.fit_transform(texts['clean'])
lda = LatentDirichletAllocation(n_components=3, random_state=0)
lda.fit(Xc)
terms = cv.get_feature_names_out()
for k, comp in enumerate(lda.components_):
    # Descending order of term weight, truncated to the first eight entries.
    heaviest = comp.argsort()[::-1][:8]
    labels = [terms[j] for j in heaviest]
    print(f'Topic {k+1}:', labels)
