In [3]:
import requests
import os
from sklearn.datasets import fetch_20newsgroups
import json
from bertopic import BERTopic
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from lxml import etree
import regex as re
from nltk.tokenize import sent_tokenize, word_tokenize

In [58]:
# rule text
resp = requests.get("https://www.federalregister.gov/documents/full_text/xml/2024/04/10/2024-06218.xml")

In [2]:
def get_text(element):
    text = element.text.strip() if element.text else ''
    for child in element:
        text += get_text(child)
    return text

In [59]:
rule_text = re.sub(r'<[^<>]+>', '', resp.text)
rule_text = re.sub(r'\n|\s{2,}', ' ', rule_text)


In [8]:
summary = "This proposed rule would amend the regulations for certain HUD Public and Indian Housing and Housing Programs. The proposed amendments would revise existing regulations that govern admission for applicants with criminal records or a history of involvement with the criminal justice system and eviction or termination of assistance of persons on the basis of illegal drug use, drug-related criminal activity, or other criminal activity. The proposed revisions would require that prior to any discretionary denial or termination for criminal activity, PHAs and assisted housing owners take into consideration multiple sources of information, including but not limited to the recency and relevance of prior criminal activity. They are intended to minimize unnecessary exclusions from these programs while allowing providers to maintain the health, safety, and peaceful enjoyment of their residents, their staffs, and their communities. The proposed rule is intended to both clarify existing PHA and owner obligations and reduce the risk of violation of nondiscrimination laws."

In [9]:
with open("../HUD-2024-0031-0001_comments.json", "r") as f:
    comments = json.loads(f.read())

In [10]:
comment_text = []

for comment in comments:
    comment_text.append(comment['data']['attributes']['comment'])

# remove empty comment
docs = [comment for comment in comment_text if comment]
sentences = [sent_tokenize(comment) for comment in docs]
sentences = [sentence for doc in sentences for sentence in doc]

In [28]:
# hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

keybert_model = KeyBERTInspired()
# # MMR
# mmr_model = MaximalMarginalRelevance(diversity=0.3)

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

prompt = f"""
Here is a proposed rule change from the federal government: {summary}
The following documents represent public comments on this rule: [DOCUMENTS] 
Topics from these documents are described by the following keywords: [KEYWORDS]

Take these topics and create a single word label for each grouping.
"""
tokenizer_kwargs = {'truncation': True, 'max_length': 512}

generator = pipeline('text2text-generation', tokenizer=tokenizer, model=model, **tokenizer_kwargs)
flan_model = TextGeneration(generator, prompt=prompt, nr_docs=12)

models = {
    "KeyBERT": keybert_model,
    # "MMR": mmr_model,
    "Flan": flan_model
}

topic_model = BERTopic(vectorizer_model=vectorizer_model, representation_model=models)
topics, probs = topic_model.fit_transform(sentences)

In [32]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Flan,Representative_Docs
0,0,860,0_housing_criminal_people_individuals,"[housing, criminal, people, individuals, acces...","[hud housing, ensure criminal, criminal record...","[Housing is a human right, but too many people...","[<br/>&nbsp;<br/>Housing is a human right, but..."
1,1,56,1_tenancy_outcomes_evidence_housing makes,"[tenancy, outcomes, evidence, housing makes, c...","[criminal record, criminal records, conviction...",[br/>br/>A criminal record has no bearing on t...,[<br/><br/>A criminal record has no bearing on...
2,2,56,2_adopt lookback_hud adopt_urge hud_period years,"[adopt lookback, hud adopt, urge hud, period y...","[hud, hud adopt, lookback period, years regula...",[i urge hud to adopt a lookback period of no m...,"[<br/><br/>Further, I urge HUD to adopt a look..."
3,3,54,3_fair chance_chance housing_fair_chance,"[fair chance, chance housing, fair, chance, in...","[individualized review, public housing, housin...",[individualized review process will give peopl...,[The individualized review process will give p...
4,4,50,4_removed_contact_manner_criminal legal,"[removed, contact, manner, criminal legal, tre...","[lookback period, lookback, background checks,...","[importantly, a lookback period will ensure th...","[Importantly, a lookback period will ensure th..."
5,5,48,5_dignity_millions people_right direction_dign...,"[dignity, millions people, right direction, di...","[lookback period, lookback, human rights, rest...",[Enforcing a lookback period is a move in the ...,[Enforcing a lookback period is a move in the ...
6,6,47,6_surface_screening_biases_housing providers,"[surface, screening, biases, housing providers...","[lookback period, lookback, screening, individ...",[Requiring a lookback period will ensure unifo...,[Requiring a lookback period will ensure unifo...
7,7,33,7_increase housing_effects increase_homelessne...,"[increase housing, effects increase, homelessn...","[barriers housing, homelessness decrease, hous...","[In fact, barriers to housing have negative ef...","[In fact, barriers to housing have negative ef..."
8,8,22,8_right people_need criminal_access fundamenta...,"[right people, need criminal, access fundament...","[hud housing, people conviction, ensure crimin...",[I support HUD&rsquo;s proposed regulations th...,"[Housing is a human right, but too many people..."
9,9,18,9_negative_housing negative_negative effects_p...,"[negative, housing negative, negative effects,...","[barriers housing, homelessness decrease, hous...","[In fact, barriers to housing have negative ef...","[In fact, barriers to housing have negative ef..."


In [37]:
from llama_cpp import Llama

llama = Llama.from_pretrained(
    repo_id="TheBloke/zephyr-7B-alpha-GGUF",
    filename="zephyr-7b-alpha.Q4_K_M.gguf",
    n_ctx=4096,
    verbose=False
)

In [22]:
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")


tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [31]:
tokens = tokenizer(sentences, return_tensors="np")

In [35]:
tokens['input_ids']

array([array([    1,   382, 23291,   349,   264,  2930,  1103, 28723]),
       array([    1,  1387, 27440, 28770, 28774, 28745, 28713,   708, 13406,
                970,   478,   541,  3317,   354,   312, 23005,  5174,  1671,
               7501,   272,  5537,   298,  2647, 11854,  3208, 28725,  2490,
              11821, 28723])                                                ,
       array([    1,  5671,   283,  2742,   352,   704,   729,   483,   296,
               1999, 26336,  4777, 28725,  7402,  2164, 28725,  1756, 20164,
               5619, 28725,   799,   905,   302,  3181, 28725,   905,   395,
                704,  7773, 28725,   304,   799,  4264,  1944, 25286,  1332,
                304, 23759,  8065,  9750, 28725,   690, 26336,  1012,  3252,
                299,   302,   652,  1411, 28725,   304,  1671, 11854,  3208,
                737,  2887,   304, 11821,  2735, 28725,   272, 13290,   302,
                937,   313,   449,  1443,   460, 15702,  7483,   304,   264,
   

In [29]:
for token_set in tokens['input_ids']:
    if len(token_set) >= 512:
        print('too long')

In [38]:
from bertopic.representation import LlamaCPP

prompt = f"""
Q: Here is a proposed rule change from the federal government: {summary}
The following documents represent public comments on this rule: 
[DOCUMENTS] 

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the above information, can you give a short label of the topic?
A: 
"""
llama_args = {"max_tokens" : 512}
representation_model = LlamaCPP(llama, prompt=prompt)
topic_model = BERTopic(representation_model=representation_model)

In [39]:
topics, probs = topic_model.fit_transform(documents=sentences)

KeyboardInterrupt: 

In [7]:
# llama 3 testing - too slow to run locally
acess_token = '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'

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", token=acess_token)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B", token=acess_token)

generator = pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    temperature=0.1,
    max_new_tokens=500,
    repetition_penalty=1.1
)

prompt = "Could you explain to me how 4-bit quantization works as if I am 5?"
res = generator(prompt)