# UTILITIES:

In [None]:
! pip install bertopic datasets -q
! pip install langchain -q
! pip install sentence-transformers chromadb openai -q

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
import numpy as np
import pandas as pd
import re
from pprint import pprint as pp

# BERTOPIC:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

# LANCHAIN, CHROMADB & OPENAI:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

import openai
from openai import OpenAI

## BERTOPIC FUNCS:

In [None]:
def f_make_text_chunks(raw_txt):

    txt = raw_txt.replace('\ufeff', '')
    txt_chunks = re.split('\.\s*\n', txt)
    txt_chunks = [txt_chunk.replace('\n', '') for txt_chunk in txt_chunks]
    return txt_chunks


def f_get_topics_dict(text_chunks, topic_model, ner_model, n_top = 10):

    df_docs = topic_model.get_document_info(text_chunks)

    topic_dict = {}
    for i in topic_model.get_topics().keys():

        filter = df_docs.Topic == i
        df_top_docs = df_docs[filter].sort_values(by = 'Probability', ascending = False)
        df_top_docs = df_top_docs.iloc[:n_top].Document

        keywords = topic_model.get_topic(i)
        keywords = [pair[0] for pair in keywords]
        keywords = set([i.lower() for i in keywords])

        ner_words = ner_model(list(df_top_docs.values))
        ner_words_ = [i[0] for i in ner_words if len(i) > 0]
        ner_words_ = set([i['word'] for i in ner_words_])
        ner_words_ = set([i.lower() for i in ner_words_])

        topic_words = list(ner_words_ | keywords)
        topic_dict[i] = topic_words

    return topic_dict



def f_display_topics(topic_dict):

    for i in topic_dict:
        print(f'TOPIC: {i}')
        print('---------')
        topics = topic_dict[i]
        topics = ", ".join(topics)
        pp(topics)
        print()
        print()




def f_get_topic_docs(topic_n, topic_model):

    df_topics = topic_model.get_document_info(txt_chunks)
    df_topics.columns = [i.lower() for i in df_topics.columns]

    df_representative = (df_topics
    [df_topics.topic == topic_n]
    [df_topics.representative_document == True]
    .sort_values(by = 'probability', ascending = False)
    .document)

    df_not_representative = (df_topics
    [df_topics.topic == topic_n]
    [df_topics.representative_document == False]
    .sort_values(by = 'probability', ascending = False)
    .document)

    df_topic_docs = (pd.concat
     ([df_representative, df_not_representative])
    .reset_index(drop = True))

    return df_topic_docs




def f_explore_topic_docs(topic_n, topic_model, ner_model, txt_chunks, n_samples = 3):

    topic_dict = f_get_topics_dict(txt_chunks, topic_model, ner_model, n_top = 10)
    df_top_docs = f_get_topic_docs(topic_n, topic_model)

    print('TOPIC KEYWORDS:')
    words = topic_dict[topic_n]
    words = ", ".join(words)
    pp(words)
    print()

    n_samples = n_samples if len(df_top_docs) >= n_samples else len(df_top_docs)

    top_docs = df_top_docs.iloc[:n_samples]
    for doc in top_docs:
        print('------------')
        pp(doc)
        print()

## TOPIC MODEL:

In [None]:
def f_fit_topic_model(txt_chunks, n_neighs_umap = 3, n_dims_umap = 10,
                   min_cluster_size = 3, dist_metric = 'euclidean'):

    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # Step 2 - Reduce dimensionality
    umap_model = UMAP(n_neighbors=n_neighs_umap,
                    n_components=n_dims_umap,
                    random_state=42,
                    min_dist=0.0,
                    metric='cosine')

    # Step 3 - Cluster reduced embeddings
    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size,
                            metric=dist_metric,
                            cluster_selection_method='eom',
                            prediction_data=True)

    # Step 4 - Tokenize topics
    vectorizer_model = CountVectorizer(stop_words="english")

    # Step 5 - Create topic representation
    ctfidf_model = ClassTfidfTransformer()

    # Step 6 - (Optional) Fine-tune topic representations with
    # a `bertopic.representation` model
    representation_model = KeyBERTInspired()

    # All steps together
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=representation_model
    )

    topics, probs = topic_model.fit_transform(txt_chunks)

    ner_model = pipeline("token-classification",
                          model="wizardofchance/ner-model",
                          aggregation_strategy="simple")

    topic_dict = f_get_topics_dict(txt_chunks, topic_model, ner_model, n_top = 10)

    return topic_model, ner_model, topic_dict

## RAG

In [None]:
def f_chroma_instantiations():

    embedding_function = SentenceTransformerEmbeddingFunction()

    character_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=1000,
        chunk_overlap=0)

    token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0,
                                                        tokens_per_chunk=256)

    return embedding_function, character_splitter, token_splitter



def f_chroma_input(topic_n, topic_model, character_splitter, token_splitter):

    topic_docs = f_get_topic_docs(topic_n, topic_model).to_list()
    topic_doc_consolidated  = '\n\n'.join(topic_docs)

    topic_chunks = character_splitter.split_text(topic_doc_consolidated)
    chroma_input_docs = []
    for chunk in topic_chunks:
        chroma_input_docs += token_splitter.split_text(chunk)

    return chroma_input_docs




def f_create_chroma_collection(topic_n, topic_model, collection_name,
                               chroma_client, embedding_function,
                               character_splitter, token_splitter):

    collection = chroma_client.create_collection(
                        collection_name,
                        embedding_function=embedding_function)



    # Add documents to the collection
    chroma_input_docs = f_chroma_input(topic_n, topic_model,
                                       character_splitter, token_splitter)
    ids = [str(i) for i in range(len(chroma_input_docs))]
    collection.add(ids=ids, documents=chroma_input_docs)

    return collection




def f_create_chroma_collections(topic_dict, topic_model,
                               chroma_client, embedding_function,
                               character_splitter, token_splitter):

    for i in chroma_client.list_collections():
        chroma_client.delete_collection(i.name)

    for topic_n in topic_dict:

        collection_name = f'topic_{topic_n}'

        f_create_chroma_collection(topic_n, topic_model, collection_name,
                                    chroma_client, embedding_function,
                                    character_splitter, token_splitter)
        print(f"""collection '{collection_name}' created sucessfully""")






def rag(query, prompt, retrieved_documents, openai_client, llm):

    information = "\n\n".join(retrieved_documents)
    user_content = f"Question: {query}. \n Information: {information}"

    messages = [{"role": "system", "content": prompt},
                {"role": "user", "content": user_content}]

    response = openai_client.chat.completions.create(
        model=llm,
        messages=messages)

    content = response.choices[0].message.content
    return content




def f_get_ans(query, topic_n, prompt, openai_client, llm, n_results = 10):

    collection_name = f'topic_{topic_n}'
    chroma_collection = chroma_client.get_collection(collection_name)

    results = chroma_collection.query(query_texts=[query], n_results=n_results)
    retrieved_documents = results['documents'][0]
    output = rag(query, prompt, retrieved_documents, openai_client, llm)

    print()
    print('ANSWER: ')
    print('--------')
    print(output)

***

In [None]:
from datasets import load_dataset

dataset = load_dataset("billsum")
dataset

Downloading readme:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/91.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 18949
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 3269
    })
    ca_test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1237
    })
})

In [22]:
df = pd.DataFrame(dataset['train'])
df.head()

Unnamed: 0,text,summary,title
0,SECTION 1. LIABILITY OF BUSINESS ENTITIES PROV...,Shields a business entity from civil liability...,A bill to limit the civil liability of busines...
1,SECTION 1. SHORT TITLE.\n\n This Act may be...,Human Rights Information Act - Requires certai...,Human Rights Information Act
2,SECTION 1. SHORT TITLE.\n\n This Act may be...,Jackie Robinson Commemorative Coin Act - Direc...,Jackie Robinson Commemorative Coin Act
3,SECTION 1. NONRECOGNITION OF GAIN WHERE ROLLOV...,Amends the Internal Revenue Code to provide (t...,To amend the Internal Revenue Code to provide ...
4,SECTION 1. SHORT TITLE.\n\n This Act may be...,Native American Energy Act - (Sec. 3) Amends t...,Native American Energy Act


# A. TEXT DOC:

In [28]:
txt_doc = df.iloc[4]['text']
print(txt_doc)

SECTION 1. SHORT TITLE.

    This Act may be cited as the ``Native American Energy Act''.

SEC. 2. TABLE OF CONTENTS.

    The table of contents for this Act is as follows:

Sec. 1. Short title.
Sec. 2. Table of contents.
Sec. 3. Appraisals.
Sec. 4. Standardization.
Sec. 5. Environmental reviews of major Federal actions on Indian lands.
Sec. 6. BLM oil and gas fees.
Sec. 7. Bonding requirements and nonpayment of attorneys' fees to 
                            promote Indian energy projects.
Sec. 8. Tribal biomass demonstration project.
Sec. 9. Tribal resource management plans.
Sec. 10. Leases of restricted lands for the Navajo Nation.
Sec. 11. Nonapplicability of certain rules.

SEC. 3. APPRAISALS.

    (a) Amendment.--Title XXVI of the Energy Policy Act of 1992 (25 
U.S.C. 3501 et seq.) is amended by adding at the end the following:

``SEC. 2607. APPRAISAL REFORMS.

    ``(a) Options to Indian Tribes.--With respect to a transaction 
involving Indian land or the trust assets of an Indi

# 1. FIT TOPIC MODEL TO TEXT:

In [29]:
txt_chunks = f_make_text_chunks(txt_doc)

len(txt_chunks)

63

In [69]:
kwargs = dict(n_neighs_umap = 3,
              n_dims_umap = 10,
              min_cluster_size = 6,
              dist_metric = 'euclidean')

%time topic_model, ner_model, topic_dict = f_fit_topic_model(txt_chunks, **kwargs)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


CPU times: user 15.4 s, sys: 3.34 s, total: 18.7 s
Wall time: 19.4 s


In [70]:
f_display_topics(topic_dict)

TOPIC: -1
---------
('injunction, pursuant, native american energy act, prevail, law, defendant, '
 'damages, land, indian, united states code, federal, energy, settlement, '
 'plaintiff')


TOPIC: 0
---------
('criteria, tribe, pursuant, national environmental policy act of 1969, '
 'approval, agreements, energy policy act of, tribes, indian, appraisal, '
 'federal, congress, regulations, amended')


TOPIC: 1
---------
('sustainability, navajo, biomass, tribal forest protection act of, lands, '
 'tribal, de, resource, agricultural, federal, harvested, resources, '
 'restricted, navajo nation, sustainable')


TOPIC: 2
---------
('oil, blm oil, inspection, department of the interior, secretary, oversight, '
 'blm, indian, acreage, gas, bureau of land management, permit, lands, fee, '
 'fees')




# 2. CREATE TOPIC BASED CHROMADB COLLECTIONS:
(LOCAL VECTOR_DBs)

In [38]:
embedding_function, character_splitter, token_splitter = f_chroma_instantiations()

In [71]:
chroma_client = chromadb.Client()

f_create_chroma_collections(topic_dict, topic_model, chroma_client,
                            embedding_function, character_splitter, token_splitter)

collection 'topic_-1' created sucessfully
collection 'topic_0' created sucessfully
collection 'topic_1' created sucessfully
collection 'topic_2' created sucessfully


# 3. OPENAI INSTANTIATIONS:

In [40]:
from google.colab import userdata
import os

os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
os.environ["HUGGINGFACE_API_KEY"] = userdata.get('HF_TOKEN')

openai_client = OpenAI()
llm = "gpt-3.5-turbo"

In [41]:
prompt = """\

You are a multi talented helpful assistant, \
who is an experts in multiple fields, some of them being: \
social science, economics, finance, world affairs, law, artificial intelligence, \
physics, chemistry, biology, and more. \
You are known for answering questions by first thinking about the question deeply \
and then synthesizing relevant information from a collection of documents. \
your answers are articulate and to the point. \
You make intense use of the following principle: \
---Do not answer to anything not asked in the question--- \
You will be shown the user's question, and the relevant set of \
documents to derive and synthesize your answer from. \

Answer the user's question using ONLY these set of documents, please keep your answer \
limited to what is specifically being asked and do not talk about anything more. \
Make doubly sure that you are sticking to the facts from the documents provided and \
that your answer does not contain any information about anything that is \
not specifically mentioned in the question.
"""

# 4. GET TOPIC BASED OVERVIEW OF THE DOCUMENT:  
(GIVES US CONTEXT FOR ASKING QUESTIONS)

In [72]:
f_display_topics(topic_dict)

TOPIC: -1
---------
('injunction, pursuant, native american energy act, prevail, law, defendant, '
 'damages, land, indian, united states code, federal, energy, settlement, '
 'plaintiff')


TOPIC: 0
---------
('criteria, tribe, pursuant, national environmental policy act of 1969, '
 'approval, agreements, energy policy act of, tribes, indian, appraisal, '
 'federal, congress, regulations, amended')


TOPIC: 1
---------
('sustainability, navajo, biomass, tribal forest protection act of, lands, '
 'tribal, de, resource, agricultural, federal, harvested, resources, '
 'restricted, navajo nation, sustainable')


TOPIC: 2
---------
('oil, blm oil, inspection, department of the interior, secretary, oversight, '
 'blm, indian, acreage, gas, bureau of land management, permit, lands, fee, '
 'fees')




# 6. QUERY ANY TOPIC:

### TOPIC 0:

In [73]:
topic_n = 0
query = "what does the doc say about: national environmental policy act of 1969?"

f_get_ans(query, topic_n, prompt, openai_client, llm)


ANSWER: 
--------
The document provides information about the amendment to the National Environmental Policy Act of 1969 related to major federal actions on Indian lands. Specifically, it mentions that for major federal actions on Indian lands of an Indian tribe requiring the preparation of a statement under the act, the statement shall only be available for review and comment by the members of the Indian tribe and individuals residing within the affected area. Additionally, it outlines the eligibility criteria for Indian tribes to enter into contracts or agreements for demonstration projects to promote biomass energy production. The document highlights the importance of environmental reviews of major federal actions on Indian lands and procedures for review and approval of appraisals related to Indian tribes' projects.


In [74]:
topic_n = 0
query = "what regulations does the document discuss about?"

f_get_ans(query, topic_n, prompt, openai_client, llm)


ANSWER: 
--------
The document discusses regulations related to the development and implementation of demonstration projects involving Indian tribes for biomass energy production. These regulations include ensuring that certain criteria are publicly available, consulting with Indian tribes in project development, entering into contracts with Indian tribes, developing regulations for implementing the section, and establishing standards for approving or disapproving appraisals. Additionally, the document mentions regulations for environmental reviews of major federal actions on Indian lands and time limits for secretarial review and action on tribal appraisals.


In [75]:
topic_n = 0
query = "what acts have been amended?"

f_get_ans(query, topic_n, prompt, openai_client, llm)


ANSWER: 
--------
The acts that have been amended include the National Environmental Policy Act of 1969, the Energy Policy Act of 1992, and the Act of August 9, 1955. These amendments cover various aspects such as review of major federal actions on Indian lands, appraisal reforms, demonstration projects for biomass energy production on Indian forest land, long-term leasing regulations including leases for mineral resources, criteria for demonstration projects, waiver of appraisal requirements by Indian tribes, and secretarial review and approval processes within specified time limits.


### TOPIC 1:

In [77]:
topic_n = 1
query = "what do the documents say about the tribal forest protection act? \
what other acts are mentioned in the documents"

f_get_ans(query, topic_n, prompt, openai_client, llm)




ANSWER: 
--------
The documents mention the Tribal Forest Protection Act of 2004, which includes provisions related to tribal resource management plans and a tribal biomass demonstration project. Other acts mentioned in the documents are the National Indian Forest Resources Management Act and the American Indian Agricultural Resource Management Act. Additionally, the leases of restricted lands for the Navajo Nation are referenced.


In [79]:
topic_n = 1
query = "what is the American Indian Agricultural Resource Management Act about?"

f_get_ans(query, topic_n, prompt, openai_client, llm)




ANSWER: 
--------
The American Indian Agricultural Resource Management Act pertains to tribal resource management plans and integrated resource management plans approved by the Secretary of the Interior. It allows activities conducted or resources harvested under these plans to be considered sustainable management practices for federal standards, benefits, or requirements related to demonstrating sustainability. Additionally, the Act includes provisions for a tribal biomass demonstration project and leases of restricted lands for the Navajo Nation.


In [78]:
topic_n = 1
query = "what is being said about navajo nation"

f_get_ans(query, topic_n, prompt, openai_client, llm)




ANSWER: 
--------
The information provided mentions that leases of restricted lands for the Navajo Nation are addressed in section 10. Additionally, concerning tribal resource management plans, any activities conducted or resources harvested pursuant to an approved tribal resource management plan are considered sustainable management practices for federal purposes unless explicitly exempted by federal law enacted after the specified date. The document also refers to the Tribal Biomass Demonstration Project, which is detailed in section 8.


### TOPIC 2

In [80]:
topic_n = 2
query = "what does the document say about the department of the interior?"

f_get_ans(query, topic_n, prompt, openai_client, llm)




ANSWER: 
--------
The document states that the Secretary of the Interior, under the Department of the Interior, is directed to implement procedures to ensure that agencies within the department involved in overseeing oil and gas activities on Indian lands use a uniform system of reference numbers and tracking systems for oil and gas wells. Additionally, any rule regarding hydraulic fracturing in oil or gas development will not impact lands held in trust for Native Americans without the explicit consent of the beneficiary. Furthermore, fees are not to be collected for certain activities related to oil and gas operations on Indian lands, such as permit applications, inspection activities, and non-producing acreage leases.


In [81]:
topic_n = 2
query = "What does the document say about the bureau of land management?"

f_get_ans(query, topic_n, prompt, openai_client, llm)




ANSWER: 
--------
The document states that the Bureau of Land Management, which operates under the Department of the Interior, is involved in the review, approval, and oversight of oil and gas activities on Indian lands. It mentions that the Bureau of Land Management, through the Secretary of the Interior, is not allowed to collect fees for certain activities related to oil and gas on Indian land, such as permit applications for drilling, oil or gas inspection activities, and nonproducing acreage on Indian land. Additionally, any rules regarding hydraulic fracturing must not affect land held in trust or restricted status for the benefit of Indians without their express consent.
