### References

https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/chroma.html


In [62]:
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter

from langchain.indexes import VectorstoreIndexCreator

import os

from getpass import getpass

# Names of the environment variables that hold the OpenAI API key and the
# Hugging Face Hub token.
env_var_name = "OPENAI_API_KEY"
env_var_name_huggingface = "HUGGINGFACEHUB_API_TOKEN"

# SECURITY: credentials must never be written into the notebook source.
# Any key that was ever committed here has to be considered leaked and
# should be revoked with the provider. Each value is taken from the
# environment when already set; otherwise it is prompted for without echo.
key_value = os.environ.get(env_var_name) or getpass(f"{env_var_name}: ")
keyvalue_huggingface = os.environ.get(env_var_name_huggingface) or getpass(
    f"{env_var_name_huggingface}: "
)

# Export both values so code that looks them up in the process environment
# (presumably the OpenAI / Hugging Face clients used below) can find them.
os.environ[env_var_name] = key_value
os.environ[env_var_name_huggingface] = keyvalue_huggingface

# Build a Chroma-backed index over article.txt, split into 1000-character
# chunks with no overlap and embedded with OpenAI embeddings.
# NOTE: despite its name, `index_creator` holds the object returned by
# `from_loaders(...)`, not the VectorstoreIndexCreator itself.
loader = TextLoader('article.txt')
index_creator = VectorstoreIndexCreator(
    vectorstore_cls=Chroma,
    embedding=OpenAIEmbeddings(),
    text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0),
).from_loaders([loader])



Using embedded DuckDB without persistence: data will be transient


In [34]:
# Inspect the index object. Only the last expression of a cell is displayed,
# so the MRO computed on the next line is evaluated but not shown.
index_creator.__class__.mro()
# Non-dunder attribute names; `not` replaces the former `is False` identity
# comparison against a boolean.
[attr for attr in dir(index_creator) if not attr.startswith('__')]

['Config',
 '_abc_impl',
 '_calculate_keys',
 '_copy_and_set_values',
 '_decompose_class',
 '_enforce_dict_if_root',
 '_get_value',
 '_init_private_attributes',
 '_iter',
 'construct',
 'copy',
 'dict',
 'from_orm',
 'json',
 'parse_file',
 'parse_obj',
 'parse_raw',
 'query',
 'query_with_sources',
 'schema',
 'schema_json',
 'update_forward_refs',
 'validate',
 'vectorstore']

In [69]:
from langchain.chains import LLMChain
from langchain.llms.fake import FakeListLLM
from langchain import HuggingFaceHub
from langchain.prompts import PromptTemplate

# Prompt asking the model for a short blog post grounded in a retrieved
# context chunk. The template text (including its indentation) is sent to
# the model verbatim.
prompt_template = """Use the context below to write a 100 word blog post about the topic below:
    Context: {context}
    Topic: {topic}
    Blog post:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "topic"]
)

# Choose the LLM explicitly. Previously the fake LLM was constructed and then
# immediately overwritten by the Hugging Face one, leaving a dead assignment.
# Flip this flag to run the chain offline with canned responses.
USE_FAKE_LLM = False

if USE_FAKE_LLM:
    llm = FakeListLLM(responses = ['France', 'Germany', 'Italy', 'Spain', 'United Kingdom', 'USA', 'China', 'India', 'Japan', 'Russia'])
else:
    # NOTE(review): no model is specified, so the library default is used;
    # the recorded outputs (' What', ' In') suggest it is a poor fit for this
    # prompt - consider passing an explicit model.
    llm = HuggingFaceHub()

chain = LLMChain(llm=llm, prompt=PROMPT)

In [72]:
def getattributes(obj):
    """Return the attribute names of *obj* that do not start with ``'__'``.

    Names are taken from ``dir(obj)`` and keep its order. Single-underscore
    names are included; only dunder-prefixed names are dropped.
    """
    # `not ...` instead of `... is False`: booleans should not be compared
    # by identity.
    return [attr for attr in dir(obj) if not attr.startswith('__')]

def generate_blog_post(topic, k=4):
    """Print blog-post generations for *topic*, one per retrieved chunk.

    Looks up the ``k`` chunks most similar to *topic* in
    ``index_creator.vectorstore``, prints the first 50 characters of each,
    then runs the module-level ``chain`` once per chunk and prints the list
    of results.

    Args:
        topic: Query text; also substituted into the prompt's ``topic`` slot.
        k: Number of chunks to request from the vector store (default 4).

    Returns:
        None. All output is printed.
    """
    docs = index_creator.vectorstore.similarity_search(topic, k=k)
    # Iterate over what was actually returned: the store may hold fewer than
    # ``k`` chunks, in which case indexing 0..k-1 would raise IndexError.
    for doc in docs:
        print(doc.page_content[:50])
    inputs = [{"context": doc.page_content, "topic": topic} for doc in docs]
    print(chain.apply(inputs))

# Smoke-test the retrieval + generation pipeline with a phrase taken from the
# indexed article; the function prints its results and returns nothing.
generate_blog_post('donor countries promised')

In 2009, donor countries promised to mobilize $100
The World Bank and the donor countries that contro
Getting new money in the door is important, but it
For years, climate financing took a back seat to t
[{'text': ' What'}, {'text': ' In'}, {'text': ' What'}, {'text': ' In'}]


### Chroma VectorStore Testing 

In [75]:
from langchain.schema import Document
def getDocs(directory="."):
    """Yield one ``Document`` per ``.txt`` file found directly in *directory*.

    Args:
        directory: Folder to scan (not recursive). Defaults to the current
            working directory, which is what the zero-argument call scans.

    Yields:
        Document: ``page_content`` is the full file text and ``metadata`` is
        ``{"source": <file name>}``.
    """
    # os.listdir() returns entries in arbitrary order; sort so that repeated
    # runs produce the documents (and hence the chunks) in the same order.
    for file in sorted(os.listdir(directory)):
        if not file.endswith(".txt"):
            continue
        # Decode explicitly as UTF-8: the articles contain non-ASCII
        # punctuation and the platform default encoding is not always UTF-8.
        with open(os.path.join(directory, file), "r", encoding="utf-8") as f:
            text = f.read()
        # Yield after the file is closed so no handle stays open while the
        # consumer processes the document.
        yield Document(page_content=text, metadata={"source": file})


In [85]:

# Split every loaded text file into space-separated chunks of at most 512
# characters (no overlap) and wrap each chunk in its own Document.
sources = getDocs()

source_chunks = []
splitter = CharacterTextSplitter(separator=" ", chunk_size=512, chunk_overlap=0)
for source in sources:
    # Progress output: which file is being processed and how it starts.
    print(source.metadata)
    print(source.page_content[:50])
    for chunk in splitter.split_text(source.page_content):
        print(len(chunk))
        # Give each chunk its own metadata dict. Passing source.metadata
        # directly would make all chunks of a file share one dict object, so
        # a later per-chunk metadata change would silently affect them all.
        source_chunks.append(
            Document(page_content=chunk, metadata=dict(source.metadata))
        )
        

{'source': 'article.txt'}
Humanity Is Facing a Great Injustice. The World Ba
507
507
508
510
508
507
508
511
508
509
65
{'source': 'article2.txt'}
The Income Gap Is Becoming a Physical-Activity Div
508
510
510
510
507
509
507
511
510
288
{'source': 'article3.txt'}
Israel Is Courting Disaster
March 5, 2023
By Micha
512
510
508
508
499
512
512
511
510
504
510
507
222


In [86]:
# Display the list of chunk Documents built in the previous cell.
# NOTE(review): this renders every chunk in full; a slice such as
# source_chunks[:3] would keep the output readable.
source_chunks

[Document(page_content='Humanity Is Facing a Great Injustice. The World Bank Must Respond.\nMarch 18, 2023\n\nBy The Editorial Board\n\nIt’s one of the great injustices of this era that countries contributing negligible amounts to global carbon emissions are now feeling the most harrowing impacts of climate change. Pakistan, which makes up less than 1 percent of the world’s carbon footprint, had a third of its territory under water in last year’s floods. Parts of Kenya, Ethiopia and Somalia are experiencing the worst drought in 70', metadata={'source': 'article.txt'}),
 Document(page_content='years of record-keeping, threatening millions with famine, even though the entire continent of Africa contributes less than 4 percent of global carbon emissions. \n\nThe World Bank and the donor countries that control it can do more to step up and tackle this generational challenge. To make the World Bank and other multilateral lending institutions fit for purpose in the 21st century, leaders need

In [100]:
import pickle

# Embed every chunk with OpenAI embeddings and load the result into a Chroma
# collection whose data lives in the local 'dbdir' directory.
search_index = Chroma.from_documents(
    source_chunks,
    OpenAIEmbeddings(),
    persist_directory='dbdir',
)

# Write the freshly built collection to the persist directory.
search_index.persist()


Using embedded DuckDB with persistence: data will be stored in: dbdir


['_LANGCHAIN_DEFAULT_COLLECTION_NAME', '_abc_impl', '_client', '_client_settings', '_collection', '_embedding_function', '_persist_directory', 'add_documents', 'add_texts', 'as_retriever', 'delete_collection', 'from_documents', 'from_texts', 'max_marginal_relevance_search', 'max_marginal_relevance_search_by_vector', 'persist', 'similarity_search', 'similarity_search_by_vector', 'similarity_search_with_score']


In [92]:
# List the non-dunder attribute names of the Chroma index to see which
# operations it exposes.
getattributes(search_index)

['_LANGCHAIN_DEFAULT_COLLECTION_NAME',
 '_abc_impl',
 '_client',
 '_client_settings',
 '_collection',
 '_embedding_function',
 '_persist_directory',
 'add_documents',
 'add_texts',
 'as_retriever',
 'delete_collection',
 'from_documents',
 'from_texts',
 'max_marginal_relevance_search',
 'max_marginal_relevance_search_by_vector',
 'persist',
 'similarity_search',
 'similarity_search_by_vector',
 'similarity_search_with_score']

### Adding new docs 

In [108]:
# Two ways of adding content to the existing index: raw strings via
# add_texts, or Document objects via add_documents.
# NOTE(review): both calls insert the same sentence, so the index presumably
# ends up with two copies of it. Only the last call's return value is
# displayed by the cell. persist() is not called again here - confirm whether
# that is needed for the additions to reach 'dbdir'.
search_index.add_texts(["Ankush went to Princeton"])
search_index.add_documents([Document(page_content="Ankush went to Princeton", metadata={})])


['55600658-d45f-11ed-821f-22414b0296f5']

### Testing Similarity Scores

In [109]:
# Print the non-dunder attribute names of the index as a single-line list.
# NOTE(review): this repeats the earlier getattributes(search_index) cell.
print(getattributes(search_index))

['_LANGCHAIN_DEFAULT_COLLECTION_NAME', '_abc_impl', '_client', '_client_settings', '_collection', '_embedding_function', '_persist_directory', 'add_documents', 'add_texts', 'as_retriever', 'delete_collection', 'from_documents', 'from_texts', 'max_marginal_relevance_search', 'max_marginal_relevance_search_by_vector', 'persist', 'similarity_search', 'similarity_search_by_vector', 'similarity_search_with_score']


In [113]:
# Query the index for the (up to) 4 chunks most similar to the topic.
# NOTE(review): the section title mentions similarity scores, but this call
# presumably returns documents only; the index also exposes
# similarity_search_with_score (see the attribute listing) - confirm which
# one is intended.
topic = 'love'
docs = search_index.similarity_search(topic, k=4)
