### References

https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/chroma.html


In [5]:
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter

from langchain.indexes import VectorstoreIndexCreator

import os
from langchain.chains import LLMChain
from langchain.llms.fake import FakeListLLM
from langchain import HuggingFaceHub
from langchain.prompts import PromptTemplate

# Determine the name of the environment variable you want to use for the OpenAPI key

os.environ['OPENAI_API_KEY'] = "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"
print(os.environ['OPENAI_API_KEY'])
loader = TextLoader('article.txt' )
index_creator = VectorstoreIndexCreator(
    vectorstore_cls=Chroma, 
    embedding=OpenAIEmbeddings(),
    text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
).from_loaders([loader])



Using embedded DuckDB without persistence: data will be transient


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


In [4]:
index_creator.__class__.mro()
[attr for attr in dir(index_creator) if attr.startswith('__') is False]

['Config',
 '_abc_impl',
 '_calculate_keys',
 '_copy_and_set_values',
 '_decompose_class',
 '_enforce_dict_if_root',
 '_get_value',
 '_init_private_attributes',
 '_iter',
 'construct',
 'copy',
 'dict',
 'from_orm',
 'json',
 'parse_file',
 'parse_obj',
 'parse_raw',
 'query',
 'query_with_sources',
 'schema',
 'schema_json',
 'update_forward_refs',
 'validate',
 'vectorstore']

In [3]:


prompt_template = """Use the context below to write a 100 word blog post about the topic below:
    Context: {context}
    Topic: {topic}
    Blog post:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "topic"]
)

llm = FakeListLLM(responses = ['France', 'Germany', 'Italy', 'Spain', 'United Kingdom', 'USA', 'China', 'India', 'Japan', 'Russia'])
llm = HuggingFaceHub()

chain = LLMChain(llm=llm, prompt=PROMPT)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def getattributes(obj):
    return [attr for attr in dir(obj) if attr.startswith('__') is False]

def generate_blog_post(topic):
    docs = index_creator.vectorstore.similarity_search(topic, k=4)
    for k in range(4):
        print(docs[k].page_content[:50])
    inputs = [{"context": doc.page_content, "topic": topic} for doc in docs]
    return chain.apply(inputs)

generate_blog_post('donor countries promised')

In 2009, donor countries promised to mobilize $100
The World Bank and the donor countries that contro
Getting new money in the door is important, but it
For years, climate financing took a back seat to t
[{'text': ' What'}, {'text': ' In'}, {'text': ' Give'}, {'text': ' In'}]


### Chroma VectorStore Testing 

In [18]:
from langchain.schema import Document
def getDocs():
    for file in os.listdir():
        if file.endswith(".txt"):
            with open(file, "r") as f:
                github_url = f"{file}"
                yield Document(page_content=f.read(), metadata={"source": github_url})


In [15]:

sources = getDocs()

source_chunks = []
splitter = CharacterTextSplitter(separator=" ", chunk_size=512, chunk_overlap=0)
for source in sources:
    print(source.metadata)
    print(source.page_content[:50])
    for chunk in splitter.split_text(source.page_content):
        print(len(chunk))
        source_chunks.append(Document(page_content=chunk, metadata=source.metadata))
        

NameError: name 'getDocs' is not defined

In [16]:
source_chunks

NameError: name 'source_chunks' is not defined

In [17]:


search_index = Chroma.from_documents(source_chunks, OpenAIEmbeddings(), persist_directory = 'dbdir')
search_index.persist()


NameError: name 'source_chunks' is not defined

In [14]:
getattributes(search_index)

NameError: name 'getattributes' is not defined

### Adding new docs 

In [10]:
search_index.add_texts(["Ankush went to Princeton"])
search_index.add_documents([Document(page_content="Ankush went to Princeton", metadata={'source': 'sentence1'})])


['3be928ca-d47e-11ed-8349-22414b0296f5']

### Testing Similarity Scores

In [11]:
print(getattributes(search_index))

['_LANGCHAIN_DEFAULT_COLLECTION_NAME', '_abc_impl', '_client', '_client_settings', '_collection', '_embedding_function', '_persist_directory', 'add_documents', 'add_texts', 'as_retriever', 'delete_collection', 'from_documents', 'from_texts', 'max_marginal_relevance_search', 'max_marginal_relevance_search_by_vector', 'persist', 'similarity_search', 'similarity_search_by_vector', 'similarity_search_with_score']


In [12]:
topic = 'love'
docs = search_index.similarity_search(topic, k=4)


### New index from saved pickle

In [13]:
llm = HuggingFaceHub()
chain = LLMChain(llm=llm, prompt=PROMPT)
def generatefromsaved():
    docs = index_creator.vectorstore.similarity_search(topic, k=4)
    inputs = [{"context": doc.page_content, "topic": topic} for doc in docs]
    print(chain.apply(inputs))

In [16]:
vectorstore2 =  Chroma(persist_directory='dbdir', embedding_function=OpenAIEmbeddings())


Using embedded DuckDB with persistence: data will be stored in: dbdir


In [6]:
template = """Question: {question}

Answer: Let's think step by step."""

prompt = PromptTemplate(template=template, input_variables=["question"])

In [9]:
from langchain.llms import LlamaCpp

llm = LlamaCpp(model_path="./ggml-model-q4_0.bin")


NameError: Could not load Llama model from path: ./ggml-model-q4_0.bin

In [12]:
search_index

NameError: name 'search_index' is not defined

In [27]:
def getDocs():
    for file in os.listdir():
        if file.endswith(".txt"):
            with open(file, "r") as f:
                github_url = f"{file}"
                yield Document(page_content=f.read(), metadata={"source": github_url})

sources = getDocs()
source_chunks = []
splitter = CharacterTextSplitter(separator=" ", chunk_size=512, chunk_overlap=0)
for source in sources:
    print(source.metadata)
    print(source.page_content[:50])
    for chunk in splitter.split_text(source.page_content):
        print(len(chunk))
        source_chunks.append(Document(page_content=chunk, metadata=source.metadata))
        
vectorstore = Chroma.from_documents(getDocs(), OpenAIEmbeddings(), persist_directory = 'dbdir2')

Using embedded DuckDB with persistence: data will be stored in: dbdir2


{'source': 'TODO.txt'}
- update getChunks. From a number of temp .txt fil
501
{'source': 'article.txt'}
Humanity Is Facing a Great Injustice. The World Ba
507
507
508
510
508
507
508
511
508
509
65
{'source': 'article4.txt'}
Le mod`ele lin ́eaire est souvent le premier outil
511
503
502
90
{'source': 'article2.txt'}
The Income Gap Is Becoming a Physical-Activity Div
508
510
510
510
507
509
507
511
510
288
{'source': 'article3.txt'}
Israel Is Courting Disaster
March 5, 2023
By Micha
512
510
508
508
499
512
512
511
510
504
510
507
222


### Counting number of documents

In [29]:
vectorstore.persist()

In [51]:
vectorstore.add_documents([Document(page_content="Ankush went to Princeton a fourht time!", metadata={'source': 'sentence4'})])

['4cc8a692-d529-11ed-969b-22414b0296f5']

In [52]:
vectorstore.persist()

In [53]:
collection = vectorstore._client.list_collections()[0]
vectorstore._client._count('langchain')

No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


8