### References

https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/chroma.html


In [31]:
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
import langchain
from langchain.indexes import VectorstoreIndexCreator

import os
from langchain.chains import LLMChain
from langchain.llms.fake import FakeListLLM
from langchain import HuggingFaceHub
from langchain.llms import GPT4All,OpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import Document

# Determine the name of the environment variable you want to use for the OpenAPI key


In [10]:

os.environ['OPENAI_API_KEY'] = "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"
print(os.environ['OPENAI_API_KEY'])
loader = TextLoader('testarticles/articleIsrael.txt' )
index_creator = VectorstoreIndexCreator(
    vectorstore_cls=Chroma, 
    embedding=OpenAIEmbeddings(),
    text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
).from_loaders([loader])

index_creator.__class__.mro()
[attr for attr in dir(index_creator) if attr.startswith('__') is False]

Using embedded DuckDB without persistence: data will be transient


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


['Config',
 '_abc_impl',
 '_calculate_keys',
 '_copy_and_set_values',
 '_decompose_class',
 '_enforce_dict_if_root',
 '_get_value',
 '_init_private_attributes',
 '_iter',
 'construct',
 'copy',
 'dict',
 'from_orm',
 'json',
 'parse_file',
 'parse_obj',
 'parse_raw',
 'query',
 'query_with_sources',
 'schema',
 'schema_json',
 'update_forward_refs',
 'validate',
 'vectorstore']

In [3]:


prompt_template = """Use the context below to write a 100 word blog post about the topic below:
    Context: {context}
    Topic: {topic}
    Blog post:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "topic"]
)

llm = FakeListLLM(responses = ['France', 'Germany', 'Italy', 'Spain', 'United Kingdom', 'USA', 'China', 'India', 'Japan', 'Russia'])
llm = HuggingFaceHub()

chain = LLMChain(llm=llm, prompt=PROMPT)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def getattributes(obj):
    return [attr for attr in dir(obj) if attr.startswith('__') is False]

def generate_blog_post(topic):
    docs = index_creator.vectorstore.similarity_search(topic, k=4)
    for k in range(4):
        print(docs[k].page_content[:50])
    inputs = [{"context": doc.page_content, "topic": topic} for doc in docs]
    return chain.apply(inputs)

generate_blog_post('donor countries promised')

In 2009, donor countries promised to mobilize $100
The World Bank and the donor countries that contro
Getting new money in the door is important, but it
For years, climate financing took a back seat to t
[{'text': ' What'}, {'text': ' In'}, {'text': ' Give'}, {'text': ' In'}]


### Chroma VectorStore Testing 

In [18]:
from langchain.schema import Document
def getDocs():
    for file in os.listdir():
        if file.endswith(".txt"):
            with open(file, "r") as f:
                github_url = f"{file}"
                yield Document(page_content=f.read(), metadata={"source": github_url})


In [15]:

sources = getDocs()

source_chunks = []
splitter = CharacterTextSplitter(separator=" ", chunk_size=512, chunk_overlap=0)
for source in sources:
    print(source.metadata)
    print(source.page_content[:50])
    for chunk in splitter.split_text(source.page_content):
        print(len(chunk))
        source_chunks.append(Document(page_content=chunk, metadata=source.metadata))
        

NameError: name 'getDocs' is not defined

In [16]:
source_chunks

NameError: name 'source_chunks' is not defined

In [17]:


search_index = Chroma.from_documents(source_chunks, OpenAIEmbeddings(), persist_directory = 'dbdir')
search_index.persist()


NameError: name 'source_chunks' is not defined

In [14]:
getattributes(search_index)

NameError: name 'getattributes' is not defined

### Adding new docs 

In [10]:
search_index.add_texts(["Ankush went to Princeton"])
search_index.add_documents([Document(page_content="Ankush went to Princeton", metadata={'source': 'sentence1'})])


['3be928ca-d47e-11ed-8349-22414b0296f5']

### Testing Similarity Scores

In [11]:
print(getattributes(search_index))

['_LANGCHAIN_DEFAULT_COLLECTION_NAME', '_abc_impl', '_client', '_client_settings', '_collection', '_embedding_function', '_persist_directory', 'add_documents', 'add_texts', 'as_retriever', 'delete_collection', 'from_documents', 'from_texts', 'max_marginal_relevance_search', 'max_marginal_relevance_search_by_vector', 'persist', 'similarity_search', 'similarity_search_by_vector', 'similarity_search_with_score']


In [12]:
topic = 'love'
docs = search_index.similarity_search(topic, k=4)


### New index from saved pickle

In [13]:
llm = HuggingFaceHub()
chain = LLMChain(llm=llm, prompt=PROMPT)
def generatefromsaved():
    docs = index_creator.vectorstore.similarity_search(topic, k=4)
    inputs = [{"context": doc.page_content, "topic": topic} for doc in docs]
    print(chain.apply(inputs))

In [16]:
vectorstore2 =  Chroma(persist_directory='dbdir', embedding_function=OpenAIEmbeddings())


Using embedded DuckDB with persistence: data will be stored in: dbdir


In [6]:
template = """Question: {question}

Answer: Let's think step by step."""

prompt = PromptTemplate(template=template, input_variables=["question"])

In [None]:
from langchain.llms import LlamaCpp

llm = LlamaCpp(model_path="./ggml-model-q4_0.bin")


In [27]:
def getDocs():
    for file in os.listdir():
        if file.endswith(".txt"):
            with open(file, "r") as f:
                github_url = f"{file}"
                yield Document(page_content=f.read(), metadata={"source": github_url})

sources = getDocs()
source_chunks = []
splitter = CharacterTextSplitter(separator=" ", chunk_size=512, chunk_overlap=0)
for source in sources:
    print(source.metadata)
    print(source.page_content[:50])
    for chunk in splitter.split_text(source.page_content):
        print(len(chunk))
        source_chunks.append(Document(page_content=chunk, metadata=source.metadata))
        
vectorstore = Chroma.from_documents(getDocs(), OpenAIEmbeddings(), persist_directory = 'dbdir2')

Using embedded DuckDB with persistence: data will be stored in: dbdir2


{'source': 'TODO.txt'}
- update getChunks. From a number of temp .txt fil
501
{'source': 'article.txt'}
Humanity Is Facing a Great Injustice. The World Ba
507
507
508
510
508
507
508
511
508
509
65
{'source': 'article4.txt'}
Le mod`ele lin ́eaire est souvent le premier outil
511
503
502
90
{'source': 'article2.txt'}
The Income Gap Is Becoming a Physical-Activity Div
508
510
510
510
507
509
507
511
510
288
{'source': 'article3.txt'}
Israel Is Courting Disaster
March 5, 2023
By Micha
512
510
508
508
499
512
512
511
510
504
510
507
222


### Counting number of documents

In [5]:
vectorstore = Chroma(embedding_function =  OpenAIEmbeddings(), persist_directory = 'dbdir')
vectorstore.persist()


Using embedded DuckDB with persistence: data will be stored in: dbdir


In [11]:
vectorstore.add_documents([Document(page_content="Ankush went to Princeton a fourht time!", metadata={'source': 'sentence4'})])

['f879a872-d69d-11ed-9594-22414b0296f5']

In [12]:
vectorstore.persist()

In [13]:
vectorstore._client._count('langchain')

14

In [54]:
docs1 = Document(page_content="Ankush went to Princeton a fourht time!", metadata={'source': 'sentence4'})

In [None]:
docs1.page_content

### Get some random content from the vectorstore - Vector DB Text Generation


In [33]:
vectorstore = Chroma(embedding_function =  OpenAIEmbeddings(), persist_directory = 'dbdir')
vectorstore.persist()
prompt = 'give me a summary of the data i gave you'
huggingface_hub_api_key = "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"
os.environ['HUGGINGFACEHUB_API_TOKEN'] = huggingface_hub_api_key

prompt_template = """Use the context below to write a 400 word blog post about the topic below:
    Context: {context}
    Topic: {topic}
    Blog post:"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "topic"]
)

llm = HuggingFaceHub()
llm_gpt4all =  GPT4All(model="../../gpt4all_model/gpt4all-converted.bin")
llm_openai = OpenAI()
chain = LLMChain(llm=llm_gpt4all, prompt=PROMPT)
topic = 'usa'
docs = vectorstore.similarity_search(topic, k=4)
inputs = [{"context": doc.page_content, "topic": topic} for doc in docs]
print(inputs)
print(len(inputs))
print(chain.apply(inputs))



[2023-04-09 08:48:54,866] {posthog.py:15} INFO - Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
[2023-04-09 08:48:54,869] {__init__.py:80} INFO - Running Chroma using direct local API.
[2023-04-09 08:48:54,924] {duckdb.py:430} INFO - loaded in 14 embeddings
[2023-04-09 08:48:54,926] {duckdb.py:440} INFO - loaded in 1 collections
[2023-04-09 08:48:54,927] {duckdb.py:85} INFO - collection with name langchain already exists, returning existing collection
[2023-04-09 08:48:54,929] {duckdb.py:445} INFO - PersistentDuckDB del, about to run persist
[2023-04-09 08:48:54,929] {duckdb.py:388} INFO - Persisting DB to disk, putting it in the save folder: dbdir
[2023-04-09 08:48:54,936] {duckdb.py:388} INFO - Persisting DB to disk, putting it in the save folder: dbdir


llama_model_load: loading model from '../../gpt4all_model/gpt4all-converted.bin' - please wait ...
llama_model_load: n_vocab = 32001
llama_model_load: n_ctx   = 512
llama_model_load: n_embd  = 4096
llama_model_load: n_mult  = 256
llama_model_load: n_head  = 32
llama_model_load: n_layer = 32
llama_model_load: n_rot   = 128
llama_model_load: f16     = 2
llama_model_load: n_ff    = 11008
llama_model_load: n_parts = 1
llama_model_load: type    = 1
llama_model_load: ggml map size = 4017.70 MB
llama_model_load: ggml ctx size =  81.25 KB
llama_model_load: mem required  = 5809.78 MB (+ 2052.00 MB per state)
llama_model_load: loading tensors from '../../gpt4all_model/gpt4all-converted.bin'
llama_model_load: model size =  4017.27 MB / num tensors = 291
llama_init_from_file: kv self size  =  512.00 MB


[{'context': 'Constitution is not perfect — no law is — but its\nmany checks and balances have been essential to protecting and advancing fundamental rights and\nmaintaining national stability. It was only through those safeguards that the United States has managed to\nwithstand extreme shocks to our democracy in recent years — including a disgraceful attempt to prevent\nthe peaceful transfer of power — without a catastrophic fracturing.\nIn withstanding those shocks, the United States also has had a luxury that Israel does', 'topic': 'usa'}, {'context': 'partly on a relationship with the United\nStates built on shared values — freedom, equality, democracy — that can only be sustained by a\ncommitment to the rule of law, including an independent judiciary capable of upholding it. If Israel\nretreats from that long-term commitment and moves its model of governance toward one that mirrors\nthose of authoritarian countries, it risks weakening its ties to the United States and other free n

llama_generate: seed = 1681022935

system_info: n_threads = 4 / 8 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | 
sampling: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.300000
generate: n_ctx = 512, n_batch = 1, n_predict = 256, n_keep = 0




In [19]:
vectorstore.__class__.mro()

[langchain.vectorstores.chroma.Chroma,
 langchain.vectorstores.base.VectorStore,
 abc.ABC,
 object]