# Multilanguage RAG filtering by multiple PDFs with LangChain and OpenAI

In [None]:
# lets install our super tools
%pip install -Uqq langchain-weaviate
%pip install openai tiktoken langchain

You must have a valid OpenAI API key in the `OPENAI_API_KEY` environment variable.

In [114]:
import weaviate, os

# Fail fast with a clear message instead of handing Weaviate an empty key.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook.")

# Start an embedded Weaviate instance.
# To use a Docker/local server instead, replace this call with
# weaviate.connect_to_local(headers=...); connect_to_embedded has no url parameter.
client = weaviate.connect_to_embedded(
    headers={
        "X-OpenAi-Api-Key": openai_api_key,  # your OpenAI API key
    }
)

print("Client is Ready?", client.is_ready())


Started /Users/dudanogueira/.cache/weaviate-embedded: process ID 40896
listen tcp :6060: bind: address already in use


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-04-17T10:08:51-03:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-04-17T10:08:51-03:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-04-17T10:08:51-03:00"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50050","time":"2024-04-17T10:08:51-03:00"}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2024-04-17T10:08:51-03:00"}


Client is Ready? True


{"level":"info","msg":"Completed loading shard wine_CeASlfMg7yUZ in 4.750833ms","time":"2024-04-17T10:08:52-03:00"}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-04-17T10:08:52-03:00","took":53084}
{"level":"info","msg":"Completed loading shard book_DVdagYKAPLa5 in 5.351292ms","time":"2024-04-17T10:08:52-03:00"}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-04-17T10:08:52-03:00","took":62666}
{"level":"info","msg":"Completed loading shard category_aompjF18Rnr4 in 5.571458ms","time":"2024-04-17T10:08:52-03:00"}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-04-17T10:08:52-03:00","took":55959}
{"level":"info","msg":"Completed loading shard wikipedialangchain_4jgiMLgBPfzc in 5.899583

## Let's create our class beforehand

In [115]:
from weaviate import classes as wvc
# Start from a clean slate: drop any previous copy of the collection.
client.collections.delete("WikipediaLangChain")

# Recreate it, explicitly selecting the OpenAI vectorizer and generative modules.
openai_vectorizer = wvc.config.Configure.Vectorizer.text2vec_openai()
openai_generative = wvc.config.Configure.Generative.openai()
collection = client.collections.create(
    name="WikipediaLangChain",
    vectorizer_config=openai_vectorizer,
    generative_config=openai_generative,
)

{"level":"info","msg":"Created shard wikipedialangchain_JLyOFT7sYWy6 in 1.347125ms","time":"2024-04-17T10:09:06-03:00"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-04-17T10:09:06-03:00","took":50459}


Now we have a Weaviate client!
Let's read our 2 pdf files, [brazil-wikipedia-article-text.pdf](./brazil-wikipedia-article-text.pdf) and [netherlands-wikipedia-article-text.pdf](./netherlands-wikipedia-article-text.pdf)

Then chunk them and ingest using Langchain.

In [116]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader

from langchain_weaviate.vectorstores import WeaviateVectorStore


# One splitter and one embeddings client are shared by every article.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
embeddings = OpenAIEmbeddings()

# Load, chunk and ingest each article in turn (Brazil first, then the Netherlands).
# A single loop replaces the previously duplicated per-file code; the printed
# output and the final values of `loader`, `docs` and `db` are unchanged.
for country, pdf_path in (
    ("Brazil", "brazil-wikipedia-article-text.pdf"),
    ("Netherlands", "netherlands-wikipedia-article-text.pdf"),
):
    loader = PyPDFLoader(pdf_path, extract_images=False)
    docs = loader.load_and_split(text_splitter)
    print(f"GOT {len(docs)} docs for {country}")
    db = WeaviateVectorStore.from_documents(docs, embeddings, client=client, index_name="WikipediaLangChain")

GOT 247 docs for Brazil
GOT 274 docs for Netherlands


In [None]:
# Fetch a handle to the collection by name; the cells below query through it.
collection = client.collections.get("WikipediaLangChain")

Let's count how many objects we have in total.

In [117]:
# Aggregate over the whole collection, asking only for the total object count.
response = collection.aggregate.over_all(total_count=True)
print(response)

AggregateReturn(properties={}, total_count=521)


Now, how many objects do we have per source?

In [121]:
# Group the aggregation by the `source` property and print one line per group:
# the grouped value followed by the number of objects in that group.
response = collection.aggregate.over_all(group_by="source")
for source_group in response.groups:
    source_name = source_group.grouped_by.value
    print(source_name, source_group.total_count)

netherlands-wikipedia-article-text.pdf 274
brazil-wikipedia-article-text.pdf 247


LangChain added some metadata, like `source` and `page`. Let's get one object.

In [124]:
# Fetch a single object to inspect its properties.
# NOTE(review): the name `object` shadows the Python builtin; it is kept because
# the following cells read this variable by that name.
object = collection.query.fetch_objects(limit=1).objects[0]

In [125]:
# List the property names stored on the fetched object.
object.properties.keys()

dict_keys(['text', 'page', 'source'])

In [126]:
# Print the source file, the page and the chunk text, in that order.
for property_name in ("source", "page", "text"):
    print(object.properties.get(property_name))

netherlands-wikipedia-article-text.pdf
3.0
was governed by its own administrative body known as the States-Provincial. The confederal government, known as the States
General, was headquartered in The Hague and comprised representatives from each of the seven provinces. The sparsely
populated region of Drenthe was also part of the republic, albeit not considered a province in its own right. Moreover, during the


## Let's ask in French about content in English, and request the answer in Spanish

In [127]:
# RAG using Weaviate alone: retrieve chunks, then generate one grouped answer.

# Prompt for the generative step: the question is in French and the answer is
# requested in Spanish.
generateTask = "Quelle est la nourriture traditionnelle de ce pays? Write the response only in Spanish."
# Restrict retrieval to a single PDF (switch the active line to change country).
source_file = "brazil-wikipedia-article-text.pdf"
#source_file = "netherlands-wikipedia-article-text.pdf"
source_filter = wvc.query.Filter.by_property("source").equal(source_file)

query = collection.generate.near_text(
    query="tradicional food",
    filters=source_filter,
    limit=10,
    grouped_task=generateTask,
)

In [128]:
# Show the single answer produced for the grouped task.
print(query.generated)

La comida tradicional de Brasil incluye platos como la feijoada, beiju, feijão tropeiro, vatapá, moqueca, polenta, acarajé, brigadeiros, bolo de rolo, cocada, beijinhos, Romeu e Julieta, paçoca, rapadura y pé-de-moleque. También se consumen frutas locales como el açaí, cupuaçu, mango, papaya, cacao, anacardo, guayaba, naranja, lima, maracuyá, piña y ciruela de cerdo en jugos, chocolates, paletas de hielo y helados. La bebida nacional es el café y la cachaça es el licor nativo de Brasil, utilizado en la famosa caipirinha. Los platos típicos suelen incluir arroz y frijoles con carne de res, ensalada, papas fritas y huevo frito, a menudo acompañados de mandioca frita.


These are some of the objects used for this generation:

In [129]:
# Print the properties of the objects returned alongside the generated answer.
# The loop variable is named `retrieved` so it does not shadow the builtin `object`.
for retrieved in query.objects[0:10]:
    print(retrieved.properties)

{'text': 'flour (farofa). Fried potatoes, fried cassava, fried banana, fried meat and fried cheese are very often eaten in lunch and\nserved in most typical restaurants. Popular snacks are pastel (a fried pastry); coxinha (a variation of chicken croquete); pão\nde queijo (cheese bread and cassava flour / tapioca); pamonha (corn and milk paste); esfirra (a variation of Lebanese', 'page': 13.0, 'source': 'brazil-wikipedia-article-text.pdf'}
{'text': "Cuisine\nBrazilian cuisine varies greatly by region, reflecting the country's varying mix of indigenous and immigrant populations. This\nhas created a national cuisine marked by the preservation of regional differences. Examples are Feijoada, considered the\ncountry's national dish; and regional foods such as beiju, feijão tropeiro, vatapá, moqueca, polenta (from Italian cuisine) and", 'page': 13.0, 'source': 'brazil-wikipedia-article-text.pdf'}
{'text': 'pastry); kibbeh (from Arabic cuisine); empanada (pastry) and empada, little salt pies f

Note that we used a filter, so the content will be searched and generated only for that specific pdf.
Let's change the filter to the second pdf file.

In [130]:
# Same question as before, this time restricted to the Netherlands PDF.
generateTask = "Quelle est la nourriture traditionnelle de ce pays ? Answer in Spanish"
source_file = "netherlands-wikipedia-article-text.pdf"
source_filter = wvc.query.Filter.by_property("source").equal(source_file)

# Retrieve up to 10 matching chunks and generate one grouped answer from them.
query = collection.generate.near_text(
    query="tradicional food",
    filters=source_filter,
    limit=10,
    grouped_task=generateTask,
)

print(query.generated)

La comida tradicional de los Países Bajos incluye papas, una porción de carne y verduras (de temporada) para la cena. También se destacan productos como la mayonesa, mostazas de grano entero y la industria del chocolate. Otros alimentos típicos son el arenque en escabeche, mejillones, anguilas, ostras y camarones. En cuanto a postres, se destacan los stroopwafel, gevulde koek y Zeeuwse bolus. Las bebidas alcohólicas tradicionales de la región son la cerveza y el Jenever.


And of course, we can use different filters, and get different content for our questions

In [131]:
# Compare the two countries: allow chunks from either PDF.
generateTask = "What is in common on the food of thouse two countries?"
source_files = ["netherlands-wikipedia-article-text.pdf", "brazil-wikipedia-article-text.pdf"]
# contains_any matches objects whose `source` is any of the listed files.
either_source_filter = wvc.query.Filter.by_property("source").contains_any(source_files)

query = collection.generate.near_text(
    query="tradicional food",
    filters=either_source_filter,
    limit=10,
    grouped_task=generateTask,
)

print(query.generated)

Both Brazil and the Netherlands have a variety of fried foods in their cuisine. In Brazil, fried potatoes, cassava, banana, meat, and cheese are commonly eaten, while in the Netherlands, fried fish dishes like kibbeling and lekkerbek are popular. Additionally, both countries have a tradition of using flour in their dishes, such as in Brazilian farofa and Dutch cookies and pastries. Both countries also have a history of incorporating influences from other cultures into their cuisine, creating a diverse and cosmopolitan food scene.


## Using Langchain to query data and answer questions

Up until now, we used Langchain to ingest our data, and we queried Weaviate directly.

Now, let's use LangChain to query as well. As you may have noticed, after ingesting our data, LangChain returns a vector store.

We can use that vector store, or initiate a new one. Let's initiate a new one, passing an empty list of docs (`[]`).

In [132]:
# Build a vector store over the existing collection; the empty document list
# means nothing new is ingested here.
embeddings = OpenAIEmbeddings()
db = WeaviateVectorStore.from_documents(
    [],
    embeddings,
    client=client,
    index_name="WikipediaLangChain",
)

### We can now search our data

In [133]:
# Unfiltered similarity search: no source filter is passed, so chunks from any
# ingested PDF may be returned.
docs = db.similarity_search("traditional food")
print(docs)

[Document(page_content='(in its modern form) and \nZeeuwse bolus\n are\ngood examples. Cookies are also produced in great number and tend to contain a lot of butter and sugar, like \nstroopwafel\n, as well\nas a filling of some kind, mostly almond, like \ngevulde koek\n. The traditional alcoholic beverages of this region are beer (strong pale\nlager) and \nJenever', metadata={'page': 14.0, 'source': 'netherlands-wikipedia-article-text.pdf'}), Document(page_content='widely available and typical for the region. \nKibbeling\n, once a local delicacy consisting of small chunks of battered white fish, has\nbecome a national fast food, just as lekkerbek.\nThe Southern Dutch cuisine consists of the cuisines of the Dutch provinces of North Brabant and Limburg and the Flemish Region in', metadata={'page': 14.0, 'source': 'netherlands-wikipedia-article-text.pdf'}), Document(page_content='amount of fish. The various dried sausages, belonging to the metworst-family of Dutch sausages are found throu

### Filter by the source property
The `source` property is automatically added by LangChain.

In [134]:
# Change the active line below to get chunks from a different file / country.
source_file = "brazil-wikipedia-article-text.pdf"
#source_file = "netherlands-wikipedia-article-text.pdf"
# Only objects whose `source` property equals the chosen file are searched.
where_filter = wvc.query.Filter.by_property("source").equal(source_file)
docs = db.similarity_search("traditional food", filters=where_filter)
print(docs)

[Document(page_content='accounting for 32% of the total trade. Other large trading partners include the United States, Argentina, the Netherlands and\nCanada. Its automotive industry is the eighth-largest in the world. In the food industry, Brazil was the second-largest\nexporter of processed foods in the world in 2019. The country was the second-largest producer of pulp in the world and the', metadata={'page': 7.0, 'source': 'brazil-wikipedia-article-text.pdf'}), Document(page_content='characterized by traditional Portuguese festivities,\nReligious pluralism increased during the 20th century, and the Protestant community has grown to include over 22% of the\npopulation. The most common Protestant denominations are Evangelical Pentecostal ones. Other Protestant branches with', metadata={'page': 10.0, 'source': 'brazil-wikipedia-article-text.pdf'}), Document(page_content="making up 6.6% of total GDP.\nBrazil is one of the largest producers of various agricultural commodities, and also h

### You can also do some question answering

In [135]:
from langchain.prompts import PromptTemplate

# Prompt text with two placeholders, {context} and {question}.
prompt_template = """Text: {context}

Question: {question}

Answer the question based on the text provided. If the text doesn't contain the answer, reply that the answer is not available."""

PROMPT = PromptTemplate(input_variables=["context", "question"], template=prompt_template)

# Passed to the chain constructor in the next cell so it uses this prompt.
chain_type_kwargs = dict(prompt=PROMPT)

In [136]:
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

# Restrict retrieval to a single PDF; swap the active line to switch countries.
#source_file = "brazil-wikipedia-article-text.pdf"
source_file = "netherlands-wikipedia-article-text.pdf"
where_filter = wvc.query.Filter.by_property("source").equal(source_file)

# The retriever passes this filter along with every search it runs for the chain.
retriever = db.as_retriever(search_kwargs={"filters": where_filter})

qa = RetrievalQA.from_chain_type(
    llm=OpenAI(openai_api_key=os.environ.get("OPENAI_API_KEY")),
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

# Calling the chain object directly (Chain.__call__) is deprecated in LangChain;
# invoke() is the supported entry point and returns the same result dict.
answer = qa.invoke({"query": "What is the traditional food of this country?"})
print(answer)

/Users/dudanogueira/dev/weaviate/recipes/venv/lib/python3.11/site-packages/pydantic/v1/main.py:304: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  return hasattr(instance, '__fields__') and super().__instancecheck__(instance)
/Users/dudanogueira/dev/weaviate/recipes/venv/lib/python3.11/site-packages/pydantic/main.py:952: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/


{'query': 'What is the traditional food of this country?', 'result': '\n\nThe traditional food of this country is widely available and typical for the region, including dishes such as Kibbeling and Lekkerbek. Other traditional dishes include Vlaai and Worstenbroodje, and the traditional alcoholic beverages are beer and Jenever. ', 'source_documents': [Document(page_content='widely available and typical for the region. \nKibbeling\n, once a local delicacy consisting of small chunks of battered white fish, has\nbecome a national fast food, just as lekkerbek.\nThe Southern Dutch cuisine consists of the cuisines of the Dutch provinces of North Brabant and Limburg and the Flemish Region in', metadata={'page': 14.0, 'source': 'netherlands-wikipedia-article-text.pdf'}), Document(page_content='cream, custard or fruits. Cakes, such as the \nVlaai\n from Limburg and the \nMoorkop\n and \nBossche Bol\n from Brabant, are typical\npastries. Savoury pastries also occur, with the \nworstenbroodje\n (

In [137]:
# Close the client connection; with the embedded instance this also shuts the server down.
client.close()

{"action":"restapi_management","level":"info","msg":"Shutting down... ","time":"2024-04-17T10:20:42-03:00"}
{"action":"restapi_management","level":"info","msg":"Stopped serving weaviate at http://127.0.0.1:8079","time":"2024-04-17T10:20:42-03:00"}
