# Multilingual RAG over multiple PDFs, filtered by source file, with LangChain and Cohere

In [1]:
# lets install our super tools
!pip3 install --upgrade weaviate-client langchain pypdf tiktoken cohere


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


You must have a valid Cohere API key in the `COHERE_API_KEY` environment variable.

In [16]:
import weaviate, os

# The Cohere key is read from the environment and handed to Weaviate through
# the X-Cohere-Api-Key header, so that the Cohere modules configured on the
# class (text2vec-cohere / generative-cohere) can call Cohere on our behalf.
cohere_api_key = os.environ.get("COHERE_API_KEY")
if not cohere_api_key:
    # Fail fast with a clear message instead of sending a header whose value
    # is None and only discovering the problem at import/query time.
    raise RuntimeError(
        "COHERE_API_KEY environment variable is not set; "
        "export your Cohere API key before running this notebook."
    )

client = weaviate.Client(
    embedded_options=weaviate.EmbeddedOptions(port=8080),
    # comment the line above and uncomment the one below if using docker
    #url="http://localhost:8080",
    additional_headers={
        "X-Cohere-Api-Key": cohere_api_key,  # your Cohere API key
    }
)

## If using WCS with auth
# import weaviate

# client = weaviate.Client(
#   url="https://your-weaviate-cluster.weaviate.network",  # URL of your Weaviate instance
#   auth_client_secret=weaviate.AuthApiKey(api_key="YOUR-WEAVIATE-API-KEY"),  # Replace w/ your Weaviate instance API key

#   additional_headers={
#     "X-Cohere-Api-Key": "YOUR-COHERE-API-KEY", # Replace with your Cohere key
#   }
# )

# client.schema.get()  # Get the schema to test connection
print("Client is Ready?", client.is_ready())

Started /Users/dudanogueira/.cache/weaviate-embedded: process ID 17609


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2023-11-13T20:45:58-03:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2023-11-13T20:45:58-03:00"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50060","time":"2023-11-13T20:45:58-03:00"}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8080","time":"2023-11-13T20:45:58-03:00"}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"wikipedialangchain_JuBMDLjHaViF","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2023-11-13T20:45:58-03:00","took":23693416}


Client is Ready? True


## Let's create our class beforehand

In [17]:
# Start from a clean slate: remove the class before (re)creating it.
client.schema.delete_class("WikipediaLangChain")

# Distance metric for the vector index.
# Set to "cosine" for English models; "dot" for multilingual models.
vector_index_config = {"distance": "dot"}

# Settings for the generative-cohere module (all optional, left at defaults).
#   "model": "command-xlarge-nightly"   - defaults to `command-xlarge-nightly`;
#                                         can also use `command-xlarge-beta` and `command-xlarge`
#   "temperatureProperty": <temperature>
#   "maxTokensProperty": <maxTokens>
#   "kProperty": <k>
#   "stopSequencesProperty": <stopSequences>
#   "returnLikelihoodsProperty": <returnLikelihoods>
generative_cohere_config = {}

# Settings for the text2vec-cohere vectorizer (all optional, left at defaults).
#   "model": "embed-multilingual-v3.0"  - defaults to embed-multilingual-v3.0 if not set
#   "truncate": "RIGHT"                 - defaults to RIGHT if not set
#   "baseURL": "https://proxy.yourcompanydomain.com"
#                                       - can be overridden by one set in the HTTP header
text2vec_cohere_config = {}

# Create the class explicitly so that its vectorizer is the one we want.
class_definition = {
    "class": "WikipediaLangChain",
    "vectorizer": "text2vec-cohere",
    "vectorIndexConfig": vector_index_config,
    "moduleConfig": {
        "generative-cohere": generative_cohere_config,
        "text2vec-cohere": text2vec_cohere_config,
    },
}
client.schema.create_class(class_definition)

{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"wikipedialangchain_7hoTYEkNLg6B","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2023-11-13T20:46:03-03:00","took":48500}


In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import CohereEmbeddings
from langchain.vectorstores import Weaviate    
from langchain.document_loaders import PyPDFLoader

# Split every PDF into ~400-character chunks with a 50-character overlap and
# embed the chunks client-side with Cohere's multilingual model.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
embeddings = CohereEmbeddings(model="embed-multilingual-v3.0")

# Articles to import, keyed by the label used in the progress message.
# Add another entry here to ingest one more PDF.
pdf_sources = {
    "Brazil": "brazil-wikipedia-article-text.pdf",
    "Netherlands": "netherlands-wikipedia-article-text.pdf",
}

# Same load -> split -> index pipeline for every article; all chunks go into
# the same WikipediaLangChain class.
for country, pdf_path in pdf_sources.items():
    loader = PyPDFLoader(pdf_path, extract_images=False)
    docs = loader.load_and_split(text_splitter)
    print(f"GOT {len(docs)} docs for {country}")
    # `db` ends up bound to the vector store returned for the last article.
    db = Weaviate.from_documents(docs, embeddings, index_name="WikipediaLangChain", client=client, by_text=False)

GOT 247 docs for Brazil
GOT 274 docs for Netherlands


In [19]:
# let's count how many objects we have per source
import json
# Aggregate over the `source` property: total count, data type and the most
# frequent values together with how often each one occurs.
source_stats_query = client.query.aggregate("WikipediaLangChain").with_fields(
    "source { count type topOccurrences { occurs value } }"
)
response = source_stats_query.do()
print(json.dumps(response, indent=2))

# Fetch a handful of stored objects (text + source) to eyeball the content.
sample_query = client.query.get("WikipediaLangChain", "text source").with_limit(4)
response = sample_query.do()
print(json.dumps(response, indent=2))

{
  "data": {
    "Aggregate": {
      "WikipediaLangChain": [
        {
          "source": {
            "count": 521,
            "topOccurrences": [
              {
                "occurs": 274,
                "value": "netherlands-wikipedia-article-text.pdf"
              },
              {
                "occurs": 247,
                "value": "brazil-wikipedia-article-text.pdf"
              }
            ],
            "type": "text"
          }
        }
      ]
    }
  }
}
{
  "data": {
    "Get": {
      "WikipediaLangChain": [
        {
          "source": "netherlands-wikipedia-article-text.pdf",
          "text": "States, with agricultural exports earning \u20ac80.7 billion in 2014, up from \u20ac75.4 billion in 2012. In 2019 agricultural exports were worth\n\u20ac94.5 billion. In an effort to reduce agricultural pollution, the Dutch government is imposing strict limits on the productivity of the\nfarming sector, triggering Dutch farmers' protests."
        },
        

## Let's ask in French about content written in English, and request the answer in Spanish

In [20]:
# RAG done entirely inside Weaviate (no LangChain involved in this cell).

# The prompt given to the generative module.
generateTask = "Quelle est la nourriture traditionnelle de ce pays ? Answer in Spanish"
# Restrict retrieval to the chunks that came from this specific file.
source_file = "brazil-wikipedia-article-text.pdf"

# Only objects whose `source` equals the chosen file are considered.
source_filter = {
    "operator": "Equal",
    "path": ["source"],
    "valueText": source_file,
}
# Concepts used for the semantic (near-text) search.
near_text = {"concepts": ["tradicional Food"]}

rag_query = client.query.get("WikipediaLangChain", "text")
rag_query = rag_query.with_generate(grouped_task=generateTask)
rag_query = rag_query.with_where(source_filter)
rag_query = rag_query.with_near_text(near_text)
rag_query = rag_query.with_limit(5)
result = rag_query.do()

print(json.dumps(result, indent=1))

{
 "data": {
  "Get": {
   "WikipediaLangChain": [
    {
     "_additional": {
      "generate": {
       "error": null,
       "groupedResult": "La comida tradicional de Brasil incluye la farofa (harina de mandioca), papas fritas, yuca frita, pl\u00e1tanos fritos, carne frita y queso frito, que se consumen con frecuencia en el almuerzo y se sirven en la mayor\u00eda de los restaurantes t\u00edpicos. Los aperitivos populares son el pastel (una empanada frita), la coxinha (una variaci\u00f3n de croqueta de pollo), el p\u00e3o de queijo (pan de queso y harina de mandioca/tapioca), la pamonha (pasta de ma\u00edz y leche), la esfirra (una variaci\u00f3n de la cocina libanesa), el kibbeh (de la cocina \u00e1rabe), la empanada y la empada, peque\u00f1as empanadas rellenas de camarones o palmito. Brasil tambi\u00e9n tiene una variedad de postres como los brigadeiros (bolitas de dulce de chocolate), el bolo de rolo (pastel enrollado con goiabada), la cocada (un dulce de coco), los beijinhos (t

In [21]:
# Same question as before, this time filtered to the Netherlands article.
generateTask = "Quelle est la nourriture traditionnelle de ce pays ? Answer in Spanish"
# The answer is generated only from chunks of this Wikipedia PDF.
source_file = "netherlands-wikipedia-article-text.pdf"

# Only objects whose `source` equals the chosen file are considered.
source_filter = {
    "operator": "Equal",
    "path": ["source"],
    "valueText": source_file,
}
# Concepts used for the semantic (near-text) search.
near_text = {"concepts": ["tradicional Food"]}

rag_query = client.query.get("WikipediaLangChain", "text")
rag_query = rag_query.with_generate(grouped_task=generateTask)
rag_query = rag_query.with_where(source_filter)
rag_query = rag_query.with_near_text(near_text)
rag_query = rag_query.with_limit(5)
result = rag_query.do()

print(json.dumps(result, indent=1))

{
 "data": {
  "Get": {
   "WikipediaLangChain": [
    {
     "_additional": {
      "generate": {
       "error": null,
       "groupedResult": "La comida tradicional de este pa\u00eds incluye pasteles como el Vlaai de Limburgo y el Moorkop y Bossche Bol de Brabante, as\u00ed como pasteles salados como el worstenbroodje. Las bebidas alcoh\u00f3licas tradicionales de la regi\u00f3n son la cerveza y el Jenever. Tambi\u00e9n se producen galletas en gran cantidad, como el stroopwafel y el gevulde koek. En el sur del pa\u00eds, la cocina consiste en platos de las provincias de Brabante del Norte y Limburgo y la regi\u00f3n flamenca. En el norte, se encuentran panes de centeno, pasteles y galletas especiadas con jengibre o carne. En la regi\u00f3n costera, se consume una gran cantidad de pescado y embutidos secos. Las salchichas ahumadas, como la rookworst, son comunes y se suelen comer con platos como el stamppot, hutspot o zuurkool."
      }
     },
     "text": "cream, custard or fruits.

## Using Langchain to query data and answer questions

As you may have noticed, LangChain returns a vector store after ingesting our data. We can keep using that vector store, or instantiate a new one.

In [22]:
# Build a fresh LangChain vector store on top of the existing Weaviate class,
# using the same multilingual Cohere embedding model as at ingestion time.
embeddings = CohereEmbeddings(model="embed-multilingual-v3.0")
db = Weaviate(
    client=client,
    index_name="WikipediaLangChain",
    text_key="text",  # property that holds the chunk text
    embedding=embeddings,
)

### We can now search our data

In [23]:
# Similarity search across every object in the class (no source filter).
search_query = "traditional food"
docs = db.similarity_search(search_query)
print(docs)

[Document(page_content='flour (farofa). Fried potatoes, fried cassava, fried banana, fried meat and fried cheese are very often eaten in lunch and\nserved in most typical restaurants. Popular snacks are pastel (a fried pastry); coxinha (a variation of chicken croquete); pão\nde queijo (cheese bread and cassava flour / tapioca); pamonha (corn and milk paste); esfirra (a variation of Lebanese'), Document(page_content='cream, custard or fruits. Cakes, such as the \nVlaai\n from Limburg and the \nMoorkop\n and \nBossche Bol\n from Brabant, are typical\npastries. Savoury pastries also occur, with the \nworstenbroodje\n (a roll with a sausage of ground beef, literally translates into sausage\nbread) being the most popular. The traditional alcoholic beverage of the region is beer. There are many local brands, ranging from'), Document(page_content="Cuisine\nBrazilian cuisine varies greatly by region, reflecting the country's varying mix of indigenous and immigrant populations. This\nhas create

### Filter by the source property
The `source` property is added automatically by LangChain.

In [25]:
# Switch the active line below to get chunks from a different file / country.
#source_file = "brazil-wikipedia-article-text.pdf"
source_file = "netherlands-wikipedia-article-text.pdf"

# Weaviate `where` filter: keep only objects whose `source` equals the file.
where_filter = dict(
    operator="Equal",
    path=["source"],
    valueText=source_file,
)
search_query = "traditional food"
docs = db.similarity_search(search_query, where_filter=where_filter)
print(docs)

[Document(page_content='cream, custard or fruits. Cakes, such as the \nVlaai\n from Limburg and the \nMoorkop\n and \nBossche Bol\n from Brabant, are typical\npastries. Savoury pastries also occur, with the \nworstenbroodje\n (a roll with a sausage of ground beef, literally translates into sausage\nbread) being the most popular. The traditional alcoholic beverage of the region is beer. There are many local brands, ranging from'), Document(page_content='widely available and typical for the region. \nKibbeling\n, once a local delicacy consisting of small chunks of battered white fish, has\nbecome a national fast food, just as lekkerbek.\nThe Southern Dutch cuisine consists of the cuisines of the Dutch provinces of North Brabant and Limburg and the Flemish Region in'), Document(page_content='(in its modern form) and \nZeeuwse bolus\n are\ngood examples. Cookies are also produced in great number and tend to contain a lot of butter and sugar, like \nstroopwafel\n, as well\nas a filling of s

### You can also do some question answering

In [26]:
from langchain.prompts import PromptTemplate

# Prompt layout: the retrieved text first, then the question, then the
# instruction telling the model to stay within the provided text.
prompt_template = (
    "Text: {context}\n"
    "\n"
    "Question: {question}\n"
    "\n"
    "Answer the question based on the text provided. "
    "If the text doesn't contain the answer, reply that the answer is not available."
)

# `context` and `question` are the two placeholders filled in by the chain.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Extra arguments passed to the chain so that it uses our custom prompt.
chain_type_kwargs = dict(prompt=PROMPT)

In [27]:
from langchain.llms import Cohere
from langchain.chains import RetrievalQA

# Question answering restricted to a single source file.
# Switch the active line below to answer from the other article.
#source_file = "brazil-wikipedia-article-text.pdf"
source_file = "netherlands-wikipedia-article-text.pdf"
where_filter = dict(
    operator="Equal",
    path=["source"],
    valueText=source_file,
)

# The retriever applies the filter on every search it performs for the chain.
retriever = db.as_retriever(search_kwargs={"where_filter": where_filter})

# temperature=0 is passed to the Cohere LLM used to write the answer.
llm = Cohere(model="command-nightly", temperature=0)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,  # also return the chunks used for the answer
)

question = {"query": "What is the traditional food of this country?"}
answer = qa(question)
print(answer)

{'query': 'What is the traditional food of this country?', 'result': ' The traditional food in the Southern Dutch region consists of a wide variety of pastries, cakes, and cookies containing cream, custard, and fruits, as well as sausages, and fish. Some examples include the Vlaai from Limburg, Moorkop and Bossche Bol from Brabant, Worstenbroodje (sausage bread), Zeeuwse Bolus, Stroopwafel, Geverulde Koek, and Jenever. The region also boasts many local beer brands. \n\nLarger sausages are usually eaten with stamppot, hutspot, or sauerkraut, while smaller sausages are often consumed as street food. The region also has its own delicacies such as Kibbeling (battered white fish) and Lekkerbek, which have now become national fast foods. \n\nIs there anything specific you would like to know about the traditional foods in this region? ', 'source_documents': [Document(page_content='cream, custard or fruits. Cakes, such as the \nVlaai\n from Limburg and the \nMoorkop\n and \nBossche Bol\n from 