In [1]:
# !pip install cohere
# !pip install weaviate-client

In [2]:
import os
from dotenv import load_dotenv, find_dotenv
# Load environment variables from the .env file located by find_dotenv();
# later cells read the Weaviate and Cohere settings from os.environ.
# Assigning to `_` keeps the call's return value from being echoed as cell output.
_ = load_dotenv(find_dotenv()) # read local .env file

Let's start by importing Weaviate to access the Wikipedia database.

In [3]:
import weaviate
# API-key credential for the Weaviate instance; the key comes from the environment.
auth_config = weaviate.auth.AuthApiKey(api_key=os.environ['WEAVIATE_API_KEY'])

In [4]:
# Connect to the Weaviate instance named by WEAVIATE_API_URL, authenticating
# with `auth_config` and sending the Cohere API key along as a request header.
client = weaviate.Client(
    url=os.environ['WEAVIATE_API_URL'],
    auth_client_secret=auth_config,
    additional_headers={"X-Cohere-Api-Key": os.environ['COHERE_API_KEY']},
)

In [6]:
# Ask the client whether the Weaviate instance is ready before querying it
# (the saved run of this cell returned True).
client.is_ready()

True

# Keyword search

In [8]:
def keyword_search(query,
                   results_lang='en',
                   properties = ["title","url","text"],
                   num_results=3):
    """Run a BM25 keyword search over the "Articles" class.

    Uses the notebook-level Weaviate `client`.

    Args:
        query: Search text handed to the BM25 operator.
        results_lang: Value the "lang" property of a hit must equal.
        properties: Names of the properties to return for each hit.
        num_results: Maximum number of hits to request.

    Returns:
        The value stored at response['data']['Get']['Articles'].

    Raises:
        RuntimeError: If the response contains a non-empty "errors" entry.
    """
    # Only keep articles whose "lang" property equals the requested language.
    where_filter = {
        "path": ["lang"],
        "operator": "Equal",
        "valueString": results_lang,
    }

    response = (
        client.query.get("Articles", properties)
        .with_bm25(query=query)
        .with_where(where_filter)
        .with_limit(num_results)
        .do()
    )

    # GraphQL responses report failures under an "errors" key. Surface them here
    # instead of failing later with an opaque KeyError/TypeError (or returning
    # None when the data section is present but empty).
    errors = response.get("errors")
    if errors:
        raise RuntimeError(f"Weaviate keyword search failed: {errors}")

    return response['data']['Get']['Articles']

In [9]:
# Run the keyword search with its default arguments (results_lang='en',
# num_results=3) and dump the raw return value, a list of dicts, one per hit.
query = "What is the most spoken language in the world?"
keyword_search_results = keyword_search(query)
print(keyword_search_results)

[{'text': 'Gujarati (; , ) is an Indo-Aryan language native to the Indian state of Gujarat and spoken predominantly by the Gujarati people. Gujarati is descended from Old Gujarati (). In India, it is one of the 22 scheduled languages of the Union. It is also the official language in the state of Gujarat, as well as an official language in the union territory of Dadra and Nagar Haveli and Daman and Diu. As of 2011, Gujarati is the 6th most widely spoken language in India by number of native speakers, spoken by 55.5\xa0million speakers which amounts to about 4.5% of the total Indian population. It is the 26th most widely spoken language in the world by number of native speakers as of 2007.', 'title': 'Gujarati language', 'url': 'https://en.wikipedia.org/wiki?curid=143117'}, {'text': 'Nepali is the third-most spoken language in the Australian territory of Tasmania, where it is spoken by 1.3% of its population, and fifth-most spoken language in the Northern Territory, Australia, spoken by 

In [13]:
# Article fields to request from Weaviate.
properties = ["text", "title", "url", "views", "lang"]

In [14]:
def print_result(result):
    """Print each item of `result` as a numbered block of plain-text lines.

    Every item starts with an ``item <index>`` line. Each of its key:value
    pairs is printed on its own line followed by a blank line, and one more
    blank line closes the item.
    """
    for idx, entry in enumerate(result):
        print(f'item {idx}')
        for field, value in entry.items():
            # The trailing blank line separates consecutive fields.
            print(f"{field}:{value}", end="\n\n")
        print()

Run the keyword search against documents written in a different language.

In [None]:
# Same question as the English search, now restricted to articles whose
# "lang" is 'es'. The query text matches the earlier cell (singular
# "language") so only the language filter differs between the two runs.
query = "What is the most spoken language in the world?"
keyword_search_results = keyword_search(query, results_lang='es')
print_result(keyword_search_results)

# Other language codes to try for results_lang:
# en, de, fr, es, it, ja, ar, zh, ko, hi