In [None]:
# You need to install the python Weaviate client
!pip install weaviate-client

## Download data
Download the dataset (`wiki_simple_100k.parquet`) from [Google Drive](https://drive.google.com/file/d/1W8nBPZA2j1_6AGnw2BAe6ydXLNXzuDq2/view?usp=share_link) and place it next to this notebook.

## Configure the Weaviate Cloud Instance
### A free 14-day sandbox is available at https://console.weaviate.cloud/

In [5]:
import os
import weaviate
from wcs_key import wcs_token

# API-key authentication for the Weaviate instance; the key comes from the local wcs_key module.
auth_config = weaviate.auth.AuthApiKey(api_key=wcs_token)  # Replace w/ your API Key for the Weaviate instance

# Inference-provider keys are read from the environment and passed along as extra headers.
inference_headers = {
    "X-Cohere-Api-Key": os.getenv("COHERE_API_KEY"),   # Replace w/ your Cohere Key
    "X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY"),   # Replace w/ your OpenAI Key
}

client = weaviate.Client(
    url="https://kgc-semanticsearch-demo-f2bsuu2s.weaviate.network",
    auth_client_secret=auth_config,
    additional_headers=inference_headers,
)

# Last expression of the cell, so the readiness flag is displayed.
client.is_ready()

True

## Create Database Schema

In [7]:
# To start over, uncomment the next line (note, this will delete all your weaviate data)
#client.schema.delete_all()

VECTORIZER = "text2vec-cohere"  # multi-lingual


def _skipped_property(name, data_type):
    """Return a property definition flagged as `skip` for the vectorizer module."""
    return {
        "name": name,
        "dataType": [data_type],
        "moduleConfig": {VECTORIZER: {"skip": True}},
    }


article_schema = {
    "class": "Article",
    "description": "Wiki Article",
    "vectorizer": VECTORIZER,
    "moduleConfig": {
        VECTORIZER: {
            "model": "multilingual-22-12",
            "truncate": "RIGHT",
        },
    },
    "vectorIndexConfig": {"distance": "dot"},
    "properties": [
        # The article body is the only property not flagged as skipped.
        {
            "name": "text",
            "dataType": ["text"],
            "description": "Article body",
            "moduleConfig": {
                VECTORIZER: {"skip": False, "vectorizePropertyName": False},
            },
        },
        _skipped_property("title", "string"),
        _skipped_property("url", "string"),
        _skipped_property("wiki_id", "int"),
        _skipped_property("views", "number"),
    ],
}

# Register the Article class with the Weaviate instance.
client.schema.create_class(article_schema)

print("The schema has been created")

The schema has been created


In [8]:
# Read the current schema back from the Weaviate instance; as the last
# expression of the cell, the returned dict is displayed.
client.schema.get()

{'classes': [{'class': 'Article',
   'description': 'Wiki Article',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'moduleConfig': {'text2vec-cohere': {'model': 'multilingual-22-12',
     'truncate': 'RIGHT',
     'vectorizeClassName': True}},
   'properties': [{'dataType': ['text'],
     'description': 'Article body',
     'moduleConfig': {'text2vec-cohere': {'skip': False,
       'vectorizePropertyName': False}},
     'name': 'text',
     'tokenization': 'word'},
    {'dataType': ['string'],
     'moduleConfig': {'text2vec-cohere': {'skip': True,
       'vectorizePropertyName': False}},
     'name': 'title',
     'tokenization': 'word'},
    {'dataType': ['string'],
     'moduleConfig': {'text2vec-cohere': {'skip': True,
       'vectorizePropertyName': False}},
     'name': 'url',
     'tokenization': 'word'},
    {'dataType': ['int'],
     'moduleConfig': {'text2

In [9]:
import pandas as pd
# Load the downloaded parquet file (path is relative to the working directory).
df = pd.read_parquet('wiki_simple_100k.parquet')

In [10]:
# Preview the first rows to check the columns before importing.
df.head()

Unnamed: 0,id,title,text,url,wiki_id,views,paragraph_id,langs,emb
0,0,24-hour clock,The 24-hour clock is a way of telling the time...,https://simple.wikipedia.org/wiki?curid=9985,9985,2450.625488,0,30,"[0.07711287587881088, 0.3197174072265625, -0.2..."
1,1,24-hour clock,A time in the 24-hour clock is written in the ...,https://simple.wikipedia.org/wiki?curid=9985,9985,2450.625488,1,30,"[0.19612890481948853, 0.5142669677734375, 0.03..."
2,2,24-hour clock,"However, the US military prefers not to say 24...",https://simple.wikipedia.org/wiki?curid=9985,9985,2450.625488,2,30,"[0.1391918957233429, 0.17759686708450317, -0.1..."
3,3,24-hour clock,"24-hour clock time is used in computers, milit...",https://simple.wikipedia.org/wiki?curid=9985,9985,2450.625488,3,30,"[0.1279686838388443, 0.06708071380853653, -0.0..."
4,4,24-hour clock,"In railway timetables 24:00 means the ""end"" of...",https://simple.wikipedia.org/wiki?curid=9985,9985,2450.625488,4,30,"[0.0753360167145729, 0.3530837893486023, -0.08..."


## Batch and Add 100k Wikipedia Articles to Weaviate

In [11]:
### Step 1 - configure Weaviate Batch, which optimizes CRUD operations in bulk
# - batch_size=100: starting batch size
# - dynamic=True: batch size is increased/decreased dynamically based on performance
# - timeout_retries=3: retry when a request times out

client.batch.configure(
    batch_size=100,
    dynamic=True,
    timeout_retries=3,
)

<weaviate.batch.crud_batch.Batch at 0x7feb500532e0>

In [12]:
# Step 2 - push the rows to Weaviate through the batch configured above.
data = df[:100_000]  # make sure it is not more than 100k objects
total = len(data)

counter = 0

with client.batch as batch:
    for _, row in data.iterrows():
        # Progress message every 100 objects; "\r" keeps it on a single line.
        if counter % 100 == 0:
            print(f"Import {counter} / {total} ", end="\r")

        properties = {
            "text": row["text"],
            "title": row["title"],
            "url": row["url"],
            "views": row["views"],
            "wiki_id": row["wiki_id"],
        }

        # The vector stored with each object is taken from the "emb" column.
        batch.add_data_object(properties, "Article", uuid=None, vector=row["emb"])
        counter += 1
    print(f"Import {counter} / {total}")

print("Import complete")

Import 100000 / 100000
Import complete


In [34]:
# Sanity check: ask Weaviate for the number of Article objects it holds.
count_query = client.query.aggregate("Article").with_fields("meta { count }")
result = count_query.do()
print("Object count: ", result["data"]["Aggregate"]["Article"])

Object count:  [{'meta': {'count': 100000}}]


In [15]:
def print_result1(result):
    """Pretty-print a list of article hits.

    Args:
        result: iterable of article dicts. ``title`` and ``text`` are required.
            ``views`` and ``url`` are printed only when present, so hits from
            queries that request just ``["title", "text"]`` can be printed
            without raising ``KeyError``.
    """
    for item in result:
        heading = item['title']
        if 'views' in item:
            heading = f"{heading} ({item['views']})"
        # ANSI escape codes: 95 = bright magenta, 4 = underline, 0 = reset.
        print(f"\033[95m{heading} \033[0m")
        if 'url' in item:
            print(f"\033[4m{item['url']}\033[0m")
        print(item['text'])
        print()

## Search through your data
1. Classic Keyword Search
2. Vector (Semantic) Search

## Keyword Search

In [17]:
# Keyword search: filter on the title property with the "Like" operator.
where_filter = dict(path=["title"], operator="Like", valueString="Avocado")

article_query = client.query.get("Article", ["title", "text"]).with_where(where_filter)
query_result = article_query.do()

# Show the text of the first match only.
matches = query_result['data']['Get']['Article']
print(matches[0]['text'])

Avocados have much more fat than most of the other fruits, but it's fat is healthy to eat (monounsaturated fat). Avocados have lots of potassium, B vitamins, and vitamin E and K. The Mexican food called guacamole is made of avocados. Many other foods are also made from avocado. Avocado is poisonous to some animals. Many animals will get very sick or die if they eat avocado. Avocado grows there where the climate is a little windy.


In [18]:
# Keyword search: filter on the title property with the "Like" operator.
where_filter = dict(path=["title"], operator="Like", valueString="Python")

article_query = client.query.get("Article", ["title", "text"]).with_where(where_filter)
query_result = article_query.do()

# Show the text of the first match only.
matches = query_result['data']['Get']['Article']
print(matches[0]['text'])

Python is usually utilized for creating sites and programming, task robotization, information investigation, and information representation. Since it's moderately simple to learn, Python has been taken on by numerous non-software engineers like bookkeepers and researchers, for different regular undertakings, such as coordinating funds.


In [19]:
# Keyword search: filter on the title property with the "Like" operator.
where_filter = dict(path=["title"], operator="Like", valueString="Snake")

article_query = client.query.get("Article", ["title", "text"]).with_where(where_filter)
query_result = article_query.do()

# Show the text of the first match only.
matches = query_result['data']['Get']['Article']
print(matches[0]['text'])

Most snakes live on the ground, and in the trees. Others live in the water, and a few live under the soil. Like other reptiles, snakes are ectotherms. They control their body temperature by moving in and out of the direct sunshine. That is why they are rare in cold places.


## Semantic Search

In [20]:
def semantic_search(query, limit=5, class_name="Article", weaviate_client=None):
    """Run a nearText (semantic) query and return the matching objects.

    Args:
        query: free-text search phrase, passed as the single nearText concept.
        limit: maximum number of objects to return (default 5).
        class_name: Weaviate class to search (default "Article").
        weaviate_client: client to query; defaults to the notebook-level
            ``client`` when omitted.

    Returns:
        The list stored under ``data.Get.<class_name>`` in the response; each
        item carries text, title, url, views and ``_additional.distance``.

    Raises:
        RuntimeError: if the response contains an ``errors`` entry, so the
            reported message is shown instead of a bare KeyError/TypeError.
    """
    if weaviate_client is None:
        weaviate_client = client  # notebook-level default connection

    near_text = {
        "concepts": [query],
        # A "distance" entry (e.g. -139.0) can be added here as a threshold.
    }

    properties = [
        "text", "title", "url", "views",
        "_additional {distance}"
    ]

    response = (
        weaviate_client.query
        .get(class_name, properties)
        .with_near_text(near_text)
        .with_limit(limit)
        .do()
    )

    if "errors" in response:
        raise RuntimeError(f"Weaviate query failed: {response['errors']}")

    return response['data']['Get'][class_name]
    
def print_result(result):
    """Print each semantic-search hit: title, views and distance, then URL, then text.

    Args:
        result: iterable of dicts with ``title``, ``views``, ``url``, ``text``
            and ``_additional.distance`` entries.
    """
    # ANSI escape codes used for terminal styling.
    magenta, underline, reset = "\033[95m", "\033[4m", "\033[0m"
    for hit in result:
        print(f"{magenta}{hit['title']} ({hit['views']}) {hit['_additional']['distance']}{reset}")
        print(f"{underline}{hit['url']}{reset}")
        # The trailing blank line separates consecutive hits.
        print(hit['text'], end="\n\n")

In [21]:
# Semantic (nearText) search with a natural-language description.
query_result = semantic_search("a programming language used for machine learning")

# Print the returned hits.
print_result(query_result)

[95mPython (programming language) (398.0628356933594) -147.34064[0m
[4mhttps://simple.wikipedia.org/wiki?curid=44678[0m
Python has become one of the most famous programming languages on the world as of late. It's utilized in all that from AI to building sites and programming testing. It tends to be utilized by engineers and non-designers the same.

[95mC++ (194.4631805419922) -146.67387[0m
[4mhttps://simple.wikipedia.org/wiki?curid=7535[0m
C++ (pronounced "see plus plus") is a computer programming language based on C. It was created for writing programs for many different purposes. In the 1990s, C++ became one of the most used programming languages in the world. Like C, C++ uses manual memory management (unlike most mainstream languages, where memory management is automatic), while the syntax usually used for it is different.

[95mCentral processing unit (575.814453125) -146.48938[0m
[4mhttps://simple.wikipedia.org/wiki?curid=17376[0m
Machine code is just a sequence of 0s a

## This is a Multi-Lingual Model! - It understands different languages ...

In [22]:
# This is a multi-lingual model, so it can take in queries in different languages!

# Query: "good movies" in Hindi

query_result = semantic_search("अच्छी फिल्में")

print_result(query_result)

[95mGoodfellas (59.27198791503906) -145.6997[0m
[4mhttps://simple.wikipedia.org/wiki?curid=18427[0m
Goodfellas is a 1990 American biographical crime movie directed by Martin Scorsese. It is about life in the Mafia.

[95mGoodfellas (59.27198791503906) -144.94539[0m
[4mhttps://simple.wikipedia.org/wiki?curid=18427[0m
It is set in New York City. The movie is based on the life of mafioso Henry Hill (1943-2012) from 1955 - 1980. "Goodfellas" uses information from New York crime reporter Nicholas Pileggi's book "Wiseguy". Scorsese and Pileggi worked together to write the movie.

[95mGoodfellas (59.27198791503906) -144.38777[0m
[4mhttps://simple.wikipedia.org/wiki?curid=18427[0m
"Goodfellas" was first shown at the 1990 Venice Film Festival. Scorsese received the Silver Lion award for Best Director.

[95mBen Affleck (118.5960922241211) -144.36804[0m
[4mhttps://simple.wikipedia.org/wiki?curid=64729[0m
Benjamin Géza Affleck-Boldt (born August 15, 1972) is an American actor, movie

In [23]:
# Query: "vacation spots" in Farsi

query_result = semantic_search("مکان های تعطیلات")

print_result(query_result)

[95mBaden-Württemberg (37.600975036621094) -145.33817[0m
[4mhttps://simple.wikipedia.org/wiki?curid=2907[0m
The Black forest, the Swabian Alb and the Lake Constance are world famous holiday regions. The highest mountain is the Feldberg (1492m).

[95mGuernsey (373.6320495605469) -144.96051[0m
[4mhttps://simple.wikipedia.org/wiki?curid=11615[0m
Castel, Forest, St Andrews, St Martins, St Peter Port, St Pierre du bois, St Sampson, St Saviour's, Torteval, Vale.

[95mLeisure (127.69661712646484) -144.43259[0m
[4mhttps://simple.wikipedia.org/wiki?curid=423[0m
A vacation or holiday is the setting aside of time specifically for leisure. During their vacation, some people travel to a different region or country, and stay at a hotel so that they can do things they could not do near home. Other people prefer to spend their vacation time at home in their own community.

[95mThe Bahamas (103.01834869384766) -143.8995[0m
[4mhttps://simple.wikipedia.org/wiki?curid=10487[0m
The Bahamas 

In [24]:
# Semantic (nearText) search with an English query.
query_result = semantic_search("famous cricketer in India")

# Print out the result
print_result(query_result)

[95mVirat Kohli (839.5772705078125) -151.58066[0m
[4mhttps://simple.wikipedia.org/wiki?curid=454146[0m
Virat Kohli first came into the limelight in 2008, when he led the Indian U-19 cricket team to victory in 2008 U-19 Cricket World Cup He soon made his debut for the Indian team in August 2008. during a ODI match against in Sri Lanka. Virat also became one of the valuable Indian captain in Test cricket with 27 test wins, 12 overseas test wins and highest winning percentage as captain in both Tests and ODIs. Kohli was ranked eighth in ESPN's list of world's most famous sportspeople in 2016. Virat is the only current player to have a stand named after him in Delhi Firoz Shah Kotla stadium to honour his immaculate contributions to Indian cricket.

[95mRahul Dravid (60.60616683959961) -149.63435[0m
[4mhttps://simple.wikipedia.org/wiki?curid=547524[0m
Rahul Dravid is a sports player who has stopped playing professionally. He used to play cricket and was captain of the national crick

## 100K documents? Those are rookie numbers... Let's try searching over ~10M objects!

In [25]:
import os
import weaviate
from wcs_key import wcs_token

# The API key for the hosted demo instance is read from the environment rather
# than being committed in the notebook. Set WCS_DEMO_API_KEY before running this cell.
demo_api_key = os.getenv("WCS_DEMO_API_KEY")
if not demo_api_key:
    raise RuntimeError(
        "Set the WCS_DEMO_API_KEY environment variable to the API key of the demo Weaviate instance"
    )

auth_config = weaviate.auth.AuthApiKey(api_key=demo_api_key)

client2 = weaviate.Client(
    url="https://cohere-demo.weaviate.network/",
    auth_client_secret=auth_config,
    additional_headers={
        "X-Cohere-Api-Key": os.getenv("COHERE_API_KEY"),   # Replace w/ your Cohere Key
        "X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY"),   # Replace w/ your OpenAI Key
    }
)

# Last expression of the cell, so the readiness flag is displayed.
client2.is_ready()

True

In [28]:
# Ask the second instance how many objects its "Articles" class holds.
count_query = client2.query.aggregate("Articles").with_fields("meta { count }")
result = count_query.do()
print("Object count: ", result["data"]["Aggregate"]["Articles"])

Object count:  [{'meta': {'count': 9436199}}]


In [32]:
def semantic_search2(query, limit=5, class_name="Articles", weaviate_client=None):
    """Run a nearText (semantic) query against the second Weaviate instance.

    Args:
        query: free-text search phrase, passed as the single nearText concept.
        limit: maximum number of objects to return (default 5).
        class_name: Weaviate class to search (default "Articles").
        weaviate_client: client to query; defaults to the notebook-level
            ``client2`` when omitted.

    Returns:
        The list stored under ``data.Get.<class_name>`` in the response; each
        item carries text, title, url, views and ``_additional.distance``.

    Raises:
        RuntimeError: if the response contains an ``errors`` entry, so the
            reported message is shown instead of a bare KeyError/TypeError.
    """
    if weaviate_client is None:
        weaviate_client = client2  # notebook-level connection to the second instance

    near_text = {
        "concepts": [query],
        # A "distance" entry (e.g. -139.0) can be added here as a threshold.
    }

    properties = [
        "text", "title", "url", "views",
        "_additional {distance}"
    ]

    response = (
        weaviate_client.query
        .get(class_name, properties)
        .with_near_text(near_text)
        .with_limit(limit)
        .do()
    )

    if "errors" in response:
        raise RuntimeError(f"Weaviate query failed: {response['errors']}")

    return response['data']['Get'][class_name]
    
def print_result2(result):
    """Print the hits returned by ``semantic_search2``, one block per hit.

    Args:
        result: iterable of dicts with ``title``, ``views``, ``url``, ``text``
            and ``_additional.distance`` entries.
    """
    for row in result:
        heading = f"{row['title']} ({row['views']}) {row['_additional']['distance']}"
        print(f"\033[95m{heading}\033[0m")   # heading in magenta
        print(f"\033[4m{row['url']}\033[0m")  # URL underlined
        print(row['text'])
        print()

In [33]:
# Same query as before, this time against the second (much larger) instance.
query_result = semantic_search2("a programming language used for machine learning")

print_result2(query_result)

[95mPython (programming language) (3000) -149.55988[0m
[4mhttps://en.wikipedia.org/wiki?curid=23862[0m
Python is commonly used in artificial intelligence projects and machine learning projects with the help of libraries like TensorFlow, Keras, Pytorch, and scikit-learn. As a scripting language with a modular architecture, simple syntax, and rich text processing tools, Python is often used for natural language processing.

[95mR (programming language) (2000) -148.33429[0m
[4mhttps://en.wikipedia.org/wiki?curid=376707[0m
According to user surveys and studies of scholarly literature databases, R is one of the most commonly used programming languages used in data mining. R ranks 12th in the TIOBE index, a measure of programming language popularity, in which the language peaked in 8th place in August 2020.

[95mPython (programming language) (3000) -147.94357[0m
[4mhttps://en.wikipedia.org/wiki?curid=23862[0m
Due to Python's extensive mathematics library, and the third-party libr