In [10]:
import weaviate
import os
from dotenv import load_dotenv

# Pull environment variables (including the OpenAI key) from the project-level .env file.
load_dotenv("../.env")

openai_key = os.getenv("OPENAI_API_KEY")
if not openai_key:
    # Fail fast with a clear message instead of sending a header whose value is
    # None and hitting a harder-to-diagnose error later on.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to ../.env or export it before running this notebook"
    )

# The key is forwarded to the local Weaviate instance as a request header.
headers = {
    "X-OpenAI-Api-Key": openai_key,
}

# Connect to the locally running Weaviate instance.
# Call `client.close()` once the notebook is done with the connection.
client = weaviate.connect_to_local(
    headers=headers
)

In [11]:
# One-time setup for the "Paper" collection, kept commented out.
# NOTE(review): if re-enabled, this deletes any existing "Paper" collection
# (and everything stored in it) before recreating it with the properties below
# and an OpenAI text2vec vectorizer using "text-embedding-3-small".
# Presumably it is disabled so a populated collection is not wiped on re-run — confirm.

# from weaviate.classes.config import Property, DataType, Configure

# if client.collections.exists("Paper"):
#     client.collections.delete("Paper")

# client.collections.create(
#     name="Paper",
#     properties=[
#         Property(name="title", data_type=DataType.TEXT),
#         Property(name="paper_id", data_type=DataType.TEXT),
#         Property(name="create_date", data_type=DataType.DATE),
#         Property(name="abstract", data_type=DataType.TEXT),
#         Property(name="update_date", data_type=DataType.DATE),
#         Property(name="categories", data_type=DataType.TEXT_ARRAY),
#         Property(name="authors", data_type=DataType.TEXT_ARRAY),
#         Property(name="journal_ref", data_type=DataType.TEXT),
#         Property(name="doi", data_type=DataType.TEXT),
#         Property(name="report_no", data_type=DataType.TEXT),
#         Property(name="license", data_type=DataType.TEXT),
#         Property(name="comments", data_type=DataType.TEXT),
#     ],
#     vectorizer_config=Configure.Vectorizer.text2vec_openai(
#         model="text-embedding-3-small"
#     )
# )


In [12]:
import json
from tqdm import tqdm

# Read the pre-processed paper records from disk.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open("../data/aiml_papers_processed.json", encoding="utf-8") as f:
    papers = json.load(f)

print(f"Loaded {len(papers)} papers")

# Handle to the "Paper" collection (not a single paper); the insert and query
# cells below go through this object.
paper = client.collections.get("Paper")



Loaded 243352 papers


            Please make sure to close the connection using `client.close()`.


In [13]:
# Insert the first 35% of the loaded papers into the "Paper" collection.
total_papers = len(papers)
first_35_percent = int(total_papers * 0.35)
papers_to_insert = papers[:first_35_percent]
print(f"Inserting {len(papers_to_insert)} papers")

with paper.batch.dynamic() as batch:
    for obj in tqdm(papers_to_insert):
        batch.add_object(obj)

# Per-object failures are collected on the batch rather than raised, so report
# them explicitly; otherwise dropped papers would go unnoticed.
failed_objects = paper.batch.failed_objects
if failed_objects:
    print(f"Failed to insert {len(failed_objects)} of {len(papers_to_insert)} papers")
    print(f"First failure: {failed_objects[0]}")

Inserting 85173 papers


100%|██████████| 85173/85173 [31:13<00:00, 45.45it/s]  


In [15]:
# Insert the middle slice of the loaded papers (from 35% up to 70%) into the
# "Paper" collection.
total_papers = len(papers)
first_35_percent = int(total_papers * 0.35)
second_35_percent = int(total_papers * 0.7)
papers_to_insert = papers[first_35_percent:second_35_percent]
print(f"Inserting {len(papers_to_insert)} papers")

with paper.batch.dynamic() as batch:
    for obj in tqdm(papers_to_insert):
        batch.add_object(obj)

# Per-object failures are collected on the batch rather than raised, so report
# them explicitly; otherwise dropped papers would go unnoticed.
failed_objects = paper.batch.failed_objects
if failed_objects:
    print(f"Failed to insert {len(failed_objects)} of {len(papers_to_insert)} papers")
    print(f"First failure: {failed_objects[0]}")

Inserting 85173 papers


100%|██████████| 85173/85173 [34:12<00:00, 41.49it/s]  


In [16]:
# Insert the remaining papers (everything from the 70% mark onward) into the
# "Paper" collection.
total_papers = len(papers)
second_35_percent = int(total_papers * 0.7)
papers_to_insert = papers[second_35_percent:]
print(f"Inserting {len(papers_to_insert)} papers")

with paper.batch.dynamic() as batch:
    for obj in tqdm(papers_to_insert):
        batch.add_object(obj)

# Per-object failures are collected on the batch rather than raised, so report
# them explicitly; otherwise dropped papers would go unnoticed.
failed_objects = paper.batch.failed_objects
if failed_objects:
    print(f"Failed to insert {len(failed_objects)} of {len(papers_to_insert)} papers")
    print(f"First failure: {failed_objects[0]}")

Inserting 73006 papers


100%|██████████| 73006/73006 [29:59<00:00, 40.58it/s]  


In [17]:
from weaviate.classes.query import MetadataQuery
import textwrap

# Handle to the "Paper" collection that the search runs against.
paper = client.collections.get("Paper")

# Semantic search for the five objects closest to the query text.
response = paper.query.near_text(
    query="ai and ml",
    limit=5,
    # Also return each matched object's vector (not the query's vector).
    include_vector=True,
    # Request both the distance and the certainty score for every hit.
    return_metadata=MetadataQuery(distance=True, certainty=True),
)

def print_objects(objects):
    """
        Print a short, human-readable summary of each retrieved object.

        For every item in ``objects`` the following lines are written to
        stdout: the UUID rendered as an integer, the distance and certainty
        from its metadata, the title, the creation date, the categories, and
        the abstract shortened to at most 100 characters. A blank line
        separates consecutive objects.

        Each item must expose ``uuid.int``, ``metadata.distance``,
        ``metadata.certainty`` and a ``properties`` mapping with the keys
        'title', 'create_date', 'categories' and 'abstract'.
    """
    for hit in objects:
        print(f"ID: {hit.uuid.int}")
        scores = hit.metadata
        print(f"Distance: {scores.distance}, Certainty: {scores.certainty}")
        fields = hit.properties
        print(f"Title: {fields['title']}")
        print(f"Date: {fields['create_date']}")
        print(f"Category: {fields['categories']}")
        # Collapse whitespace and truncate so long abstracts stay on one line.
        preview = textwrap.shorten(fields['abstract'], width=100)
        print(f"abstract: {preview}")
        print()


# Print a summary of each object returned by the near_text query.
print_objects(response.objects)

ID: 99809679363136159136294570168727535497
Distance: 0.5314494371414185, Certainty: 0.7342752814292908
Title: Advancing the Research and Development of Assured Artificial
  Intelligence and Machine Learning Capabilities
Date: 2020-09-24 20:12:14+00:00
Category: ['Machine Learning', 'Cryptography and Security', 'Computers and Society', 'Software Engineering']
abstract: Artificial intelligence (AI) and machine learning (ML) have become increasingly vital in the [...]

ID: 293577723185668284219179507023352147532
Distance: 0.534274697303772, Certainty: 0.732862651348114
Title: MLJ: A Julia package for composable machine learning
Date: 2020-07-23 22:46:33+00:00
Category: ['Machine Learning', 'Machine Learning (Statistics)']
abstract: MLJ (Machine Learing in Julia) is an open source software package providing a common interface [...]

ID: 105240753530877317663087347556395662047
Distance: 0.5371927618980408, Certainty: 0.7314035892486572
Title: Guidance on the Assurance of Machine Learning in