In [1]:
import os
import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure

# Load environment variables from .env file 
from dotenv import load_dotenv
load_dotenv()

# Access credentials from environment variables
weaviate_url = os.environ["WEAVIATE_URL"]
weaviate_api_key = os.environ["WEAVIATE_API_KEY"]

# Connect to Weaviate Cloud
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_api_key),
)

questions = client.collections.create(
    name="Question",
    vectorizer_config=Configure.Vectorizer.text2vec_weaviate(), # Configure the Weaviate Embeddings integration
    generative_config=Configure.Generative.cohere()             # Configure the Cohere generative AI integration
)

client.close()  # Free up resources


In [5]:
import os
import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure
import requests, json, os

# Load environment variables from .env file 
from dotenv import load_dotenv
load_dotenv()

# Access credentials from environment variables
weaviate_url = os.environ["WEAVIATE_URL"]
weaviate_api_key = os.environ["WEAVIATE_API_KEY"]

# Connect to Weaviate Cloud
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_api_key),
)

resp = requests.get(
    "https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json"
)

data = json.loads(resp.text)

questions = client.collections.get("Question")

with questions.batch.dynamic() as batch:
    for d in data:
        batch.add_object({
            "answer": d["Answer"],
            "question": d["Question"],
            "category": d["Category"],
        })
        if batch.number_errors > 10:
            print("Batch import stopped due to excessive errors.")
            break

failed_objects = questions.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")


client.close()  # Free up resources


In [11]:
import weaviate
from weaviate.classes.init import Auth
import os, json

# Best practice: store your credentials in environment variables
weaviate_url = os.environ["WEAVIATE_URL"]
weaviate_api_key = os.environ["WEAVIATE_API_KEY"]

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,                                    # Replace with your Weaviate Cloud URL
    auth_credentials=Auth.api_key(weaviate_api_key),             # Replace with your Weaviate Cloud key
)

questions = client.collections.get("Question")

response = questions.query.near_text(
    query="biology",
    limit=3
)

for obj in response.objects:
    print(json.dumps(obj.properties, indent=2))

client.close()  # Free up resources

{
  "answer": "Elephant",
  "question": "It's the only living mammal in the order Proboseidea",
  "category": "ANIMALS"
}
{
  "answer": "Elephant",
  "question": "It's the only living mammal in the order Proboseidea",
  "category": "ANIMALS"
}
{
  "answer": "Liver",
  "question": "This organ removes excess glucose from the blood & stores it as glycogen",
  "category": "SCIENCE"
}


In [16]:
import os
import weaviate
from weaviate.auth import AuthClientPassword

# Load environment variables from .env file (assumes you are using dotenv to manage environment variables)
from dotenv import load_dotenv
load_dotenv()

# Access credentials from environment variables
weaviate_url = os.environ["WEAVIATE_URL"]
weaviate_api_key = os.environ["WEAVIATE_API_KEY"]

# Connect to Weaviate Cloud
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_api_key),
)

client.collections.create("Papers")



# # Define the schema for the AcademicPaper class
# schema = {
#     "classes": [
#         {
#             "class": "AcademicPaper",
#             "description": "Represents an academic paper with details about its publication and content",
#             "properties": [
#                 {"name": "paperId", "dataType": ["string"], "description": "Unique ID of the paper"},
#                 {"name": "url", "dataType": ["string"], "description": "URL where the paper is hosted"},
#                 {"name": "date", "dataType": ["date"], "description": "Publication date of the paper"},
#                 {"name": "title", "dataType": ["string"], "description": "Title of the paper"},
#                 {"name": "abstract", "dataType": ["text"], "description": "Abstract or summary of the paper"},
#                 {"name": "domain", "dataType": ["string"], "description": "High-level domain of the paper (e.g., Physics, CS, etc.)"},
#                 {"name": "subdomain", "dataType": ["string"], "description": "More specific subdomain (e.g., NLP, Quantum Mechanics, etc.)"},
#                 {"name": "topic", "dataType": ["string"], "description": "Topic within the subdomain"},
#                 {"name": "author", "dataType": ["string"], "description": "Author(s) of the paper"}
#             ],
#             "vectorizer": "text2vec-contextionary"  # Optional, enables automatic vectorization if needed
#         }
#     ]
# }

# # Create the schema in Weaviate
# client.schema.create(schema)

# # If you have data to insert, use the data loader or manual insert here
# # For example:
# # client.data_object.create(
# #     data_object={
# #         "paperId": "12345",
# #         "url": "https://example.com/paper",
# #         "date": "2021-05-20",
# #         "title": "A New Approach to Quantum Computing",
# #         "abstract": "This paper explores...",
# #         "domain": "Physics",
# #         "subdomain": "Quantum Computing",
# #         "topic": "Quantum Algorithms",
# #         "author": "Jane Doe"
# #     },
# #     class_name="AcademicPaper"
# # )

client.close()  # Close the client to free up resources


In [None]:
from weaviate.classes.config import Property, DataType

# Assume 'client' is already created and connected to a Weaviate instance

# Connect to Weaviate Cloud
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_api_key),
)

# Define the schema for the "Papers" class with detailed properties
client.collections.create(
    "LLMPapers",
    properties = [
        Property(name="paperId", data_type=DataType.INT, description="Unique ID of the paper"),
        Property(name="url", data_type=DataType.TEXT, description="URL where the paper is hosted"),
        Property(name="date", data_type=DataType.DATE, description="Publication date of the paper"),
        Property(name="title", data_type=DataType.TEXT, description="Title of the paper"),
        Property(name="abstract", data_type=DataType.TEXT, description="Abstract or summary of the paper"),
        Property(name="domain", data_type=DataType.TEXT, description="High-level domain of the paper (e.g., Physics, CS, etc.)"),
        Property(name="subdomain", data_type=DataType.TEXT, description="More specific subdomain (e.g., NLP, Quantum Mechanics, etc.)"),
        Property(name="topic", data_type=DataType.TEXT, description="Topic within the subdomain")
    ]
)


# Alternatively, using the v3 client method as noted in the comment:
# client.data_object.schema.create_class(papers_schema)

client.close()  # Close the client to free up resources


In [32]:
import weaviate
from weaviate.classes.config import Property, DataType
from weaviate.classes.data import DataObject

# Connect to Weaviate Cloud
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_api_key),
)

reviews = client.collections.get("LLMPapers")  # This collection must have named vectors configured

reviews.data.insert({
    "paperId": 2,
    "url": "https://arxiv.org/abs/2302.00456",
    "date": "2023-02-15T00:00:00Z",
    "title": "A Comprehensive Survey on Large Language Models",
    "abstract": "This survey explores the rapid advancements in LLMs, including their architecture, training techniques, and real-world applications.",
    "domain": "Computer Science",
    "subdomain": "Machine Learning",
    "topic": "Survey on Large Language Models"
})

# Close the client after inserting the data
client.close()


In [38]:
import weaviate
from weaviate.classes.config import Property, DataType
from weaviate.classes.data import DataObject

# Connect to Weaviate Cloud
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_api_key),
)

collection = client.collections.get("LLMPapers")

for item in collection.iterator():
    print(item.uuid, item.properties)


client.close()


55b42d5e-1f24-4a10-83ba-9c73f129a2f4 {'date': datetime.datetime(2023, 2, 15, 0, 0, tzinfo=datetime.timezone.utc), 'topic': 'Survey on Large Language Models', 'url': 'https://arxiv.org/abs/2302.00456', 'abstract': 'This survey explores the rapid advancements in LLMs, including their architecture, training techniques, and real-world applications.', 'title': 'A Comprehensive Survey on Large Language Models', 'paperId': 2, 'domain': 'Computer Science', 'subdomain': 'Machine Learning'}
a9d675c4-c9f0-4768-8d0f-3f81039cf56d {'date': datetime.datetime(2023, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), 'topic': 'Transformer Architectures', 'url': 'https://arxiv.org/abs/2301.00001', 'abstract': 'This paper discusses the latest advancements in transformer-based language models...', 'title': 'Advancements in Large Language Models', 'paperId': 1, 'domain': 'Computer Science', 'subdomain': 'Natural Language Processing'}


In [5]:
from weaviate.classes.config import Property, DataType, ReferenceProperty
import weaviate
from weaviate.classes.init import Auth
import os
from dotenv import load_dotenv
import os
import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure

def setup_weaviate_schema():
    # Load environment variables
    load_dotenv()
    
    # Access credentials from environment variables
    weaviate_url = os.environ["WEAVIATE_URL"]
    weaviate_api_key = os.environ["WEAVIATE_API_KEY"]
    
    # Connect to Weaviate Cloud
    client = weaviate.connect_to_weaviate_cloud(
        cluster_url=weaviate_url,
        auth_credentials=Auth.api_key(weaviate_api_key),
    )

    # # ✅ Create "Domains" collection (Top-level category)
    # client.collections.create(
    #     "Domains",
    #     properties=[
    #         Property(name="name", data_type=DataType.TEXT, description="Name of the domain (e.g., Computer Science, Physics)"),
    #         Property(name="description", data_type=DataType.TEXT, description="Brief description of the domain"),
    #     ]
    # )

    # ✅ Create "Subdomains" collection (Linked to Domain)
    client.collections.create(
        "Subdomains",
        properties=[
            Property(name="name", data_type=DataType.TEXT, description="Name of the subdomain (e.g., NLP, Quantum Mechanics)"),
            Property(name="description", data_type=DataType.TEXT, description="Brief description of the subdomain"),
            ReferenceProperty(name="belongs_to_domain", target_collection="Domains", description="Reference to the parent domain"),
        ]
    )

    # ✅ Create "Topics" collection (Linked to Subdomain)
    client.collections.create(
        "Topics",
        properties=[
            Property(name="name", data_type=DataType.TEXT, description="Name of the topic (e.g., Transformer Models, BERT)"),
            Property(name="description", data_type=DataType.TEXT, description="Brief description of the topic"),
            ReferenceProperty(name="belongs_to_subdomain", target_collection="Subdomains", description="Reference to the parent subdomain"),
        ]
    )

    client.close()  # Close the client to free up resource

In [6]:
setup_weaviate_schema()

WeaviateInvalidInputError: Invalid input provided: Invalid collection config create parameters: 1 validation error for _CollectionConfigCreate
properties.2
  Input should be a valid dictionary or instance of Property [type=model_type, input_value=ReferenceProperty(name='b...e to the parent domain'), input_type=ReferenceProperty]
    For further information visit https://errors.pydantic.dev/2.10/v/model_type.

In [3]:
from weaviate.classes.config import Property, DataType
import weaviate

# Connect to local Weaviate instance using the correct method
client = weaviate.connect_to_local(
    host="127.0.0.1",  # Use a string to specify the host
    port=8080,
    grpc_port=50051,
)

print(client.is_ready())

# Only close Weaviate when explicitly called
def close_weaviate_client():
    global client
    if client is not None:
        client.close()
        print("✅ Weaviate connection closed.")



# Create the "Topic" collection with a subdomain_id field instead of an explicit reference
client.collections.create(
    "Topic",
    properties=[
        Property(name="name", data_type=DataType.TEXT, description="Name of the topic"),
        Property(name="description", data_type=DataType.TEXT, description="Detailed description of the topic"),
        Property(name="subdomain_id", data_type=DataType.INT, description="ID of the subdomain this topic belongs to"),
    ]
)


True


<weaviate.collections.collection.sync.Collection at 0x107811b50>

In [6]:


from weaviate.classes.config import Property, DataType
import weaviate
import json
import numpy as np

# Connect to local Weaviate instance using the correct method
client = weaviate.connect_to_local(
    host="127.0.0.1",  # Use a string to specify the host
    port=8080,
    grpc_port=50051,
)

print(client.is_ready())


client.collections.delete("LLMPapers")  # THIS WILL DELETE THE SPECIFIED COLLECTION(S) AND THEIR OBJECTS


# Only close Weaviate when explicitly called
def close_weaviate_client():
    global client
    if client is not None:
        client.close()
        print("✅ Weaviate connection closed.")


# # Create the "LLMPapers" collection with appropriate properties
# client.collections.create(
#     "LLMPapers",
#     properties=[
#         Property(name="topic", data_type=DataType.TEXT, description="Topic of the paper"),
#         Property(name="paperId", data_type=DataType.INT, description="Unique paper identifier"),
#         Property(name="url", data_type=DataType.TEXT, description="Paper URL"),
#         Property(name="date", data_type=DataType.DATE, description="Publication date in RFC3339 format"),
#         Property(name="title", data_type=DataType.TEXT, description="Paper title"),
#         Property(name="abstract", data_type=DataType.TEXT, description="Paper abstract"),
#         Property(name="domain", data_type=DataType.TEXT, description="Main domain of the paper"),
#         Property(name="subdomain", data_type=DataType.TEXT, description="Subdomain of the paper"),
#         Property(name="personId", data_type=DataType.INT, description="Unique person identifier"),
#     ],
#     vector_index_config={"dimension": 384}  # Force vector embeddings to be of size 384
# )

True


In [8]:
# Create the "LLMPapers" collection with appropriate properties
client.collections.create(
    "LLMPapers",
    properties=[
        Property(name="topic", data_type=DataType.TEXT, description="Topic of the paper"),
        Property(name="paperId", data_type=DataType.INT, description="Unique paper identifier"),
        Property(name="url", data_type=DataType.TEXT, description="Paper URL"),
        Property(name="date", data_type=DataType.DATE, description="Publication date in RFC3339 format"),
        Property(name="title", data_type=DataType.TEXT, description="Paper title"),
        Property(name="abstract", data_type=DataType.TEXT, description="Paper abstract"),
        Property(name="domain", data_type=DataType.TEXT, description="Main domain of the paper"),
        Property(name="subdomain", data_type=DataType.TEXT, description="Subdomain of the paper"),
        Property(name="personId", data_type=DataType.INT, description="Unique person identifier"),
    ],
    vector_index_config = {
    "vectorIndexType": "hnsw",
    "distanceMetric": "cosine",
    "dimension": 384  # Ensure embeddings are of size 384
}
)

WeaviateInvalidInputError: Invalid input provided: Invalid collection config create parameters: 3 validation errors for _CollectionConfigCreate
vector_index_config.distance
  Field required [type=missing, input_value={'vectorIndexType': 'hnsw...sine', 'dimension': 384}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing
vector_index_config.multivector
  Field required [type=missing, input_value={'vectorIndexType': 'hnsw...sine', 'dimension': 384}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing
vector_index_config.quantizer
  Field required [type=missing, input_value={'vectorIndexType': 'hnsw...sine', 'dimension': 384}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing.

In [32]:
from weaviate.classes.query import MetadataQuery
from sentence_transformers import SentenceTransformer
from weaviate.classes.config import Property, DataType
import weaviate
import json
import numpy as np

# Connect to local Weaviate instance using the correct method
client = weaviate.connect_to_local(
    host="127.0.0.1",  # Use a string to specify the host
    port=8080,
    grpc_port=50051,
)

print(client.is_ready())

# Load the MiniLM model for embedding generation
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Convert query text into an embedding
query_text = "ethics"
query_embedding = model.encode(query_text).tolist()  # Convert to a list for Weaviate
print(query_embedding)

# Perform a near-vector search
jeopardy = client.collections.get("LLMPapers")
response = jeopardy.query.near_vector(
    near_vector=query_embedding, # your query vector goes here
    limit=10,
    return_metadata=MetadataQuery(distance=True)
)

for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)




True
[-0.020419945940375328, 0.10838940739631653, -0.0693139061331749, -0.00044743515900336206, -0.012574478052556515, 0.0053588286973536015, 0.00113482226151973, -0.024420639500021935, 0.008541901595890522, 0.05775546282529831, 0.025432869791984558, 0.06511110067367554, -0.05193061754107475, 0.007630366366356611, -0.0450141467154026, 0.011485394090414047, 0.0018918898422271013, -0.01856875978410244, -0.13263721764087677, 0.014632257632911205, -0.025000253692269325, 0.025366440415382385, 0.06743589788675308, 0.01333522517234087, -0.06069202721118927, 0.02662532590329647, 0.041455212980508804, -0.026412110775709152, 0.051973190158605576, -0.07169979810714722, 0.0047139935195446014, 0.04068607836961746, 0.14197473227977753, -0.008641643449664116, -0.028668934479355812, -0.005752422846853733, 0.0021747241262346506, -0.03386755287647247, 0.023472944274544716, -0.02973015606403351, 0.017496347427368164, -0.06414977461099625, -0.024260174483060837, -0.03623293712735176, -0.011352846398949623

In [35]:
import weaviate

try:
    # Connect to Weaviate instance
    client = weaviate.connect_to_local(
        host="127.0.0.1",
        port=8080,
        grpc_port=50051,
    )

    if not client.is_ready():
        raise ConnectionError("Weaviate client is not ready.")

    # Perform a BM25 search
    jeopardy = client.collections.get("LLMPapers")
    response = jeopardy.query.bm25(
        query="drug",
        limit=3
    )

    # Print results
    for obj in response.objects:
        print(obj.properties)

finally:
    # Ensure the client is properly closed
    client.close()


{'date': datetime.datetime(2023, 10, 23, 0, 0, tzinfo=datetime.timezone.utc), 'topic': 'Molecular and Protein Representation Learning', 'url': 'https://arxiv.org/pdf/https://arxiv.org/pdf/2308.0692', 'abstract': "  The birth of ChatGPT, a cutting-edge language model-based chatbot developed\nby OpenAI, ushered in a new era in AI. However, due to potential pitfalls, its\nrole in rigorous scientific research is not clear yet. This paper vividly\nshowcases its innovative application within the field of drug discovery.\nFocused specifically on developing anti-cocaine addiction drugs, the study\nemploys GPT-4 as a virtual guide, offering strategic and methodological\ninsights to researchers working on generative models for drug candidates. The\nprimary objective is to generate optimal drug-like molecules with desired\nproperties. By leveraging the capabilities of ChatGPT, the study introduces a\nnovel approach to the drug discovery process. This symbiotic partnership\nbetween AI and research