# Databases using weaviate

Using a vector database (Weaviate): populating it and running queries against it.

https://weaviate.io/developers/weaviate/quickstart

## Setup

In [1]:
# Weaviate Cloud endpoint and API credentials.
#
# Secrets must not be stored in the notebook: they are read from environment
# variables instead. Any key that was previously hard-coded here has been
# exposed and should be rotated.
import os

# Weaviate Cloud cluster URL
endpoint = os.environ.get("WCD_URL", "")
# Weaviate Cloud API keys (presumably an admin key and a read-only key,
# going by the variable names)
key_admin = os.environ.get("WCD_API_KEY", "")
key_read = os.environ.get("WCD_READ_API_KEY", "")

# OpenAI API key, sent to Weaviate in the "X-OpenAI-Api-Key" header
openAI = os.environ.get("OPENAI_API_KEY", "")

# Warn early instead of failing later with a less obvious connection error.
_missing_settings = [
    env_name
    for env_name, value in (
        ("WCD_URL", endpoint),
        ("WCD_API_KEY", key_admin),
        ("OPENAI_API_KEY", openAI),
    )
    if not value
]
if _missing_settings:
    print("Warning: missing environment variables: " + ", ".join(_missing_settings))

In [5]:
# pip install -U weaviate-client

In [2]:
import weaviate
from weaviate.classes.init import Auth
import os

# Connectivity check: connect to the Weaviate Cloud cluster and print whether
# it reports itself as ready.
# `endpoint` and `key_admin` are defined in the setup cell of this notebook.
wcd_url = endpoint
wcd_api_key = key_admin

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=wcd_url,                          # Weaviate Cloud URL
    auth_credentials=Auth.api_key(wcd_api_key),   # Weaviate Cloud API key
)

try:
    print(client.is_ready())  # Should print: `True`
finally:
    # Release the connection even if the readiness check raises.
    client.close()

True


## Populate the database

Run this cell once to create the collection; after that it does not need to be run again.

In [3]:
import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure
import os

# Create the "Resumes" collection, configured to use OpenAI for both
# vectorization and generative queries.
# `endpoint` and `key_admin` are defined in the setup cell of this notebook.
wcd_url = endpoint
wcd_api_key = key_admin

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=wcd_url,                          # Weaviate Cloud URL
    auth_credentials=Auth.api_key(wcd_api_key),   # Weaviate Cloud API key
)

try:
    if client.collections.exists("Resumes"):
        # Creating a collection that already exists raises an error, so
        # re-running this cell simply reuses the existing collection.
        questions = client.collections.get("Resumes")
    else:
        questions = client.collections.create(
            name="Resumes",
            vectorizer_config=Configure.Vectorizer.text2vec_openai(),  # OpenAI embedding integration
            generative_config=Configure.Generative.openai(),           # OpenAI generative AI integration
        )
finally:
    # Release the connection even if the collection setup raises.
    client.close()

## Add objects

The resume structure is based on the JSON Resume schema: https://jsonresume.org/schema

In [17]:
import weaviate
from weaviate.classes.init import Auth
import requests, json, os
import json

def _resume_objects(resume):
    """Yield one flat property dict per resume entry to store in Weaviate.

    Args:
        resume: Parsed resume document (a dict). The sections read are
            "basics", "work", "education" and "skills"; missing sections
            are skipped.

    Yields:
        dict: Properties for a single object, tagged with a "category".
    """
    # In the JSON Resume layout "basics" is a single object, not a list.
    # Iterating it directly would yield only its key names, so its plain
    # string fields are stored as one object instead. Nested values
    # (non-string fields) are skipped.
    basics = resume.get("basics") or {}
    if isinstance(basics, dict):
        personal = {key: value for key, value in basics.items() if isinstance(value, str)}
        if personal:
            yield {"category": "Basics", **personal}

    # One object per work experience entry.
    for job in resume.get("work", []):
        yield {
            "category": "Work",  # tagged like the other sections
            "name": job.get("name", ""),
            "position": job.get("position", ""),
            "url": job.get("url", ""),
            "startDate": job.get("startDate", ""),
            "endDate": job.get("endDate", ""),
            # Highlights are a list; store them as one comma-separated string.
            "highlights": ", ".join(job.get("highlights", [])),
        }

    # One object per education entry.
    for edu in resume.get("education", []):
        yield {
            "category": "Education",
            "institution": edu.get("institution", ""),
            "startDate": edu.get("startDate", ""),
            "endDate": edu.get("endDate", ""),
        }

    # One object per skill entry.
    for skill in resume.get("skills", []):
        yield {
            "category": "Skill",
            "name": skill.get("name", ""),
            "keywords": ", ".join(skill.get("keywords", [])),
        }


# `endpoint`, `key_admin` and `openAI` are defined in the setup cell of this notebook.
wcd_url = endpoint
wcd_api_key = key_admin
openai_api_key = openAI

# Load the resume BEFORE connecting, so that a missing or malformed file
# cannot leave an open connection behind.
file_path = './../CV-Job-matching-main/json/resume_schema.json'
with open(file_path, 'r', encoding='utf-8') as file:
    resume_schema = json.load(file)

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=wcd_url,                            # Weaviate Cloud URL
    auth_credentials=Auth.api_key(wcd_api_key),     # Weaviate Cloud API key
    headers={"X-OpenAI-Api-Key": openai_api_key},   # OpenAI key used by the vectorizer
)

try:
    resumes = client.collections.get("Resumes")

    with resumes.batch.dynamic() as batch:
        for properties in _resume_objects(resume_schema):
            batch.add_object(properties)

    # Batch imports do not raise on per-object failures; report them explicitly.
    failed_objects = resumes.batch.failed_objects
    if failed_objects:
        print(f"Number of failed imports: {len(failed_objects)}")
        print(f"First failed object: {failed_objects[0]}")
finally:
    # Release the connection even if the import raises.
    client.close()

## Queries

In [31]:
from weaviate import client
def query_weaviate(query, collection_name, weaviate_client=None, limit=10, max_distance=0.7):
    """Run a semantic (near-text) search against a Weaviate collection.

    Uses the v4 collections API, matching the rest of this notebook. The v3
    GraphQL query builder (`client.query.get(...).do()`) is not available on
    the v4 client.

    Args:
        query: Natural-language text to search for.
        collection_name: Name of the collection to search.
        weaviate_client: Connected Weaviate v4 client. When omitted, the
            module-level `client` variable is used; it must be an open
            connection at call time.
        limit: Maximum number of objects to return.
        max_distance: Maximum vector distance for a result to be included.

    Returns:
        list[dict]: One dict per matching object, containing the object's
        properties plus an "_additional" dict with "certainty" and "distance".

    Raises:
        TypeError: If the client in use is not a connected Weaviate v4 client
            (for example the `weaviate.client` module).
    """
    active_client = client if weaviate_client is None else weaviate_client

    # Validate before doing anything else: `from weaviate import client`
    # binds the name `client` to a module, which cannot run queries.
    if not hasattr(active_client, "collections"):
        raise TypeError(
            "query_weaviate needs a connected Weaviate v4 client; "
            f"got {type(active_client).__name__!r}"
        )

    from weaviate.classes.query import MetadataQuery

    collection = active_client.collections.get(collection_name)
    response = collection.query.near_text(
        query=query,
        distance=max_distance,
        limit=limit,
        return_metadata=MetadataQuery(certainty=True, distance=True),
    )

    # Query errors surface as exceptions raised by the client.
    return [
        {
            **obj.properties,
            "_additional": {
                "certainty": obj.metadata.certainty,
                "distance": obj.metadata.distance,
            },
        }
        for obj in response.objects
    ]

In [32]:
import weaviate
from weaviate.classes.init import Auth
import os, json

# Semantic search: fetch the two objects closest to the query "biology".
# All credentials are read from environment variables; never paste API keys
# into the notebook.
wcd_url = os.environ["WCD_URL"]
wcd_api_key = os.environ["WCD_API_KEY"]
openai_api_key = os.environ["OPENAI_API_KEY"]

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=wcd_url,                            # Weaviate Cloud URL
    auth_credentials=Auth.api_key(wcd_api_key),     # Weaviate Cloud API key
    headers={"X-OpenAI-Api-Key": openai_api_key},   # OpenAI key used to vectorize the query
)

try:
    # NOTE(review): this queries the "Question" collection, not the "Resumes"
    # collection created earlier in this notebook - confirm this is intended.
    questions = client.collections.get("Question")

    response = questions.query.near_text(
        query="biology",
        limit=2
    )

    for obj in response.objects:
        print(json.dumps(obj.properties, indent=2))
finally:
    # Release the connection even if the query raises.
    client.close()

{
  "answer": "DNA",
  "question": "In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance",
  "category": "SCIENCE"
}
{
  "answer": "species",
  "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification",
  "category": "SCIENCE"
}


## RAG 

Retrieval augmented generation (RAG), also called generative search, combines the power of generative AI models such as large language models (LLMs) with the up-to-date truthfulness of a database.


In [34]:
import weaviate
from weaviate.classes.init import Auth
import os

# Retrieval augmented generation: retrieve the two objects closest to
# "biology" and run one generative task over the retrieved group.
# All credentials are read from environment variables; never paste API keys
# into the notebook.
wcd_url = os.environ["WCD_URL"]
wcd_api_key = os.environ["WCD_API_KEY"]
openai_api_key = os.environ["OPENAI_API_KEY"]

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=wcd_url,                            # Weaviate Cloud URL
    auth_credentials=Auth.api_key(wcd_api_key),     # Weaviate Cloud API key
    headers={"X-OpenAI-Api-Key": openai_api_key},   # OpenAI key used for retrieval and generation
)

try:
    # NOTE(review): this queries the "Question" collection, not the "Resumes"
    # collection created earlier in this notebook - confirm this is intended.
    questions = client.collections.get("Question")

    response = questions.generate.near_text(
        query="biology",
        limit=2,
        grouped_task="Write a tweet with emojis about these facts."
    )

    print(response.generated)  # Inspect the generated text
finally:
    # Release the connection even if the request raises.
    client.close()

🧬 In 1953 Watson & Crick built a model of the molecular structure of DNA, the gene-carrying substance! 🧬🔬

🦢 2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new species! 🦢🌿 #ScienceFacts #DNA #SpeciesDiscovery


## Additional properties

We can ask Weaviate to return _additional properties for any returned objects. This allows us to obtain properties such as the vector of each returned object as well as the actual certainty value, so we can verify how close each object is to our query vector. Here is a query that will return the certainty value:

The fields `id`, `vector`, `certainty`, `distance`, `featureProjection` and `classification` are available by default.


In [45]:
import weaviate
import weaviate.classes as wvc
import json

# This cell uses `os` and `Auth` but did not import them; import them here so
# the cell also runs on its own.
import os
from weaviate.classes.init import Auth

# Semantic search that also returns the certainty of each match.
# All credentials are read from environment variables; never paste API keys
# into the notebook.
wcd_url = os.environ["WCD_URL"]
wcd_api_key = os.environ["WCD_API_KEY"]
openai_api_key = os.environ["OPENAI_API_KEY"]

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=wcd_url,                            # Weaviate Cloud URL
    auth_credentials=Auth.api_key(wcd_api_key),     # Weaviate Cloud API key
    headers={"X-OpenAI-Api-Key": openai_api_key},   # OpenAI key used to vectorize the query
)

try:
    # NOTE(review): this queries the "Question" collection, not the "Resumes"
    # collection created earlier in this notebook - confirm this is intended.
    questions = client.collections.get("Question")
    response = questions.query.near_text(
        query="biology",
        limit=2,
        return_metadata=wvc.query.MetadataQuery(certainty=True)
    )

    for o in response.objects:
        print(o.metadata.certainty)  # Inspect metadata
        print(o.uuid)  # Inspect UUID (returned by default)
        print(o.properties)  # Inspect returned objects
finally:
    # Release the connection even if the query raises.
    client.close()

0.6285852193832397
6c464f61-181b-4e98-a063-f948ee3f6ae2
{'answer': 'DNA', 'question': 'In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance', 'category': 'SCIENCE'}
0.6066622734069824
0aee9827-7596-420f-aa8b-5297cfedb1d1
{'answer': 'species', 'question': "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification", 'category': 'SCIENCE'}
