# Pre-Retrieval Strategies



## Query Transformation

In [1]:
import json

# Read the article corpus from disk.
with open("./articles.json", "r") as f:
    articles = json.load(f)

# Gather the category and author of every article (duplicates included).
article_categories = [entry["category"] for entry in articles]
authors = [entry["author"] for entry in articles]

# Display the distinct authors found in the corpus.
list(set(authors))

['TechCrunch',
 'MDN Web Docs',
 'Towards Data Science',
 'Smashing Magazine',
 'Krebs on Security',
 'Codecademy',
 'GeeksForGeeks',
 'KDnuggets']

In [2]:
from enum import Enum

# Allowed filter values. The enums below are generated from these lists, so a
# new entry here becomes a new enum member.
article_categories = [
    'Cybersecurity',
    'Data Science',
    'ML',
    'Programming',
    'Web Development',
]
authors = [
    'Towards Data Science',
    'Smashing Magazine',
    'Codecademy',
    'MDN Web Docs',
    'Krebs on Security',
    'GeeksForGeeks',
    'Analytics Vidhya',
    'KDnuggets',
    'TechCrunch',
]

# Member names are the upper-cased labels with spaces replaced by underscores
# (e.g. 'Data Science' -> DATA_SCIENCE); member values keep the original label.
ArticleCategory = Enum(
    'ArticleCategory',
    [(label.upper().replace(' ', '_'), label) for label in article_categories],
)
Author = Enum(
    'Author',
    [(label.upper().replace(' ', '_'), label) for label in authors],
)


In [3]:
from pydantic import BaseModel
from typing import Optional

# System prompt for the query-transformation step. It describes each field of
# the structured output the model must fill in. The date format and the
# "empty string" convention are spelled out because the retrieval step parses
# the dates with "%Y-%m-%d" and treats an empty string as "no bound".
PROMPT = """ 
    Your task is to transform a given query into a schema/structure given.
    This schema will be used to filter articles based on the given query.

    Here is a description of the parameters:
    - rewritten_query: rewrite query that is more keyword based like it is for a search engine.
    - author: Only if any particular author of the article mentioned in the query.
    - category: Only if any particular category of the article mentioned in the query
    - date_range: Create a date range if there is any mention of time period/date in the query. like a year or month.
      Write start_date and end as YYYY-MM-DD. If the query mentions no time period, set both to empty strings.
"""

# Date window extracted from a query.
# Both fields are plain strings: the retrieval step parses them with the
# "%Y-%m-%d" format and skips a bound whose string is empty.
# NOTE(review): the field names are asymmetric (start_date vs. end); the
# retrieval code reads `.end`, so renaming would have to be done in both places.
class DateRange(BaseModel):
    start_date: str  # lower bound of the range, or '' when absent
    end: str  # upper bound of the range, or '' when absent

# Structured form of a user query; passed as the response format when the
# query is transformed, and consumed by the retrieval step.
# NOTE(review): documented with comments rather than a docstring because
# pydantic presumably copies a model docstring into the JSON schema that is
# sent to the model — confirm before adding one.
class ArticleQuery(BaseModel):
    rewritten_query: str  # keyword-style rewrite used as the search text
    author: Optional[Author]  # None when the query names no author
    category: Optional[ArticleCategory]  # None when the query names no category
    date_range: DateRange  # always present; its fields may be empty strings


In [4]:
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables from ./../.env (path relative to the working
# directory). Presumably this is where the OpenAI credentials come from — confirm.
load_dotenv("./../.env")

def transform_query(query, system_prompt, response_format, model="gpt-4o-mini"):
    """Turn a natural-language query into a structured ``response_format`` object.

    Sends ``system_prompt`` and ``query`` to the chat-completions ``parse``
    endpoint and returns the parsed object of the first choice.

    Args:
        query: The user's question, sent as the user message.
        system_prompt: Instructions describing the target schema, sent as the
            system message.
        response_format: The schema class the reply is parsed into.
        model: Name of the chat model to use. Defaults to ``"gpt-4o-mini"``.

    Returns:
        The parsed object from the first choice of the completion.

    Raises:
        ValueError: If the completion carries no parsed object (for example
            because the model refused), so callers do not receive ``None``.
    """
    # Named ``openai_client`` so it cannot be confused with the module-level
    # Weaviate ``client`` created later in the notebook.
    openai_client = OpenAI()
    completion = openai_client.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ],
        response_format=response_format,
    )

    message = completion.choices[0].message
    if message.parsed is None:
        # Surface the refusal text when the SDK provides one.
        refusal = getattr(message, "refusal", None)
        detail = f": {refusal}" if refusal else ""
        raise ValueError(
            f"Model returned no structured output for query {query!r}{detail}"
        )
    return message.parsed


# Demo 1: a plain question that names no author and no time period.
response1 = transform_query(
    query="what is machine learning?",
    system_prompt=PROMPT,
    response_format=ArticleQuery,
)
print(response1)

rewritten_query='definition of machine learning' author=None category=<ArticleCategory.ML: 'ML'> date_range=DateRange(start_date='', end='')


In [5]:
# Demo 2: a topic combined with a year.
query = "cyber security in 2022"
response2 = transform_query(
    query=query,
    system_prompt=PROMPT,
    response_format=ArticleQuery,
)
response2

ArticleQuery(rewritten_query='cybersecurity 2022', author=None, category=<ArticleCategory.CYBERSECURITY: 'Cybersecurity'>, date_range=DateRange(start_date='2022-01-01', end='2022-12-31'))

In [6]:
# Demo 3: a query that names an author and a year.
query = "what are articles from Towards Data Science in 2022"
response3 = transform_query(
    query=query,
    system_prompt=PROMPT,
    response_format=ArticleQuery,
)
response3

ArticleQuery(rewritten_query='articles from Towards Data Science in 2022', author=<Author.TOWARDS_DATA_SCIENCE: 'Towards Data Science'>, category=None, date_range=DateRange(start_date='2022-01-01', end='2022-12-31'))

If the embedded Weaviate instance is still running from a previous session and you need to reconnect, free its port first by running `kill -9 $(lsof -t -i :8079)` in a terminal.

In [7]:
import weaviate
from dotenv import load_dotenv
import os
import socket

def is_port_in_use(port):
    """Report whether something accepts TCP connections on 127.0.0.1:``port``.

    Args:
        port: TCP port number to probe on the loopback address.

    Returns:
        True if the connection attempt succeeds, False otherwise.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # connect_ex reports failure through its return code instead of raising.
        status = probe.connect_ex(('127.0.0.1', port))
    finally:
        probe.close()
    return status == 0

load_dotenv("./../.env")

# Default HTTP / gRPC ports of Weaviate's embedded instance.
EMBEDDED_HTTP_PORT = 8079
EMBEDDED_GRPC_PORT = 50050

# The OpenAI key is forwarded to Weaviate through a request header.
weaviate_headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY")}

if is_port_in_use(EMBEDDED_HTTP_PORT):
    # Something is already listening on the embedded port (typically an
    # embedded instance left over from an earlier run of this cell). Starting
    # a second one would fail, so attach to the running instance instead.
    client = weaviate.connect_to_local(
        port=EMBEDDED_HTTP_PORT,
        grpc_port=EMBEDDED_GRPC_PORT,
        headers=weaviate_headers,
    )
else:
    client = weaviate.connect_to_embedded(
        port=EMBEDDED_HTTP_PORT,
        grpc_port=EMBEDDED_GRPC_PORT,
        headers=weaviate_headers,
    )

# Handle to the "Article" collection used by the retrieval cells.
article = client.collections.get("Article")


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-09-25T12:24:43+05:30"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-09-25T12:24:43+05:30"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-09-25T12:24:43+05:30"}
{"level":"info","msg":"module offload-s3 is enabled","time":"2024-09-25T12:24:43+05:30"}
{"level":"info","msg":"open cluster service","servers":{"Embedded_at_8079":53536},"time":"2024-09-25T12:24:43+05:30"}
{"address":"172.22.13.215:53537","level":"info","msg":"starting cloud rpc server ...","time":"2024-09-25T12:24:43+05:30"}
{"level":"info","msg":"starting raft sub-system ...","time":"2024-09-25T12:2

In [8]:
import textwrap

def print_objects(objects):
    """Print a readable summary of each retrieved object.

    For every object this prints its UUID as an integer, the metadata entries
    that are not None (floats rounded to two decimals), the title, date,
    category and author properties, and the body shortened to 100 characters,
    followed by a blank line.

    Args:
        objects: Iterable of objects exposing ``uuid``, ``metadata`` and
            ``properties`` (a mapping with 'title', 'date', 'category',
            'author' and 'body').
    """
    for item in objects:
        print(f"ID: {item.uuid.int}")

        # One single-key dict per metadata field that is actually set.
        metadata = []
        for key, value in item.metadata.__dict__.items():
            if value is None:
                continue
            shown = round(value, 2) if isinstance(value, float) else value
            metadata.append({key: shown})
        print(f"Metadata: {metadata}")

        props = item.properties
        print(f"Title: {props['title']}")
        print(f"Date: {props['date']}")
        print(f"Category: {props['category']}")
        print(f"Author: {props['author']}")
        print(f"Body: {textwrap.shorten(props['body'], width=100)}")
        print()

In [9]:
from weaviate.classes.query import Filter
from datetime import datetime, timezone
from weaviate.classes.query import MetadataQuery

def _parse_utc_day(value):
    """Parse a ``YYYY-MM-DD`` string into a UTC-aware datetime; None if empty."""
    if not value:
        return None
    return datetime.strptime(value, "%Y-%m-%d").replace(tzinfo=timezone.utc)


def retrieve(params: ArticleQuery, limit: int = 3):
    """Run a filtered vector search over the ``article`` collection.

    Builds one filter per constraint present in ``params`` (author, category,
    start of date range, end of date range), AND-combines them, and performs a
    ``near_text`` search with the rewritten query.

    The date range is inclusive on both ends: the start bound uses
    ``greater_or_equal`` and the end bound is "before midnight of the following
    day", so articles dated on the first or last day are not dropped.

    Args:
        params: Structured query. Empty ``start_date`` / ``end`` strings and a
            missing ``date_range`` mean "no date bound".
        limit: Maximum number of objects to return. Defaults to 3.

    Returns:
        The ``objects`` of the ``near_text`` query response.

    Raises:
        ValueError: If a non-empty date string is not in ``YYYY-MM-DD`` format.
    """
    from datetime import timedelta  # local import: only needed for the end bound

    date_range = params.date_range
    start_date = _parse_utc_day(date_range.start_date) if date_range is not None else None
    end_date = _parse_utc_day(date_range.end) if date_range is not None else None

    # One condition per constraint that is actually present.
    conditions = []
    if params.author:
        conditions.append(Filter.by_property("author").equal(params.author.value))
    if params.category:
        conditions.append(Filter.by_property("category").equal(params.category.value))
    if start_date:
        conditions.append(Filter.by_property("date").greater_or_equal(start_date))
    if end_date:
        # end_date is midnight at the START of the last day; comparing against
        # the next midnight keeps every timestamp on that last day.
        conditions.append(
            Filter.by_property("date").less_than(end_date + timedelta(days=1))
        )

    # AND-combine the conditions; stays None when there is nothing to filter on.
    filters = None
    for condition in conditions:
        filters = condition if filters is None else filters & condition

    chunks = article.query.near_text(
        query=params.rewritten_query,
        filters=filters,
        return_metadata=MetadataQuery(distance=True, certainty=True),
        limit=limit,
    )

    return chunks.objects

# Retrieve and display results for the first transformed query.
print(f"Query: {response1}\n")
chunks = retrieve(params=response1)
print_objects(objects=chunks)


Query: rewritten_query='definition of machine learning' author=None category=<ArticleCategory.ML: 'ML'> date_range=DateRange(start_date='', end='')



{"action":"telemetry_push","level":"info","msg":"telemetry started","payload":"\u0026{MachineID:d52e011e-8305-416e-8f39-2e6299413b63 Type:INIT Version:1.26.1 NumObjects:0 OS:darwin Arch:arm64 UsedModules:[text2vec-openai]}","time":"2024-09-25T12:24:46+05:30"}
{"action":"lsm_recover_from_active_wal_success","class":"Article","index":"article","level":"info","msg":"successfully recovered from write-ahead-log","path":"/Users/vishwasgowda/.local/share/weaviate/article/PHiJNusYv2bT/lsm/objects/segment-1727247230532865000.wal","shard":"PHiJNusYv2bT","time":"2024-09-25T12:24:46+05:30"}
{"action":"lsm_recover_from_active_wal_success","class":"Article","index":"article","level":"info","msg":"successfully recovered from write-ahead-log","path":"/Users/vishwasgowda/.local/share/weaviate/article/PHiJNusYv2bT/lsm/property_title/segment-1727247230533512000.wal","shard":"PHiJNusYv2bT","time":"2024-09-25T12:24:46+05:30"}
{"action":"lsm_recover_from_active_wal_success","class":"Article","index":"articl

ID: 235959584379154997552071134050082173029
Metadata: [{'distance': 0.34}, {'certainty': 0.83}]
Title: Machine Learning Basics
Date: 2023-03-05 00:00:00+00:00
Category: ML
Author: TechCrunch
Body: Machine learning is a branch of artificial intelligence that focuses on building systems that [...]

ID: 66177066615778596449200477828369435490
Metadata: [{'distance': 0.36}, {'certainty': 0.82}]
Title: Machine Learning
Date: 2021-10-01 00:00:00+00:00
Category: ML
Author: Towards Data Science
Body: Machine learning is a subset of artificial intelligence (AI) that provides systems the ability [...]

ID: 37499734945959922522607306974367251245
Metadata: [{'distance': 0.48}, {'certainty': 0.76}]
Title: Introduction to Deep Learning
Date: 2022-07-10 00:00:00+00:00
Category: ML
Author: Towards Data Science
Body: Deep learning is a subset of machine learning that uses neural networks with many layers to [...]



In [10]:
# Retrieve and display results for the second transformed query.
print(f"Query: {response2}\n")
chunks = retrieve(params=response2)
print_objects(objects=chunks)

Query: rewritten_query='cybersecurity 2022' author=None category=<ArticleCategory.CYBERSECURITY: 'Cybersecurity'> date_range=DateRange(start_date='2022-01-01', end='2022-12-31')

ID: 111770961923860747703082376322890795880
Metadata: [{'distance': 0.48}, {'certainty': 0.76}]
Title: The Importance of Cybersecurity
Date: 2022-12-12 00:00:00+00:00
Category: Cybersecurity
Author: Krebs on Security
Body: Cybersecurity involves protecting computer systems and networks from information disclosure, [...]

ID: 9620050475437321785307488639092868804
Metadata: [{'distance': 0.55}, {'certainty': 0.72}]
Title: Cryptography
Date: 2022-08-25 00:00:00+00:00
Category: Cybersecurity
Author: TechCrunch
Body: Cryptography is the study and practice of securing communication and data from unauthorized [...]



In [11]:
# Retrieve and display results for the third transformed query.
print(f"Query: {response3}\n")
chunks = retrieve(params=response3)
print_objects(objects=chunks)

Query: rewritten_query='articles from Towards Data Science in 2022' author=<Author.TOWARDS_DATA_SCIENCE: 'Towards Data Science'> category=None date_range=DateRange(start_date='2022-01-01', end='2022-12-31')

ID: 37499734945959922522607306974367251245
Metadata: [{'distance': 0.55}, {'certainty': 0.73}]
Title: Introduction to Deep Learning
Date: 2022-07-10 00:00:00+00:00
Category: ML
Author: Towards Data Science
Body: Deep learning is a subset of machine learning that uses neural networks with many layers to [...]

