# Pre-Retrieval Strategies



## Query Transformation

In [85]:
import json

# Read the article corpus. The encoding is set explicitly so the result does
# not depend on the platform's default text encoding.
with open("./articles.json", "r", encoding="utf-8") as f:
    articles = json.load(f)

# One entry per article (duplicates kept); every article is expected to carry
# both a "category" and an "author" key.
article_categories = [article["category"] for article in articles]
authors = [article["author"] for article in articles]

# Show the distinct authors. Sorted so the displayed list is stable across
# kernel restarts (iteration order of a set of strings is not).
sorted(set(authors))

['Analytics Vidhya',
 'MDN Web Docs',
 'GeeksForGeeks',
 'KDnuggets',
 'Smashing Magazine',
 'Towards Data Science',
 'TechCrunch',
 'Codecademy',
 'Krebs on Security']

In [86]:
from enum import Enum

# Closed vocabularies the structured query is allowed to use.
article_categories = [
    'Cybersecurity',
    'Data Science',
    'ML',
    'Programming',
    'Web Development',
]
authors = [
    'Towards Data Science',
    'Smashing Magazine',
    'Codecademy',
    'MDN Web Docs',
    'Krebs on Security',
    'GeeksForGeeks',
    'Analytics Vidhya',
    'KDnuggets',
    'TechCrunch',
]

# Build the Enum classes through the functional API from (member name, value)
# pairs: the member name is the label upper-cased with spaces turned into
# underscores, the value is the original label.
ArticleCategory = Enum(
    'ArticleCategory',
    [(label.replace(' ', '_').upper(), label) for label in article_categories],
)
Author = Enum(
    'Author',
    [(label.replace(' ', '_').upper(), label) for label in authors],
)


In [87]:
from pydantic import BaseModel, Field
from datetime import date
from typing import Literal, Optional, List

# System prompt passed to transform_query: tells the model to map a free-text
# query onto the Article schema and describes each of its fields.
PROMPT = """ 
    Your task is to transform a given query into a schema/structure given.
    This schema will be used to filter articles based on the given query.

    Here is a description of the parameters:
    - rewritten_query: rewrite query that is more keyword based like it is for a search engine.
    - author: Only if any particular author of the article mentioned in the query.
    - category: Only if any particular category of the article mentioned in the query
    - date_range: Create a date range if there is any mention of time period/date in the query. like a year or month.
"""

# Time window extracted from a query; used as the type of Article.date_range.
# Both bounds are plain strings, so no date validation happens here. The sample
# outputs in this notebook show "YYYY-MM-DD" values or empty strings.
class DateRange(BaseModel):
    start_date: str  # first day of the period
    # Last day of the period.
    # NOTE(review): the name is not symmetric with start_date; renaming it
    # would change the schema sent to the model and the attribute callers read.
    end: str

# Structured form of a user query; passed as response_format to transform_query
# and later usable to filter articles.
class Article(BaseModel):
    # Keyword-style restatement of the query, phrased for a search engine.
    rewritten_query: str
    # Set only when the query names one of the known authors, else None.
    author: Optional[Author]
    # Set only when the query names one of the known categories, else None.
    category: Optional[ArticleCategory]
    # Nullable like author/category: PROMPT asks for a range only when the
    # query mentions a time period. As a required field the model had to
    # return empty strings or invent dates for queries without one.
    date_range: Optional[DateRange]


In [88]:
from openai import OpenAI
from dotenv import load_dotenv

# Load variables from the .env file in the parent of the working directory.
# This must run before the client is built.
# NOTE(review): presumably this supplies the API key OpenAI() picks up from
# the environment — confirm.
load_dotenv("./../.env")

# Module-level client shared by transform_query.
client = OpenAI()

def transform_query(query: str, system_prompt: str, response_format, model: str = "gpt-4o-mini"):
    """Turn a free-text query into a structured object using the chat parse API.

    Args:
        query: The user's question; sent as the user message.
        system_prompt: Instructions for the model; sent as the system message.
        response_format: Schema class forwarded unchanged as ``response_format``
            (this notebook passes the ``Article`` model).
        model: Chat model name. Defaults to "gpt-4o-mini", the value that was
            previously hard-coded, so existing three-argument calls behave the same.

    Returns:
        The ``parsed`` attribute of the first choice's message.
        NOTE(review): presumably this is None when the model refuses to
        answer — confirm against the SDK documentation.
    """
    response = client.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ],
        response_format=response_format,
    )

    # Only the first choice is used; no `n` is passed to request more.
    return response.choices[0].message.parsed


# Smoke test: a plain definitional question with no author, category or date.
response = transform_query(
    query="what is machine learning?",
    system_prompt=PROMPT,
    response_format=Article,
)
print(response)

rewritten_query='definition of machine learning' author=None category=None date_range=DateRange(start_date='', end='')


In [89]:
# Topic-only query: mentions a category but no author or explicit time period.
query = "what has been going on in the world of cybersecurity?"
response = transform_query(
    query=query,
    system_prompt=PROMPT,
    response_format=Article,
)
response

Article(rewritten_query='updates in cybersecurity news', author=None, category=<ArticleCategory.CYBERSECURITY: 'Cybersecurity'>, date_range=DateRange(start_date='2023-01-01', end='2023-10-31'))

In [90]:
# Query that names both an author and a month, exercising author + date_range.
query = "what are articles from tech crunch in july 2023?"
response = transform_query(
    query=query,
    system_prompt=PROMPT,
    response_format=Article,
)
response

Article(rewritten_query='tech crunch articles July 2023', author=<Author.TECHCRUNCH: 'TechCrunch'>, category=None, date_range=DateRange(start_date='2023-07-01', end='2023-07-31'))