In [46]:
### PHASE 1: Data Loading & Pinecone Indexing

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
## Step 1: Install Required Libraries

In [5]:
!pip install -q openai pinecone-client pandas


In [6]:
## Step 2: Load the CSV Data

In [7]:
import pandas as pd

# Load your CSV file
df = pd.read_csv("/content/drive/MyDrive/Bridged/sample_data.csv")
df.head()


Unnamed: 0,pageURL,title,publishedDate,author,tags
0,https://www.indiatoday.in/sports/cricket/story...,"IPL 2025: Ruthless MI top table, knock Rajasth...",2025-05-01T23:12:07.000000Z,Jane Doe,"[""#IPL2025"", ""#MumbaiIndians"", ""#RajasthanRoya..."
1,https://www.indiatoday.in/sports/cricket/story...,Shubman Gill likely to play vs SRH despite bac...,2025-05-01T22:30:24.000000Z,Jane Doe,"[""#ShubmanGill"", ""#GujaratTitans"", ""#IPLInjur..."
2,https://www.indiatoday.in/sports/cricket/story...,IPL 2025: Vaibhav Suryavanshi out for 2-ball d...,2025-05-01T21:53:11.000000Z,Jane Doe,"[""#VaibhavSuryavanshi"", ""#IPL2025"", ""#CricketF..."
3,https://www.indiatoday.in/sports/cricket/story...,"IPL: Rohit Sharma completes 6,000 runs for MI,...",2025-05-01T21:13:15.000000Z,Jane Doe,"[""#RohitSharma"", ""#MumbaiIndians"", ""#IPLRecords""]"
4,https://www.indiatoday.in/sports/cricket/story...,IPL 2025: Don't praise Vaibhav Suryavanshi to ...,2025-05-01T20:36:03.000000Z,Jane Doe,"[""#VaibhavSuryavanshi"", ""#SunilGavaskar"", ""#IP..."


In [8]:
## Step 3: Inspect Required Columns

In [9]:
df.columns

Index(['pageURL', 'title', 'publishedDate', 'author', 'tags'], dtype='object')

In [10]:
# Rename column to match the expected format
df.rename(columns={'publishedDate': 'published_date'}, inplace=True)

# Clean tags into list format
df['tags'] = df['tags'].apply(lambda x: [tag.strip() for tag in str(x).split(',')])


In [31]:
##  Step 4: Initialize Pinecone

In [15]:
# uninstall the old pinecone-client
!pip uninstall -y pinecone-client

# install the new official package
!pip install -U pinecone


Found existing installation: pinecone-client 6.0.0
Uninstalling pinecone-client-6.0.0:
  Successfully uninstalled pinecone-client-6.0.0
Collecting pinecone
  Downloading pinecone-7.0.2-py3-none-any.whl.metadata (9.5 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.6.1-py3-none-any.whl.metadata (27 kB)
Downloading pinecone-7.0.2-py3-none-any.whl (516 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m516.3/516.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_assistant-1.6.1-py3-none-any.whl (239 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.1/239.1 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pinecone-plugin-assistant, pinecone
Successfully installed pinecone-7.0.2 pinecone-plugin-assistant-1.6.1


In [17]:
import os
from pinecone import Pinecone, ServerlessSpec

# 1. Initialize Pinecone client
pc = Pinecone(api_key="PINECONE_API_KEY")

# 2. Create index (only if it doesn't exist)
index_name = "bridged-assignment"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# 3. Connect to index
index = pc.Index(index_name)


In [32]:
## Step 5: Generate Embeddings + Upsert into Pinecone

In [19]:
import openai

openai.api_key = "OPENAI_API_KEY"


In [20]:
## Define Embedding Function

In [21]:
# Generate embedding for a given text using OpenAI
def get_embedding(text, model="text-embedding-ada-002"):
    response = openai.embeddings.create(
        input=[text],
        model=model
    )
    return response.data[0].embedding


In [22]:
## Clean Data and Prepare Metadata

In [23]:
import pandas as pd

# Use the existing DataFrame (already renamed column earlier)
df['tags'] = df['tags'].apply(lambda x: [tag.strip() for tag in str(x).split(',')])
df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')

# Check formatting
df[['title', 'author', 'published_date', 'tags']].head()


Unnamed: 0,title,author,published_date,tags
0,"IPL 2025: Ruthless MI top table, knock Rajasth...",Jane Doe,2025-05-01 23:12:07+00:00,"[['[""#IPL2025""', '""#MumbaiIndians""', '""#Rajast..."
1,Shubman Gill likely to play vs SRH despite bac...,Jane Doe,2025-05-01 22:30:24+00:00,"[['[""#ShubmanGill""', '""#GujaratTitans""', '""#IP..."
2,IPL 2025: Vaibhav Suryavanshi out for 2-ball d...,Jane Doe,2025-05-01 21:53:11+00:00,"[['[""#VaibhavSuryavanshi""', '""#IPL2025""', '""#C..."
3,"IPL: Rohit Sharma completes 6,000 runs for MI,...",Jane Doe,2025-05-01 21:13:15+00:00,"[['[""#RohitSharma""', '""#MumbaiIndians""', '""#IP..."
4,IPL 2025: Don't praise Vaibhav Suryavanshi to ...,Jane Doe,2025-05-01 20:36:03+00:00,"[['[""#VaibhavSuryavanshi""', '""#SunilGavaskar""'..."


In [27]:
print(df['tags'].iloc[0])


[]


In [28]:
import pandas as pd

# Reload from Google Drive
df = pd.read_csv('/content/drive/MyDrive/Bridged/sample_data.csv')

# Peek at the raw 'tags' value
print(df['tags'].iloc[0])


["#IPL2025", "#MumbaiIndians", "#RajasthanRoyals"]


In [29]:
import ast

# Safely convert stringified lists into actual Python lists
df['tags'] = df['tags'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])


In [30]:
df['tags'].head()


Unnamed: 0,tags
0,"[#IPL2025, #MumbaiIndians, #RajasthanRoyals]"
1,"[#ShubmanGill, #GujaratTitans, #IPLInjuries]"
2,"[#VaibhavSuryavanshi, #IPL2025, #CricketForm]"
3,"[#RohitSharma, #MumbaiIndians, #IPLRecords]"
4,"[#VaibhavSuryavanshi, #SunilGavaskar, #IPL2025]"


In [35]:
# Rename to match expected field
df.rename(columns={'publishedDate': 'published_date'}, inplace=True)


In [36]:
print(df.columns)


Index(['pageURL', 'title', 'published_date', 'author', 'tags'], dtype='object')


In [33]:
## Embed & Upload to Pinecone

In [69]:
from tqdm import tqdm

vectors = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    content = str(row['title'])  # Using title as input for embeddings
    try:
        embedding = get_embedding(content)
    except Exception as e:
        print(f"Embedding failed at row {i}: {e}")
        continue

    # Extract date
    date = pd.to_datetime(row['published_date'], errors='coerce')
    metadata = {
        "title": str(row['title']),
        "author": str(row['author']),
        "published_year": int(date.year) if pd.notnull(date) else None,
        "published_month": int(date.month) if pd.notnull(date) else None,
        "tags": row['tags']
    }

    vectors.append({
        "id": f"doc-{i}",
        "values": embedding,
        "metadata": metadata
    })

    # Upload in batches of 100
    if len(vectors) == 100 or i == len(df)-1:
        index.upsert(vectors=vectors)
        vectors = []


100%|██████████| 15/15 [00:07<00:00,  2.06it/s]


In [45]:
### PHASE 2: Build the Natural Language → Pinecone Filter Agent

In [39]:
## Step 1: Define Few-Shot Prompt Template

In [40]:
SYSTEM_PROMPT = """
You are a smart assistant that converts natural language search queries into Pinecone-compatible JSON filters.

Your task is to extract structured metadata from the user's query and return ONLY a JSON object.

Valid filter fields are:
- author: string
- published_year: integer
- published_month: integer
- tags: list of strings (exact match)

Examples:

User: Show me articles by Jane Doe from May 2025 about IPL
Response:
{
  "author": "Jane Doe",
  "published_year": { "$eq": 2025 },
  "published_month": { "$eq": 5 },
  "tags": { "$in": ["IPL"] }
}

User: Anything by John Smith on cricket injuries?
Response:
{
  "author": "John Smith",
  "tags": { "$in": ["cricket injuries"] }
}

Return ONLY the filter JSON, without any explanation.
"""


In [41]:
## Step 2: Create a Function to Generate Filters

In [42]:
def generate_filter_from_query(user_query):
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_query}
        ],
        temperature=0.0
    )
    json_str = response.choices[0].message.content.strip()

    try:
        return ast.literal_eval(json_str)
    except Exception as e:
        print("Error parsing JSON:", e)
        return {}


In [43]:
## Step 3: Try Some Test Queries

In [44]:
query1 = "Show me articles by Jane Doe from May 2025 about IPL"
query2 = "Find posts tagged with #MumbaiIndians in 2025"
query3 = "Anything by John Smith on cricket injuries?"

print(generate_filter_from_query(query1))
print(generate_filter_from_query(query2))
print(generate_filter_from_query(query3))


{'author': 'Jane Doe', 'published_year': {'$eq': 2025}, 'published_month': {'$eq': 5}, 'tags': {'$in': ['IPL']}}
{'published_year': {'$eq': 2025}, 'tags': {'$in': ['MumbaiIndians']}}
{'author': 'John Smith', 'tags': {'$in': ['cricket injuries']}}


In [47]:
### PHASE 3: Run Filtered Vector Search in Pinecone

In [48]:
## Step 1: Define Search Function

In [49]:
def search_articles(query_text, top_k=5):
    # 1. Get query embedding
    query_embedding = get_embedding(query_text)

    # 2. Generate metadata filter using GPT
    filter_dict = generate_filter_from_query(query_text)

    # 3. Query Pinecone with vector + metadata filter
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        filter=filter_dict,
        include_metadata=True
    )

    return results


In [50]:
## Step 2: Try a Query

In [51]:
query = "Show me articles by Jane Doe from May 2025 about IPL"
results = search_articles(query)

for match in results.matches:
    print(f"\nScore: {match.score}")
    print("Title:", match.metadata.get("title", "N/A"))
    print("Author:", match.metadata.get("author", "N/A"))
    print("Tags:", match.metadata.get("tags", []))


In [64]:
## Check top_k setting

In [52]:
results = search_articles(query, top_k=5)


In [65]:
## Print Filter and Confirm Matching

In [53]:
def search_articles(query_text, top_k=5):
    query_embedding = get_embedding(query_text)
    filter_dict = generate_filter_from_query(query_text)

    print("Generated filter:\n", filter_dict)

    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        filter=filter_dict,
        include_metadata=True
    )

    return results


In [66]:
## Test Without Filter (Vector-Only Search)

In [54]:
def search_without_filter(query_text, top_k=5):
    query_embedding = get_embedding(query_text)

    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )

    return results


In [70]:
query = "Show me articles about IPL"
results = search_without_filter(query)

for match in results.matches:
    print(f"\nScore: {match.score}")
    print("Title:", match.metadata.get("title", "N/A"))
    print("Author:", match.metadata.get("author", "N/A"))
    print("Tags:", match.metadata.get("tags", []))



Score: 0.827514768
Title: IPL 2025: Former England football manager Gareth Southgate watches RR vs MI
Author: Mary Poppins
Tags: ['#GarethSouthgate', '#RajasthanRoyals', '#IPL2025']

Score: 0.817077696
Title: IPL 2025: Ruthless MI top table, knock Rajasthan Royals out of playoffs race
Author: Jane Doe
Tags: ['#IPL2025', '#MumbaiIndians', '#RajasthanRoyals']

Score: 0.812927246
Title: IPL 2025: Vaibhav Suryavanshi out for 2-ball duck days after 35-ball hundred
Author: Jane Doe
Tags: ['#VaibhavSuryavanshi', '#IPL2025', '#CricketForm']

Score: 0.807495177
Title: Virender Sehwag reveals why Delhi didn't pick Virat Kohli in IPL 2008
Author: Harry Potter
Tags: ['#VirenderSehwag', '#ViratKohli', '#IPLHistory']

Score: 0.796264768
Title: IPL 2025: Don't praise Vaibhav Suryavanshi to the skies, says Sunil Gavaskar
Author: Jane Doe
Tags: ['#VaibhavSuryavanshi', '#SunilGavaskar', '#IPL2025']


In [67]:
## Test with Only Partial Filters

In [57]:
{
  "tags": { "$in": ["IPL"] }
}


{'tags': {'$in': ['IPL']}}

In [58]:
{
  "author": "Jane Doe"
}


{'author': 'Jane Doe'}