<a href="https://colab.research.google.com/github/worldofaryavart/colab_notebooks/blob/colabnotebook/query_optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab setup: install the third-party packages this notebook imports below.
# NOTE(review): the `wordnet` PyPI package looks unrelated to `nltk.corpus.wordnet`
# (whose data is fetched with nltk.download further down) — confirm it is needed.
!pip install transformers sentence-transformers spacy wordnet nltk

In [None]:
# Download the spaCy pipeline that QueryExpander loads with spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import spacy
import nltk
from nltk.corpus import wordnet
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
# Download the NLTK data packages this notebook requests, one after another.
for nltk_resource in ('wordnet', 'averaged_perceptron_tagger', 'punkt'):
  nltk.download(nltk_resource)

In [None]:
class QueryDataset(Dataset):
  """Dataset that tokenizes one query per item for intent classification.

  Args:
    texts: Indexable collection of queries; each item is passed through `str()`.
    labels: Indexable collection of integer class ids, aligned with `texts`.
    tokenizer: Callable invoked as `tokenizer(text, **options)`; its result
      must expose 'input_ids' and 'attention_mask' tensors.
    max_length: Length the tokenizer is asked to pad/truncate every query to.
  """

  def __init__(self, texts, labels, tokenizer, max_length=128):
    self.texts, self.labels = texts, labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Number of queries in the dataset."""
    return len(self.texts)

  def __getitem__(self, idx):
    """Return the tokenized query and label stored at position `idx`.

    Returns:
      Dict with 'input_ids' and 'attention_mask' (both flattened to 1-D) and
      'labels' (a scalar `torch.long` tensor).
    """
    query = str(self.texts[idx])
    target = self.labels[idx]

    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=self.max_length,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    encoded = self.tokenizer(query, **tokenizer_options)

    # Flatten each tensor (presumably shape (1, max_length) — TODO confirm) so
    # that a DataLoader can stack the items into a batch.
    sample = {
        field: encoded[field].flatten()
        for field in ('input_ids', 'attention_mask')
    }
    sample['labels'] = torch.tensor(target, dtype=torch.long)
    return sample

In [None]:
class IntentClassifier(nn.Module):
  """Pretrained encoder followed by dropout and a linear intent head.

  Args:
    n_classes: Number of intent classes (size of the output logits).
    pretrained_model: Checkpoint name or path handed to
      `AutoModel.from_pretrained`.
  """

  def __init__(self, n_classes, pretrained_model="bert-base-uncased"):
    super().__init__()
    encoder = AutoModel.from_pretrained(pretrained_model)
    self.bert = encoder
    self.drop = nn.Dropout(p=0.3)
    # The head maps the encoder's hidden size to one logit per class.
    self.fc = nn.Linear(encoder.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return unnormalised class logits for a batch of tokenized queries."""
    encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Element 1 of the encoder output feeds the head; presumably the pooled
    # sentence representation for BERT-style checkpoints — TODO confirm before
    # switching `pretrained_model` to another architecture.
    pooled = encoded[1]
    return self.fc(self.drop(pooled))

In [None]:
class QueryExpander:
  """Expand a query with WordNet synonyms of its nouns, verbs and adjectives.

  spaCy supplies the part-of-speech tags and WordNet (through NLTK) supplies
  the synonyms. All results keep a stable, first-seen order so the same query
  always produces the same expansion.
  """

  # Part-of-speech tags whose tokens get expanded.
  EXPANDABLE_POS = frozenset({'NOUN', 'VERB', 'ADJ'})

  def __init__(self):
    self.nlp = spacy.load('en_core_web_sm')

  def get_synonyms(self, word):
    """Return the distinct lemma names of every WordNet synset of `word`.

    Args:
      word: Word to look up.

    Returns:
      List of lemma names in the order WordNet yields them, without
      duplicates. It may contain `word` itself and is empty when WordNet has
      no entry for `word`.
    """
    # A dict keeps insertion order; a set would make the order (and therefore
    # any slice taken by callers) vary between runs.
    ordered = {}
    for synset in wordnet.synsets(word):
      for lemma in synset.lemmas():
        ordered.setdefault(lemma.name(), None)
    return list(ordered)

  def expand_query(self, query, max_synonyms_per_term=2):
    """Collect synonyms for the content words of `query`.

    Args:
      query: Raw query text.
      max_synonyms_per_term: Upper bound on synonyms taken per token.

    Returns:
      List of distinct lower-cased expansion terms, ordered by the token they
      came from. A token is never returned as its own expansion.
    """
    expanded = {}
    for token in self.nlp(query):
      if token.pos_ not in self.EXPANDABLE_POS:
        continue

      original = token.text.lower()
      candidates = {}
      for synonym in self.get_synonyms(token.text):
        term = synonym.lower()
        # Repeating the query word itself adds nothing to the expansion.
        if term != original:
          candidates.setdefault(term, None)

      for term in list(candidates)[:max_synonyms_per_term]:
        expanded.setdefault(term, None)

    return list(expanded)


In [None]:
def create_sample_dataset():
  """Create a sample dataset for intent classification.

  Every example is written as a (query, intent) pair so that a query cannot
  drift out of alignment with its label when examples are added or removed.

  Returns:
    pandas.DataFrame with one row per example and two columns: 'query' (the
    user request) and 'intent' (its class label).
  """
  examples = [
      ("Find research papers about quantum computing", "research_retrieval"),
      ("Download PDF papers on machine learning", "pdf_download"),
      ("Summarize recent articles about AI", "summarization"),
      ("Show me videos explaining neural networks", "video_search"),
      ("Get images of black holes", "image_search"),
      # Add further (query, intent) pairs here.
  ]

  return pd.DataFrame(examples, columns=['query', 'intent'])


In [None]:
def train_intent_classifier(model, train_loader, device, epochs=3, lr=2e-5):
  """Fine-tune `model` on `train_loader` with Adam and cross-entropy loss.

  Args:
    model: Module called as `model(input_ids, attention_mask)` that returns
      class logits. It is updated in place.
    train_loader: Iterable of batches; each batch is a mapping holding
      'input_ids', 'attention_mask' and 'labels' tensors.
    device: Device the batch tensors are moved to before the forward pass.
    epochs: Number of passes over `train_loader`.
    lr: Learning rate for the Adam optimizer.

  Returns:
    List with the average training loss of each epoch, in epoch order.

  Raises:
    ValueError: If `train_loader` yields no batches.
  """
  optimizer = optim.Adam(model.parameters(), lr=lr)
  criterion = nn.CrossEntropyLoss()
  epoch_losses = []

  for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    # Count batches while iterating so loaders without __len__ also work.
    num_batches = 0

    for batch in train_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      optimizer.zero_grad()
      outputs = model(input_ids, attention_mask)
      loss = criterion(outputs, labels)

      loss.backward()
      optimizer.step()

      total_loss += loss.item()
      num_batches += 1

    if num_batches == 0:
      # Fail with a clear message instead of a ZeroDivisionError below.
      raise ValueError("train_loader yielded no batches; nothing to train on")

    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch + 1}, Average Loss: {avg_loss:.4f}")
    epoch_losses.append(avg_loss)

  return epoch_losses

In [None]:
def main():
  """Train the sample intent classifier and demo it on one test query.

  Builds the sample data, label-encodes the intents, fine-tunes
  `IntentClassifier` on the training split, then prints the synonym expansion
  and the predicted intent for a fixed query.
  """
  # Seed torch so weight init, dropout and batch shuffling start from a fixed
  # state, matching the fixed random_state already used for the split.
  torch.manual_seed(42)
  # Single source of truth for the tokenized length at train and inference time.
  max_length = 128

  df = create_sample_dataset()
  label_encoder = LabelEncoder()
  df['encoded_intent'] = label_encoder.fit_transform(df['intent'])

  # The held-out part of the split is not used by this demo.
  train_df, _ = train_test_split(df, test_size=0.2, random_state=42)

  tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
  n_classes = len(label_encoder.classes_)
  model = IntentClassifier(n_classes)

  train_dataset = QueryDataset(
      texts=train_df['query'].values,
      labels=train_df['encoded_intent'].values,
      tokenizer=tokenizer,
      max_length=max_length
  )

  train_loader = DataLoader(
      train_dataset,
      batch_size=8,
      shuffle=True
  )

  # Built before training so a failure to load the spaCy pipeline surfaces
  # before any training time is spent.
  query_expander = QueryExpander()

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(device)

  train_intent_classifier(model, train_loader, device)

  test_query = "Find recent papers about deep learning"

  expanded_terms = query_expander.expand_query(test_query)
  print(f"Expanded terms: {expanded_terms}")

  model.eval()
  with torch.no_grad():
    encoding = tokenizer(
        test_query,
        add_special_tokens=True,
        max_length=max_length,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    logits = model(
        input_ids=encoding['input_ids'].to(device),
        attention_mask=encoding['attention_mask'].to(device)
    )
    # Take the argmax along the class dimension of the single-query batch.
    predicted_index = logits.argmax(dim=1).item()
    predicted_intent = label_encoder.inverse_transform([predicted_index])[0]
    print(f"Predicted intent: {predicted_intent}")

# Entry point: run the full demo when this code is executed as the main module.
if __name__ == "__main__":
  main()