<a href="https://colab.research.google.com/github/worldofaryavart/colab_notebooks/blob/colabnotebook/making_scraperModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers sentence-transformers spacy wordnet nltk

Collecting wordnet
  Downloading wordnet-0.0.1b2.tar.gz (8.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting colorama==0.3.9 (from wordnet)
  Downloading colorama-0.3.9-py2.py3-none-any.whl.metadata (13 kB)
Downloading colorama-0.3.9-py2.py3-none-any.whl (20 kB)
Building wheels for collected packages: wordnet
  Building wheel for wordnet (setup.py) ... [?25l[?25hdone
  Created wheel for wordnet: filename=wordnet-0.0.1b2-py3-none-any.whl size=10498 sha256=61dba133ab7213348a9dce9fd6562c976e6a729a67575187fcfd8375463cc8f2
  Stored in directory: /root/.cache/pip/wheels/c0/a1/e8/4649c8712033dcdbd1e64a0fc75216a5d1769665852c36b4f9
Successfully built wordnet
Installing collected packages: colorama, wordnet
Successfully installed colorama-0.3.9 wordnet-0.0.1b2


In [2]:
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import spacy
import nltk
from nltk.corpus import wordnet
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [3]:
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
class QueryDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_length=128):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    label = self.labels[idx]

    encoding = self.tokenizer(
        text,
        add_special_tokens=True,
        max_length=self.max_length,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels': torch.tensor(label, dtype=torch.long)
    }

In [6]:
class IntentClassifier(nn.Module):
  def __init__(self, n_classes, pretrained_model="bert-base-uncased"):
    super().__init__()
    self.bert = AutoModel.from_pretrained(pretrained_model)
    self.drop = nn.Dropout(p=0.3)
    self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    outputs = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask
    )
    output = self.drop(outputs[1])
    return self.fc(output)

In [7]:
class QueryExpander:
  def __init__(self):
    self.nlp = spacy.load('en_core_web_sm')

  def get_synonyms(self, word):
    synonyms = set()
    for syn in wordnet.synsets(word):
      for lemma in syn.lemmas():
        synonyms.add(lemma.name())
    return list(synonyms)

  def expand_query(self, query):
    doc = self.nlp(query)
    expanded_terms = []

    for token in doc:
      if token.pos_ in ['NOUN', 'VERB', 'ADJ']:
        synonyms = self.get_synonyms(token.text)
        expanded_terms.extend(synonyms[:2])

    return list(set([term.lower() for term in expanded_terms]))


In [8]:
def create_sample_dataset():
  """Create a sample dataset for intent classification"""
  queries = [
      "Find reasearch papers about quantum computing",
      "Download PDF papers on machine learning",
      "Summarize recent articles about AI",
      "Show me videos explaining neural networks",
      "Get images of black holes",
  ]

  intents = [
        "research_retrieval",
        "pdf_download",
        "summarization",
        "video_search",
        "image_search",
        # Add corresponding intents...
  ]

  return pd.DataFrame({'query': queries, 'intent': intents})


In [9]:
def train_intent_classifier(model, train_loader, device, epochs=3):
  optimizer = optim.Adam(model.parameters(), lr=2e-5)
  criterion = nn.CrossEntropyLoss()

  for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      optimizer.zero_grad()
      outputs = model(input_ids, attention_mask)
      loss = criterion(outputs, labels)

      loss.backward()
      optimizer.step()

      total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Average Loss: {avg_loss:.4f}")

In [16]:
def main():
  df = create_sample_dataset()
  label_encoder = LabelEncoder()
  df['encoded_intent'] = label_encoder.fit_transform(df['intent'])

  train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

  tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
  n_classes = len(label_encoder.classes_)
  model = IntentClassifier(n_classes)
  # print(model)

  train_dataset = QueryDataset(
        texts=train_df['query'].values,
        labels=train_df['encoded_intent'].values,
        tokenizer=tokenizer
    )

#   train_loader = DataLoader(
#       train_dataset,
#       batch_size=8,
#       shuffle=True
#   )

#   query_expander = QueryExpander()

#   device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#   model.to(device)

#   train_intent_classifier(model, train_loader, device)

#   test_query = "Find recent papers about deep learning"

#   expanded_terms = query_expander.expand_query(test_query)
#   print(f"Expanded terms: {expanded_terms}")

#   model.eval()
#   with torch.no_grad():
#     encoding = tokenizer(
#         test_query,
#         add_special_tokens=True,
#         max_length=128,
#         return_token_type_ids=False,
#         padding='max_length',
#         truncation=True,
#         return_attention_mask=True,
#         return_tensors='pt'
#     )

#     outputs = model(
#         input_ids=encoding['input_ids'].to(device),
#         attention_mask=encoding['attention_mask'].to(device)
#     )
#     predicted_intent = label_encoder.inverse_transform([outputs.argmax().item()])[0]
#     print(f"Predicted intent: {predicted_intent}")

if __name__ == "__main__":
  main()

