<a href="https://colab.research.google.com/github/worldofaryavart/colab_notebooks/blob/colabnotebook/making_scraperModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers sentence-transformers spacy wordnet nltk

Collecting wordnet
  Downloading wordnet-0.0.1b2.tar.gz (8.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting colorama==0.3.9 (from wordnet)
  Downloading colorama-0.3.9-py2.py3-none-any.whl.metadata (13 kB)
Downloading colorama-0.3.9-py2.py3-none-any.whl (20 kB)
Building wheels for collected packages: wordnet
  Building wheel for wordnet (setup.py) ... [?25l[?25hdone
  Created wheel for wordnet: filename=wordnet-0.0.1b2-py3-none-any.whl size=10498 sha256=6133a2650f933e092ebb79d93a043d002c3f2a292926d7224f6f0d5e845c5719
  Stored in directory: /root/.cache/pip/wheels/c0/a1/e8/4649c8712033dcdbd1e64a0fc75216a5d1769665852c36b4f9
Successfully built wordnet
Installing collected packages: colorama, wordnet
Successfully installed colorama-0.3.9 wordnet-0.0.1b2


In [2]:
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import spacy
import nltk
from nltk.corpus import wordnet
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [3]:
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m97.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
class QueryDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_length=128):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    label = self.labels[idx]

    encoding = self.tokenizer(
        text,
        add_special_tokens=True,
        max_length=self.max_length,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels': torch.tensor(label, dtype=torch.long)
    }

In [6]:
class IntentClassifier(nn.Module):
  def __init__(self, n_classes, pretrained_model="bert-base-uncased"):
    super().__init__()
    self.bert = AutoModel.from_pretrained(pretrained_model)
    self.drop = nn.Dropout(p=0.3)
    self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    outputs = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask
    )
    output = self.drop(outputs[1])
    return self.fc(output)

In [7]:
class QueryExpander:
  def __init__(self):
    self.nlp = spacy.load('en_core_web_sm')

  def get_synonyms(self, word):
    synonyms = set()
    for syn in wordnet.synsets(word):
      for lemma in syn.lemmas():
        synonyms.add(lemma.name())
    return list(synonyms)

  def expand_query(self, query):
    doc = self.nlp(query)
    expanded_terms = []

    for token in doc:
      if token.pos_ in ['NOUN', 'VERB', 'ADJ']:
        synonyms = self.get_synonyms(token.text)
        expanded_terms.extend(synonyms[:2])

    return list(set([term.lower() for term in expanded_terms]))


In [8]:
def create_sample_dataset():
  """Create a sample dataset for intent classification"""
  queries = [
      "Find reasearch papers about quantum computing",
      "Download PDF papers on machine learning",
      "Summarize recent articles about AI",
      "Show me videos explaining neural networks",
      "Get images of black holes",
  ]

  intents = [
        "research_retrieval",
        "pdf_download",
        "summarization",
        "video_search",
        "image_search",
        # Add corresponding intents...
  ]

  return pd.DataFrame({'query': queries, 'intent': intents})


In [9]:
def train_intent_classifier(model, train_loader, device, epochs=3):
  optimizer = optim.Adam(model.parameters(), lr=2e-5)
  criterion = nn.CrossEntropyLoss()

  for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      optimizer.zero_grad()
      outputs = model(input_ids, attention_mask)
      loss = criterion(outputs, labels)

      loss.backward()
      optimizer.step()

      total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Average Loss: {avg_loss:.4f}")

In [12]:
def main():
  df = create_sample_dataset()
  label_encoder = LabelEncoder()
  df['encoded_intent'] = label_encoder.fit_transform(df['intent'])

  train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

  tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
  n_classes = len(label_encoder.classes_)
  model = IntentClassifier(n_classes)

  train_dataset = QueryDataset(
        texts=train_df['query'].values,
        labels=train_df['encoded_intent'].values,
        tokenizer=tokenizer
    )

  train_loader = DataLoader(
      train_dataset,
      batch_size=8,
      shuffle=True
  )

  query_expander = QueryExpander()

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(device)

  train_intent_classifier(model, train_loader, device)

  test_query = "Find recent papers about deep learning"

  expanded_terms = query_expander.expand_query(test_query)
  print(f"Expanded terms: {expanded_terms}")

  model.eval()
  with torch.no_grad():
    encoding = tokenizer(
        test_query,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    outputs = model(
        input_ids=encoding['input_ids'].to(device),
        attention_mask=encoding['attention_mask'].to(device)
    )
    predicted_intent = label_encoder.inverse_transform([outputs.argmax().item()])[0]
    print(f"Predicted intent: {predicted_intent}")

if __name__ == "__main__":
  main()



Epoch 1, Average Loss: 1.6312
Epoch 2, Average Loss: 1.4880
Epoch 3, Average Loss: 1.5727
Expanded terms: ['discovery', 'document', 'oceanic_abyss', 'report', 'abstruse', 'scholarship', 'breakthrough', 'larn', 'holocene', 'holocene_epoch']
Predicted intent: image_search


In [15]:
!pip install playwright beautifulsoup4 pytesseract pillow PyMuPDF youtube_dl transformers


Collecting playwright
  Downloading playwright-1.48.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting PyMuPDF
  Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting youtube_dl
  Downloading youtube_dl-2021.12.17-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting pyee==12.0.0 (from playwright)
  Downloading pyee-12.0.0-py3-none-any.whl.metadata (2.8 kB)
Downloading playwright-1.48.0-py3-none-manylinux1_x86_64.whl (38.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.2/38.2 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyee-12.0.0-py3-none-any.whl (14 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m63.

In [17]:
import asyncio
from playwright.async_api import async_playwright
import aiohttp
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pytesseract
from PIL import Image
import io
import fitz
import youtube_dl
import re
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from transformers import pipeline
import threading
from queue import Queue
import hashlib
import time


In [None]:
class SuperPoweredCrawler:
  def __init__(self):
    self.visited_urls = set()
    self.url_queue = Queue()
    self.results = []
    self.content_lock = threading.Lock()

    self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

    self.max_depth = 3
    self.max_pages_per_domain = 1000
    self.respect_robots = False
    self.concurrent_requests = 50

    self.content_handlers = {
        'text/plain': self.process_html,
        'application/pdf': self.process_pdf,
        'image': self.process_image,
        'video': self.process_video
    }

  async def start_crawl(self, seed_urls, search_query):
    """Start the crawling process with mutliple seed URLs"""

    print(f"Starting crawl for query: {search_query}")

    for url in seed_urls:
      self.url_queue.put((url, 0))

    async with async_playwright() as playwright:
      browser = await playwright.chromium.launch(headless=True)

      contexts = [await browser.new_context() for _ in range(self.concurrent_requests)]

      tasks = []
      for context in contexts:
        task = asyncio.create_task(self.crawler_worker(context, search_query))
        tasks.append(task)

      await asyncio.gather(*tasks)
      await browser.close()

    return self.results

  async def crawler_worker(self, context, search_query):
    """Worker process for crawling pages"""
    while not self.url_queue.empty():
      try:
        url, depth = self.url_queue.get_nowait()
      except:
        continue

      if depth > self.max_depth or url in self.visited_urls:
        continue

      try:
        page = await context.new_page()
        await page.goto(url, wait_until='networkidle', timeout=40000)

        # Get page content and metadata
        content = await page.content()
        title = await page.title()

        #Determine content type
        content_type = await self.detect_content_type(page)

        #Process content based ont type
        if content_type in self.content_handlers:
          result = await self.content_handlers[content_type](page, content, url)
          if results:
            self.add_result(result, search_query)

        new_urls = await self.extract_urls(page)
        for new_url in new_urls:
          if self.should_crawl(new_url):
            self.url_queue.put((new_url, depth + 1))

        await page.close()

      except Exception as e:
        print(f"Error processing URL {url}: {e}")
        continue

      self.visited_urls.add(url)

  async def detect_content_type(self, page):
    """Detect the type of content on the page"""

    #Check response headers
    response = await page.main_frame.request.response()
    if response:
      content_type = response.headers.get('content-type', '')

      if 'pdf' in content_type:
        return 'application/pdf'
      elif any(img_type in content_type for image_type in ['image/jpeg', 'image/png', 'image/gif']):
        return 'image'
      elif 'video' in content_type:
        return 'video'

    return 'text/html'

  async def process_html(self, page, content, url):
    """Process HTML content and extract relevant information"""

    soup = BeautifulSoup(content, 'html.parser')

    text_content = ' '.join([p.get_text() for p in soup.find_all('p')])

    meta_tags = soup.find_all('meta')
    metadata = {
        tag.get('name', tag.get('property', '')): tag.get('content', '')
        for tag in meta_tags
    }

    main_content = await self.extract_main_content(page)

    summary = None
    if len(main_content) > 500:
      summary = self.summarization(main_content, max_length = 150, min_length=50)[0]['summary_text']

    return {
        'url': url,
        'type': 'html',
        'title': await page.title(),
        'content': main_content,
        'summary': summary,
        'metadata': metadata,
        'timestamp': time.time()
    }

  async def process_pdf(self, page, content, url):
    """Extract and process PDF content"""

    try:
      response = await page.main_frame.request.response()
      pdf_content = await response.body()

      #Process PDF using PyMuPDF
      pdf_document = fitz.open(stream=pdf_content, filetype='pdf')
      text_content = ""
      images = []

      for page_num in range(pdf_document.page_count):
        page = pdf_document[page_num]
        text_content += page.get_text()

        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
          xref = img[0]
          base_image = pdf_document.extract_image(xref)
          image_bytes = base_image["image"]
          images.append(image_bytes)

      return {
          'url': url,
          'type': 'html',
          'title': await page.title(),
          'content': text_content,
          'images': images,
          'timestamp': time.time()
      }

    except Exception as e:
      print(f"Error processing PDF {url}: {e}")
      return None

  async def process_image(self, page, content, url):
    """Process and analyze image content"""

    try:
      #Get image data
      response = await page.main_frame.request.response()
      image_data = await response.body()

      #Convert to PIL Image
      image = Image.open(io.BytesIO(image_data))

      ocr_text = pytesseract.image_to_string(image)

      #Basic image analysis
      width, height = image.size
      format = image.format
      mode = image.mode

      return {
          'url': url,
          'type': 'image',
          'ocr_text': ocr_text,
          'metadata': {
              'width': width,
              'height': height,
              'format': format,
              'mode': mode
          },
          'timestamp': time.time()
      }

    except Exception as e:
      print(f"Error processing image {url}: {str(e)}")
      return None

  async def process_video(self, page, content, url):
    """Extract information from video content"""

    try:
      ydl_opts = {
          'format': 'best',
          'extract_flat': True,
          'force_generic_extractor': True
      }

      with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        video_info = ydl.extract_info(url, download=False)

      return {
          'url': url,
          'type': 'video',
          'title': video_info.get('title'),
          'description': video_info.get('description'),
          'duration': video_info.get('duration'),
          'timestamp': time.time()
      }

    except Exception as e:
      print(f"Error processing video {url}: {str(e)}")
      return None

  def add_result(self, result, search_query):
    """Add processed result to the results list with relevance scoring"""
