<a href="https://colab.research.google.com/github/worldofaryavart/colab_notebooks/blob/colabnotebook/making_scraperModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers sentence-transformers spacy wordnet nltk

Collecting wordnet
  Downloading wordnet-0.0.1b2.tar.gz (8.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting colorama==0.3.9 (from wordnet)
  Downloading colorama-0.3.9-py2.py3-none-any.whl.metadata (13 kB)
Downloading colorama-0.3.9-py2.py3-none-any.whl (20 kB)
Building wheels for collected packages: wordnet
  Building wheel for wordnet (setup.py) ... [?25l[?25hdone
  Created wheel for wordnet: filename=wordnet-0.0.1b2-py3-none-any.whl size=10498 sha256=510e1df90ebc6d15225240d1ae4b42dfac0f44e896a482be9e1e1c8c81047f49
  Stored in directory: /root/.cache/pip/wheels/c0/a1/e8/4649c8712033dcdbd1e64a0fc75216a5d1769665852c36b4f9
Successfully built wordnet
Installing collected packages: colorama, wordnet
Successfully installed colorama-0.3.9 wordnet-0.0.1b2


In [2]:
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import spacy
import nltk
from nltk.corpus import wordnet
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [3]:
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
class QueryDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_length=128):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    label = self.labels[idx]

    encoding = self.tokenizer(
        text,
        add_special_tokens=True,
        max_length=self.max_length,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels': torch.tensor(label, dtype=torch.long)
    }

In [6]:
class IntentClassifier(nn.Module):
  def __init__(self, n_classes, pretrained_model="bert-base-uncased"):
    super().__init__()
    self.bert = AutoModel.from_pretrained(pretrained_model)
    self.drop = nn.Dropout(p=0.3)
    self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    outputs = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask
    )
    output = self.drop(outputs[1])
    return self.fc(output)

In [7]:
class QueryExpander:
  def __init__(self):
    self.nlp = spacy.load('en_core_web_sm')

  def get_synonyms(self, word):
    synonyms = set()
    for syn in wordnet.synsets(word):
      for lemma in syn.lemmas():
        synonyms.add(lemma.name())
    return list(synonyms)

  def expand_query(self, query):
    doc = self.nlp(query)
    expanded_terms = []

    for token in doc:
      if token.pos_ in ['NOUN', 'VERB', 'ADJ']:
        synonyms = self.get_synonyms(token.text)
        expanded_terms.extend(synonyms[:2])

    return list(set([term.lower() for term in expanded_terms]))


In [8]:
def create_sample_dataset():
  """Create a sample dataset for intent classification"""
  queries = [
      "Find reasearch papers about quantum computing",
      "Download PDF papers on machine learning",
      "Summarize recent articles about AI",
      "Show me videos explaining neural networks",
      "Get images of black holes",
  ]

  intents = [
        "research_retrieval",
        "pdf_download",
        "summarization",
        "video_search",
        "image_search",
        # Add corresponding intents...
  ]

  return pd.DataFrame({'query': queries, 'intent': intents})


In [9]:
def train_intent_classifier(model, train_loader, device, epochs=3):
  optimizer = optim.Adam(model.parameters(), lr=2e-5)
  criterion = nn.CrossEntropyLoss()

  for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      optimizer.zero_grad()
      outputs = model(input_ids, attention_mask)
      loss = criterion(outputs, labels)

      loss.backward()
      optimizer.step()

      total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Average Loss: {avg_loss:.4f}")

In [10]:
def main():
  df = create_sample_dataset()
  label_encoder = LabelEncoder()
  df['encoded_intent'] = label_encoder.fit_transform(df['intent'])

  train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

  tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
  n_classes = len(label_encoder.classes_)
  model = IntentClassifier(n_classes)

  train_dataset = QueryDataset(
        texts=train_df['query'].values,
        labels=train_df['encoded_intent'].values,
        tokenizer=tokenizer
    )

  train_loader = DataLoader(
      train_dataset,
      batch_size=8,
      shuffle=True
  )

  query_expander = QueryExpander()

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(device)

  train_intent_classifier(model, train_loader, device)

  test_query = "Find recent papers about deep learning"

  expanded_terms = query_expander.expand_query(test_query)
  print(f"Expanded terms: {expanded_terms}")

  model.eval()
  with torch.no_grad():
    encoding = tokenizer(
        test_query,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    outputs = model(
        input_ids=encoding['input_ids'].to(device),
        attention_mask=encoding['attention_mask'].to(device)
    )
    predicted_intent = label_encoder.inverse_transform([outputs.argmax().item()])[0]
    print(f"Predicted intent: {predicted_intent}")

if __name__ == "__main__":
  main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1, Average Loss: 1.5012
Epoch 2, Average Loss: 1.7414
Epoch 3, Average Loss: 1.4054
Expanded terms: ['oceanic_abyss', 'recover', 'regain', 'thick', 'memorize', 'get_word', 'report', 'wallpaper', 'recent_epoch', 'holocene_epoch']
Predicted intent: image_search


In [2]:
!pip install playwright beautifulsoup4 pytesseract pillow PyMuPDF youtube_dl transformers


Collecting playwright
  Downloading playwright-1.48.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting PyMuPDF
  Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting youtube_dl
  Downloading youtube_dl-2021.12.17-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting pyee==12.0.0 (from playwright)
  Downloading pyee-12.0.0-py3-none-any.whl.metadata (2.8 kB)
Downloading playwright-1.48.0-py3-none-manylinux1_x86_64.whl (38.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.2/38.2 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyee-12.0.0-py3-none-any.whl (14 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m55.

In [4]:
!playwright install chromium

Downloading Chromium 130.0.6723.31 (playwright build v1140)[2m from https://playwright.azureedge.net/builds/chromium/1140/chromium-linux.zip[22m
[1G164.5 MiB [] 0% 0.0s[0K[1G164.5 MiB [] 0% 81.6s[0K[1G164.5 MiB [] 0% 52.8s[0K[1G164.5 MiB [] 0% 22.8s[0K[1G164.5 MiB [] 0% 19.1s[0K[1G164.5 MiB [] 0% 18.0s[0K[1G164.5 MiB [] 0% 16.3s[0K[1G164.5 MiB [] 0% 13.6s[0K[1G164.5 MiB [] 1% 12.5s[0K[1G164.5 MiB [] 1% 13.0s[0K[1G164.5 MiB [] 1% 12.7s[0K[1G164.5 MiB [] 1% 11.6s[0K[1G164.5 MiB [] 1% 11.1s[0K[1G164.5 MiB [] 2% 11.0s[0K[1G164.5 MiB [] 2% 10.9s[0K[1G164.5 MiB [] 2% 10.8s[0K[1G164.5 MiB [] 2% 11.1s[0K[1G164.5 MiB [] 2% 10.8s[0K[1G164.5 MiB [] 2% 10.9s[0K[1G164.5 MiB [] 3% 10.5s[0K[1G164.5 MiB [] 3% 10.2s[0K[1G164.5 MiB [] 3% 10.5s[0K[1G164.5 MiB [] 4% 10.4s[0K[1G164.5 MiB [] 4% 10.3s[0K[1G164.5 MiB [] 4% 10.7s[0K[1G164.5 MiB [] 4% 10.9s[0K[1G164.5 MiB [] 4% 10.5s[0K[1G164.5 MiB [] 5% 10.1s[0K[1G164.5 MiB [] 5% 9.7s[0K[1G164.5 MiB 

In [5]:
!apt-get install -y poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.5 [186 kB]
Fetched 186 kB in 0s (377 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 123623 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.5_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.5) ...
Setting up poppler-utils (22.02.0-2ubuntu0.5) ...
Processing triggers for man-db (2.10.2-1) ...


In [6]:
!apt-get install -y tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (5,038 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 123653 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-

In [12]:
import asyncio
from playwright.async_api import async_playwright
import aiohttp
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pytesseract
from PIL import Image
import io
import fitz  # PyMuPDF
import re
from transformers import pipeline
import threading
from queue import Queue
import time

In [13]:
!pip install nest_asyncio



In [14]:
import nest_asyncio

In [16]:
nest_asyncio.apply()

In [None]:
class SuperPoweredCrawler:
    def __init__(self):
        self.visited_urls = set()
        self.url_queue = Queue()
        self.results = []
        self.content_lock = threading.Lock()

        # Initialize content extractors
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device="cpu")

        # Configurable crawler settings
        self.max_depth = 3
        self.max_pages_per_domain = 1000
        self.concurrent_requests = 5  # Reduced for stability

    async def start_crawl(self, seed_urls, search_query):
        """
        Start the crawling process with multiple seed URLs
        """
        print(f"Starting crawl for query: {search_query}")

        for url in seed_urls:
            self.url_queue.put((url, 0))  # (url, depth)

        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch(headless=True)
            context = await browser.new_context(
                viewport={'width': 1280, 'height': 800},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            )

            # Start crawler workers
            tasks = []
            for _ in range(self.concurrent_requests):
                task = asyncio.create_task(self.crawler_worker(context, search_query))
                tasks.append(task)

            await asyncio.gather(*tasks)
            await browser.close()

        return self.results

    async def crawler_worker(self, context, search_query):
        """
        Worker process for crawling pages
        """
        while not self.url_queue.empty():
            try:
                url, depth = self.url_queue.get_nowait()
            except:
                break

            if depth > self.max_depth or url in self.visited_urls:
                continue

            try:
                print(f"Crawling: {url}")
                page = await context.new_page()

                # Set timeout and handle navigation
                response = await page.goto(
                    url,
                    wait_until='domcontentloaded',
                    timeout=30000
                )

                if not response:
                    print(f"No response from {url}")
                    await page.close()
                    continue

                # Check if the page was successfully loaded
                if response.status >= 400:
                    print(f"Error status {response.status} for {url}")
                    await page.close()
                    continue

                # Get page content
                content = await page.content()

                # Process the page content
                result = await self.process_page(page, content, url)
                if result:
                    self.add_result(result, search_query)

                # Extract and queue new URLs
                new_urls = await self.extract_urls(page)
                for new_url in new_urls:
                    if self.should_crawl(new_url):
                        self.url_queue.put((new_url, depth + 1))

                self.visited_urls.add(url)
                await page.close()

                # Add a small delay between requests
                await asyncio.sleep(1)

            except Exception as e:
                print(f"Error crawling {url}: {str(e)}")
                if 'page' in locals():
                    await page.close()
                continue

    async def process_page(self, page, content, url):
        """
        Process page content based on content type
        """
        try:
            # Get content type from response headers
            response = await page.request.response()
            content_type = response.headers.get('content-type', '').lower()

            if 'pdf' in content_type:
                return await self.process_pdf_content(response, url)
            elif any(img_type in content_type for img_type in ['image/jpeg', 'image/png', 'image/gif']):
                return await self.process_image_content(response, url)
            else:
                return await self.process_html_content(page, content, url)

        except Exception as e:
            print(f"Error processing page {url}: {str(e)}")
            return None

    async def process_html_content(self, page, content, url):
        """
        Process HTML content
        """
        try:
            soup = BeautifulSoup(content, 'html.parser')

            # Extract text content
            text_content = ' '.join([p.get_text() for p in soup.find_all(['p', 'article', 'section'])])
            text_content = re.sub(r'\s+', ' ', text_content).strip()

            # Get title
            title = await page.title()

            # Generate summary if content is long enough
            summary = None
            if len(text_content) > 500:
                try:
                    summary = self.summarizer(text_content[:4096], max_length=150, min_length=50)[0]['summary_text']
                except Exception as e:
                    print(f"Error generating summary: {str(e)}")

            return {
                'url': url,
                'type': 'html',
                'title': title,
                'content': text_content[:5000],  # Limit content length
                'summary': summary,
                'timestamp': time.time()
            }

        except Exception as e:
            print(f"Error processing HTML content for {url}: {str(e)}")
            return None

    async def process_pdf_content(self, response, url):
        """
        Process PDF content
        """
        try:
            pdf_data = await response.body()
            pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
            text_content = ""

            for page_num in range(min(pdf_document.page_count, 10)):  # Limit to first 10 pages
                page = pdf_document[page_num]
                text_content += page.get_text()

            return {
                'url': url,
                'type': 'pdf',
                'content': text_content[:5000],  # Limit content length
                'page_count': pdf_document.page_count,
                'timestamp': time.time()
            }

        except Exception as e:
            print(f"Error processing PDF {url}: {str(e)}")
            return None

    async def process_image_content(self, response, url):
        """
        Process image content
        """
        try:
            image_data = await response.body()
            image = Image.open(io.BytesIO(image_data))

            # Extract text using OCR
            try:
                ocr_text = pytesseract.image_to_string(image)
            except:
                ocr_text = ""

            return {
                'url': url,
                'type': 'image',
                'ocr_text': ocr_text,
                'metadata': {
                    'width': image.size[0],
                    'height': image.size[1],
                    'format': image.format
                },
                'timestamp': time.time()
            }

        except Exception as e:
            print(f"Error processing image {url}: {str(e)}")
            return None

    async def extract_urls(self, page):
        """
        Extract URLs from the page
        """
        try:
            # Get all links using JavaScript evaluation
            links = await page.evaluate('''() => {
                const links = Array.from(document.getElementsByTagName('a'));
                return links.map(link => link.href).filter(href => href);
            }''')

            return list(set(links))  # Remove duplicates

        except Exception as e:
            print(f"Error extracting URLs: {str(e)}")
            return []

    def should_crawl(self, url):
        """
        Determine if a URL should be crawled
        """
        try:
            parsed = urlparse(url)

            # Basic URL filtering
            if not parsed.scheme in ['http', 'https']:
                return False

            # Check domain crawl limit
            domain = parsed.netloc
            domain_count = sum(1 for visited in self.visited_urls
                             if urlparse(visited).netloc == domain)

            if domain_count >= self.max_pages_per_domain:
                return False

            # Filter out non-content URLs
            exclude_patterns = [
                r'\.(css|js|json|xml)$',
                r'(login|signup|logout)',
                r'(facebook|twitter|instagram)',
                r'\.(jpg|jpeg|png|gif)$'  # Skip direct image links
            ]

            return not any(re.search(pattern, url, re.I)
                         for pattern in exclude_patterns)

        except:
            return False

    def add_result(self, result, search_query):
        """
        Add processed result to the results list with relevance scoring
        """
        with self.content_lock:
            result['relevance_score'] = self.calculate_relevance(result, search_query)
            self.results.append(result)
            self.results.sort(key=lambda x: x['relevance_score'], reverse=True)

    def calculate_relevance(self, result, query):
        """
        Calculate relevance score for a result
        """
        score = 0
        query_terms = query.lower().split()

        # Get the content to score
        content = ''
        if 'content' in result:
            content = result['content'].lower()
        elif 'ocr_text' in result:
            content = result['ocr_text'].lower()

        # Term frequency scoring
        for term in query_terms:
            score += content.count(term)

        # Type-based boosting
        type_boost = {
            'pdf': 1.2,
            'html': 1.0,
            'image': 0.8
        }
        score *= type_boost.get(result['type'], 1.0)

        return score

async def run_crawler():
    # Example usage
    crawler = SuperPoweredCrawler()

    # Define seed URLs
    seed_urls = [
        "https://quantum-computing.ibm.com",
        "https://www.nature.com/subjects/quantum-computing",
        "https://www.scientificamerican.com/computing/",
        "https://phys.org/technology-news/quantum-computing/",
        "https://www.science.org/topic/tags/quantum-computing"
    ]

    # Start the crawl
    results = await crawler.start_crawl(seed_urls, "quantum computing latest developments")

    # Print results
    print("\nSearch Results:")
    for i, result in enumerate(results[:10], 1):
        print(f"\n{i}. {result['url']}")
        print(f"Type: {result['type']}")
        print(f"Score: {result['relevance_score']:.2f}")
        if 'summary' in result and result['summary']:
            print(f"Summary: {result['summary']}")

await run_crawler()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Starting crawl for query: quantum computing latest developments
Crawling: https://quantum-computing.ibm.com
Crawling: https://www.nature.com/subjects/quantum-computing
Crawling: https://www.scientificamerican.com/computing/
Crawling: https://phys.org/technology-news/quantum-computing/
Crawling: https://www.science.org/topic/tags/quantum-computing
Error status 403 for https://www.science.org/topic/tags/quantum-computing
Error status 404 for https://phys.org/technology-news/quantum-computing/
Error status 404 for https://www.nature.com/subjects/quantum-computing
Error processing page https://www.scientificamerican.com/computing/: 'APIRequestContext' object has no attribute 'response'
Error processing page https://quantum-computing.ibm.com: 'APIRequestContext' object has no attribute 'response'
Crawling: https://www.scientificamerican.com/article/artificial-intelligence-will-let-humanity-talk-to-alien-civilizations/
Crawling: https://www.scientificamerican.com/computing/?page=2
Error proc

In [30]:
class SuperPoweredCrawler:
    def __init__(self):
        self.visited_urls = set()
        self.url_queue = Queue()
        self.results = []
        self.content_lock = threading.Lock()

        # Initialize content extractors
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

        # Configurable crawler settings
        self.max_depth = 3
        self.max_pages_per_domain = 1000
        self.respect_robots = False  # Note: Be careful with this setting
        self.concurrent_requests = 50

        # Content type handlers
        self.content_handlers = {
            'text/html': self.process_html,
            'application/pdf': self.process_pdf,
            'image': self.process_image,
            'video': self.process_video
        }

    async def start_crawl(self, seed_urls, search_query):
        """
        Start the crawling process with multiple seed URLs
        """
        print(f"Starting crawl for query: {search_query}")

        for url in seed_urls:
            self.url_queue.put((url, 0))  # (url, depth)

        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch(headless=True)

            # Create multiple browser contexts for parallel processing
            contexts = [await browser.new_context() for _ in range(self.concurrent_requests)]

            # Start crawler workers
            tasks = []
            for context in contexts:
                task = asyncio.create_task(self.crawler_worker(context, search_query))
                tasks.append(task)

            await asyncio.gather(*tasks)
            await browser.close()

        return self.results

    async def crawler_worker(self, context, search_query):
        """
        Worker process for crawling pages
        """
        while not self.url_queue.empty():
            try:
                url, depth = self.url_queue.get_nowait()
            except:
                continue

            if depth > self.max_depth or url in self.visited_urls:
                continue

            try:
                page = await context.new_page()
                await page.goto(url, wait_until='networkidle', timeout=30000)

                # Get page content and metadata
                content = await page.content()
                title = await page.title()

                # Determine content type
                content_type = await self.detect_content_type(page)

                # Process content based on type
                if content_type in self.content_handlers:
                    result = await self.content_handlers[content_type](page, content, url)
                    if result:
                        self.add_result(result, search_query)

                # Extract new URLs
                new_urls = await self.extract_urls(page)
                for new_url in new_urls:
                    if self.should_crawl(new_url):
                        self.url_queue.put((new_url, depth + 1))

                await page.close()

            except Exception as e:
                print(f"Error crawling {url}: {str(e)}")
                continue

            self.visited_urls.add(url)

    async def detect_content_type(self, page):
        """
        Detect the type of content on the page
        """
        # Check response headers
        response = await page.main_frame.request.response()
        if response:
            content_type = response.headers.get('content-type', '')

            if 'pdf' in content_type:
                return 'application/pdf'
            elif any(img_type in content_type for img_type in ['image/jpeg', 'image/png', 'image/gif']):
                return 'image'
            elif 'video' in content_type:
                return 'video'

        return 'text/html'

    async def process_html(self, page, content, url):
        """
        Process HTML content and extract relevant information
        """
        soup = BeautifulSoup(content, 'html.parser')

        # Extract text content
        text_content = ' '.join([p.get_text() for p in soup.find_all('p')])

        # Extract metadata
        meta_tags = soup.find_all('meta')
        metadata = {
            tag.get('name', tag.get('property', '')): tag.get('content', '')
            for tag in meta_tags
        }

        # Get main content using readability algorithms
        main_content = await self.extract_main_content(page)

        # Generate summary if content is long enough
        summary = None
        if len(main_content) > 500:
            summary = self.summarizer(main_content, max_length=150, min_length=50)[0]['summary_text']

        return {
            'url': url,
            'type': 'html',
            'title': await page.title(),
            'content': main_content,
            'summary': summary,
            'metadata': metadata,
            'timestamp': time.time()
        }

    async def process_pdf(self, page, content, url):
        """
        Extract and process PDF content
        """
        try:
            # Download PDF content
            response = await page.main_frame.request.response()
            pdf_content = await response.body()

            # Process PDF using PyMuPDF
            pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
            text_content = ""
            images = []

            for page_num in range(pdf_document.page_count):
                page = pdf_document[page_num]
                text_content += page.get_text()

                # Extract images
                image_list = page.get_images(full=True)
                for img_index, img in enumerate(image_list):
                    xref = img[0]
                    base_image = pdf_document.extract_image(xref)
                    image_bytes = base_image["image"]
                    images.append(image_bytes)

            return {
                'url': url,
                'type': 'pdf',
                'content': text_content,
                'images': images,
                'page_count': pdf_document.page_count,
                'timestamp': time.time()
            }

        except Exception as e:
            print(f"Error processing PDF {url}: {str(e)}")
            return None

    async def process_image(self, page, content, url):
        """
        Process and analyze image content
        """
        try:
            # Get image data
            response = await page.main_frame.request.response()
            image_data = await response.body()

            # Convert to PIL Image
            image = Image.open(io.BytesIO(image_data))

            # Extract text using OCR
            ocr_text = pytesseract.image_to_string(image)

            # Basic image analysis
            width, height = image.size
            format = image.format
            mode = image.mode

            return {
                'url': url,
                'type': 'image',
                'ocr_text': ocr_text,
                'metadata': {
                    'width': width,
                    'height': height,
                    'format': format,
                    'mode': mode
                },
                'timestamp': time.time()
            }

        except Exception as e:
            print(f"Error processing image {url}: {str(e)}")
            return None

    async def process_video(self, page, content, url):
        """
        Extract information from video content
        """
        try:
            # Video metadata extraction using youtube-dl
            ydl_opts = {
                'format': 'best',
                'extract_flat': True,
                'force_generic_extractor': True
            }

            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                video_info = ydl.extract_info(url, download=False)

            return {
                'url': url,
                'type': 'video',
                'title': video_info.get('title'),
                'description': video_info.get('description'),
                'duration': video_info.get('duration'),
                'timestamp': time.time()
            }

        except Exception as e:
            print(f"Error processing video {url}: {str(e)}")
            return None

    def add_result(self, result, search_query):
        """
        Add processed result to the results list with relevance scoring
        """
        with self.content_lock:
            # Calculate relevance score
            relevance_score = self.calculate_relevance(result, search_query)
            result['relevance_score'] = relevance_score

            self.results.append(result)
            # Sort results by relevance
            self.results.sort(key=lambda x: x['relevance_score'], reverse=True)

    def calculate_relevance(self, result, query):
        """
        Calculate relevance score for a result
        """
        score = 0
        query_terms = query.lower().split()

        if 'content' in result:
            content = result['content'].lower()
            # Term frequency scoring
            for term in query_terms:
                score += content.count(term)

            # Position scoring
            for term in query_terms:
                pos = content.find(term)
                if pos != -1:
                    score += 1 / (pos + 1)

        # Type-based boosting
        type_boost = {
            'pdf': 1.2,  # Boost PDFs as they often contain detailed information
            'html': 1.0,
            'video': 1.1,
            'image': 0.9
        }
        score *= type_boost.get(result['type'], 1.0)

        return score

    async def extract_urls(self, page):
        """
        Extract URLs from the page
        """
        links = await page.evaluate('''() => {
            const links = Array.from(document.getElementsByTagName('a'));
            return links.map(link => link.href);
        }''')

        return [link for link in links if self.should_crawl(link)]

    def should_crawl(self, url):
        """
        Determine if a URL should be crawled
        """
        try:
            parsed = urlparse(url)

            # Basic URL filtering
            if not parsed.scheme in ['http', 'https']:
                return False

            # Check domain crawl limit
            domain = parsed.netloc
            domain_count = sum(1 for visited in self.visited_urls
                             if urlparse(visited).netloc == domain)

            if domain_count >= self.max_pages_per_domain:
                return False

            # Filter out common non-content URLs
            exclude_patterns = [
                r'\.(css|js|json|xml)$',
                r'(login|signup|logout)',
                r'(facebook|twitter|instagram)'
            ]

            return not any(re.search(pattern, url, re.I)
                         for pattern in exclude_patterns)

        except:
            return False

    async def extract_main_content(self, page):
        """
        Extract main content from webpage using readability
        """
        script = """
        () => {
            function getMainContent() {
                // Priority elements to check
                const priorities = ['article', 'main', '.content', '#content'];

                for (const selector of priorities) {
                    const element = document.querySelector(selector);
                    if (element) {
                        return element.innerText;
                    }
                }

                // Fallback to largest text block
                let largest = '';
                document.querySelectorAll('p').forEach(p => {
                    const text = p.innerText;
                    if (text.length > largest.length) {
                        largest = text;
                    }
                });

                return largest;
            }

            return getMainContent();
        }
        """

        content = await page.evaluate(script)
        return content

In [31]:
class SearchEngine:
  def __init__(self):
    self.crawler = SuperPoweredCrawler()

  async def search(self, query, seed_urls=None):
    """Perform a search using the crawler"""
    if seed_urls is None:
      #Default seed URLs for different content types

      seed_urls = [
        "https://arxiv.org",
        "https://scholar.google.com",
        "https://github.com",
        "https://medium.com",
        "https://youtube.com",
        # Add more seed URLs
      ]

    results = await self.crawler.start_crawl(seed_urls, query) # This line was incorrectly indented. Fixed by aligning with 'if' block.
    return self.rank_results(results, query)

  def rank_results(self, results, query):
    """ Rank and filter search results"""

    seen_urls = set()
    unique_results = []

    for result in results:
      url = result['url']
      if url not in seen_urls:
        seen_urls.add(url)
        unique_results.append(result)

    return unique_results

In [32]:
async def main():
    engine = SearchEngine()
    query = "quantum computing latest developments"
    results = await engine.search(query)

    for result in results[:10]:  # Top 10 results
        print(f"\nURL: {result['url']}")
        print(f"Type: {result['type']}")
        print(f"Score: {result['relevance_score']}")
        if 'summary' in result:
            print(f"Summary: {result['summary']}")

if __name__ == "__main__":
    await main()


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Starting crawl for query: quantum computing latest developments
Error crawling https://arxiv.org: 'Frame' object has no attribute 'request'
Error crawling https://medium.com: 'Frame' object has no attribute 'request'
Error crawling https://scholar.google.com: 'Frame' object has no attribute 'request'
Error crawling https://github.com: 'Frame' object has no attribute 'request'
Error crawling https://youtube.com: 'Frame' object has no attribute 'request'
