<a href="https://colab.research.google.com/github/worldofaryavart/colab_notebooks/blob/colabnotebook/making_scraperModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Python packages for crawling and content extraction (Colab setup cell).
!pip install playwright beautifulsoup4 pytesseract pillow PyMuPDF youtube_dl transformers
# Download the Chromium build that Playwright launches in initialize_browser().
!playwright install chromium
# System binaries installed via apt (tesseract-ocr is the engine behind pytesseract).
!apt-get install -y poppler-utils
!apt-get install -y tesseract-ocr
# nest_asyncio is applied in a later cell before the crawler coroutines are awaited.
!pip install nest_asyncio
# Search / PDF / paper-discovery helpers imported further down.
!pip install duckduckgo_search
!pip install pdfplumber
!pip install arxiv



Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [10]:
# Fetch the large English spaCy model that is loaded below with spacy.load("en_core_web_lg").
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m883.5 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [11]:
import fitz  # PyMuPDF
import warnings
import nest_asyncio

In [13]:
# Standard library imports
import asyncio
import io
import os
import re
import time
import uuid
import threading
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

# Third-party imports
import aiohttp
import arxiv
import networkx as nx
import pdfplumber
import spacy
from bs4 import BeautifulSoup
from PIL import Image
import pytesseract
from duckduckgo_search import DDGS
from playwright.async_api import async_playwright
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import torch
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForQuestionAnswering,
    # AutoModelForSeq2SeqGeneration
)

import logging
import subprocess
import sys

# Optional: Enable GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize spaCy; download the model on first use if it is not installed.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    print("Downloading spaCy model...")
    # Use the kernel's own interpreter (a bare "python" on PATH may be a
    # different environment) and fail loudly if the download does not succeed.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_lg"],
        check=True,
    )
    nlp = spacy.load("en_core_web_lg")

# Configure logging (optional but recommended)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [14]:
# Silence FutureWarning noise emitted by the libraries used below.
warnings.filterwarnings(action='ignore', category=FutureWarning)

# Patch asyncio so coroutines can be awaited from inside the notebook's
# already-running event loop.
nest_asyncio.apply()

In [15]:
class SuperPoweredCrawler:
    def __init__(self):
        """Set up crawl limits, mutable crawl state, HTTP headers and the summarizer."""
        # --- Crawl limits (tunable per instance) ---
        self.max_depth = 2
        self.max_pages = 20
        self.max_pages_per_domain = 5
        self.concurrent_requests = 15

        # Page text must be longer than this many characters to be summarized.
        self.min_text_for_summary = 200

        # --- Mutable crawl state ---
        self.visited_urls = set()
        self.url_queue = asyncio.Queue()  # holds (url, depth) tuples
        self.results = []
        self.content_lock = threading.Lock()  # guards self.results in add_result()

        # Playwright handles; populated by initialize_browser().
        self.browser = None
        self.context = None

        # Headers sent with every browser request (the User-Agent is also
        # passed separately to the browser context).
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        }

        # Summarization model used by process_html_content(); pinned to CPU.
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device="cpu")

    async def initialize_browser(self):
        """Start Playwright, launch headless Chromium and open a browsing context.

        On success ``self.browser`` and ``self.context`` are set.

        Returns:
            The started Playwright driver handle; pass it to ``cleanup`` when
            the crawl is finished.

        Raises:
            Whatever the launch or context creation raised. The driver (and
            the browser, if it was already launched) is shut down first so a
            failed start does not leave processes behind.
        """
        playwright = await async_playwright().start()
        browser = None
        try:
            browser = await playwright.chromium.launch(headless=True)
            context = await browser.new_context(
                viewport={'width': 1200, 'height': 800},
                user_agent=self.headers['User-Agent'],
                extra_http_headers=self.headers
            )
        except Exception:
            # The caller never receives the handle on failure, so release
            # everything here before re-raising.
            if browser is not None:
                await browser.close()
            await playwright.stop()
            raise

        self.browser = browser
        self.context = context
        return playwright

    async def cleanup(self, playwright):
        """Cleanup browser resources"""
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        await playwright.stop()

    async def start_crawl(self, seed_urls, search_query):
        """Start the crawling process with multiple seed URLs"""
        print(f"Starting crawl for query: {search_query}")

        # Initialize the queue with seed URLs
        for url in seed_urls:
            await self.url_queue.put((url, 0))

        try:
            playwright = await self.initialize_browser()
            workers = [self.crawler_worker(search_query) for _ in range(self.concurrent_requests)]
            await asyncio.gather(*workers)

        except Exception as e:
            print(f"Error during crawling: {str(e)}")

        finally:
            await self.cleanup(playwright)

        return self.results

    async def crawler_worker(self, search_query):
        """Worker process for crawling pages"""
        while True:
            try:
                if len(self.visited_urls) >= self.max_pages:
                    break

                try:
                    url, depth = await asyncio.wait_for(self.url_queue.get(), timeout=10)
                except asyncio.TimeoutError:
                    break

                if depth > self.max_depth or url in self.visited_urls:
                    self.url_queue.task_done()
                    continue

                print(f"Crawling: {url}")

                try:
                    page = await self.context.new_page()
                    print("page is page", page)
                    response = await page.goto(
                        url,
                        wait_until='domcontentloaded',  # Fixed typo
                        timeout=3000
                    )
                    print("response is ", response)
                    if not response:
                        print(f"No response from {url}")
                        await page.close()
                        self.url_queue.task_done()
                        continue

                    if response.status >= 400:
                        print(f"Error response from {url}: {response.status}")
                        await page.close()
                        self.url_queue.task_done()
                        continue

                    content = await page.content()
                    result = await self.process_page(page, response, content, url)  # Fixed parameter order
                    if result:
                        self.add_result(result, search_query)

                    if len(self.visited_urls) < self.max_pages:
                        new_urls = await self.extract_urls(page)
                        for new_url in new_urls:
                            if self.should_crawl(new_url):
                                await self.url_queue.put((new_url, depth + 1))

                    self.visited_urls.add(url)
                    await page.close()
                    await asyncio.sleep(2)
                    print("page crawled")

                except Exception as e:
                    print(f"Error processing {url}: {str(e)}")
                    if 'page' in locals():
                        await page.close()

                self.url_queue.task_done()

            except Exception as e:
                print(f"Worker error: {str(e)}")
                continue


    async def process_page(self, page, response, content, url):
        """Process page content based on content type"""
        try:
            content_type = response.headers.get('content-type', '').lower()

            if 'pdf' in content_type:
                return await self.process_pdf_content(response, url)
            elif any(img_type in content_type for img_type in ['image/jpeg', 'image/png', 'image/gif']):
                return await self.process_image_content(response, url)
            else:
                return await self.process_html_content(page, content, url)

        except Exception as e:
            print(f"Error processing page {url}: {str(e)}")
            return None

    def should_crawl(self, url):
      """Determine if a URL should be crawled"""
      try:
        parsed = urlparse(url)

        if not parsed.scheme in ['http', 'https']:
          return False

        domain = parsed.netloc
        domain_count = sum(1 for visited in self.visited_urls
                           if urlparse(visited).netloc == domain)
        if domain_count >= self.max_pages_per_domain:
          return False

        exclude_patterns = [
                r'\.(css|js|json|xml)$',
                r'(login|signup|logout)',
                r'(facebook|twitter|instagram)',
                r'\.(jpg|jpeg|png|gif)$',  # Skip direct image links
                r'\/api\/',
                r'\/rss\/',
                r'\/feed\/',
                r'\/search\?',
                r'\/page\/\d+',
            ]

        return not any(re.search(pattern, url, re.I)
                         for pattern in exclude_patterns)

      except:
            return False

    async def process_html_content(self, page, content, url):
        """Extract text, title and (for longer pages) a summary from HTML.

        Args:
            page: Browser page, used only to read the document title.
            content: HTML source of the page.
            url: URL the content was loaded from.

        Returns:
            dict with ``url``, ``type`` ('html'), ``title``, ``content``
            (at most 5000 characters), ``summary`` (or None) and
            ``timestamp``; ``None`` if processing failed.
        """
        try:
            soup = BeautifulSoup(content, 'html.parser')

            # Collect text from the outermost <p>/<article>/<section> elements
            # only: the text of an <article> already contains its nested
            # paragraphs, so including both would duplicate the text.
            text_tags = ['p', 'article', 'section']
            outermost = [
                element for element in soup.find_all(text_tags)
                if element.find_parent(text_tags) is None
            ]
            text_content = ' '.join(element.get_text(' ') for element in outermost)
            text_content = re.sub(r'\s+', ' ', text_content).strip()

            # Get title
            title = await page.title()

            # Generate summary if content is long enough
            summary = None
            if len(text_content) > self.min_text_for_summary:
                try:
                    # Scale the summary length bounds with the word count.
                    content_length = len(text_content.split())
                    max_length = min(150, content_length - 50)
                    min_length = min(50, max_length - 20)

                    # min_length is always below max_length, so the only
                    # invalid case is non-positive bounds on short texts;
                    # skip the summary then.
                    if min_length > 0:
                        summary = self.summarizer(
                            text_content[:4096],
                            max_length=max_length,
                            min_length=min_length
                        )[0]['summary_text']
                except Exception as e:
                    print(f"Error generating summary: {str(e)}")

            return {
                'url': url,
                'type': 'html',
                'title': title,
                'content': text_content[:5000],  # Limit content length
                'summary': summary,
                'timestamp': time.time()
            }

        except Exception as e:
            print(f"Error processing HTML content for {url}: {str(e)}")
            return None

    async def process_pdf_content(self, response, url):
        """Extract text from the first 10 pages of a PDF response.

        Args:
            response: Browser response whose body is the PDF bytes.
            url: URL the PDF was loaded from.

        Returns:
            dict with ``url``, ``type`` ('pdf'), ``content`` (at most 5000
            characters), ``page_count`` and ``timestamp``; ``None`` on error.
        """
        try:
            pdf_data = await response.body()
            pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
            try:
                page_count = pdf_document.page_count
                page_texts = [
                    pdf_document[page_num].get_text()
                    for page_num in range(min(page_count, 10))  # Limit to first 10 pages
                ]
            finally:
                # Release the document even if text extraction fails.
                pdf_document.close()

            text_content = "".join(page_texts)

            return {
                'url': url,
                'type': 'pdf',
                'content': text_content[:5000],  # Limit content length
                'page_count': page_count,
                'timestamp': time.time()
            }

        except Exception as e:
            print(f"Error processing PDF {url}: {str(e)}")
            return None

    async def process_image_content(self, response, url):
        """Run OCR on an image response and collect basic image metadata.

        Args:
            response: Browser response whose body is the image bytes.
            url: URL the image was loaded from.

        Returns:
            dict with ``url``, ``type`` ('image'), ``ocr_text`` (empty string
            if OCR failed), ``metadata`` (width, height, format) and
            ``timestamp``; ``None`` if the image could not be read.
        """
        try:
            image_data = await response.body()

            with Image.open(io.BytesIO(image_data)) as image:
                # OCR is best effort: report the failure and keep the metadata.
                try:
                    ocr_text = pytesseract.image_to_string(image)
                except Exception as ocr_error:
                    print(f"OCR failed for {url}: {str(ocr_error)}")
                    ocr_text = ""

                metadata = {
                    'width': image.size[0],
                    'height': image.size[1],
                    'format': image.format
                }

            return {
                'url': url,
                'type': 'image',
                'ocr_text': ocr_text,
                'metadata': metadata,
                'timestamp': time.time()
            }

        except Exception as e:
            print(f"Error processing image {url}: {str(e)}")
            return None

    async def extract_urls(self, page):
        """
        Extract URLs from the page
        """
        try:
            # Get all links using JavaScript evaluation
            links = await page.evaluate('''() => {
                const links = Array.from(document.getElementsByTagName('a'));
                return links.map(link => link.href).filter(href => href);
            }''')

            return list(set(links))  # Remove duplicates

        except Exception as e:
            print(f"Error extracting URLs: {str(e)}")
            return []

    def add_result(self, result, search_query):
        """
        Add processed result to the results list with relevance scoring
        """
        with self.content_lock:
            result['relevance_score'] = self.calculate_relevance(result, search_query)
            self.results.append(result)
            self.results.sort(key=lambda x: x['relevance_score'], reverse=True)

    def calculate_relevance(self, result, query):
        """
        Calculate relevance score for a result
        """
        score = 0
        query_terms = query.lower().split()

        # Get the content to score
        content = ''
        if 'content' in result:
            content = result['content'].lower()
        elif 'ocr_text' in result:
            content = result['ocr_text'].lower()

        # Term frequency scoring
        for term in query_terms:
            score += content.count(term)

        # Type-based boosting
        type_boost = {
            'pdf': 1.2,
            'html': 1.0,
            'image': 0.8
        }
        score *= type_boost.get(result['type'], 1.0)

        return score


In [16]:
class EnhancedSuperPoweredCrawler(SuperPoweredCrawler):
    def __init__(self):
        """Extend the base crawler with search, robots.txt and scoring state."""
        super().__init__()

        # Worker count for this subclass (overrides the value set by the base class).
        self.concurrent_requests = 10

        # Per-origin robots.txt parsers and cached authority scores.
        self.robots_cache = {}
        self.domain_scores = {}

        # HTTP session; created lazily once an event loop is running.
        self.aiohttp_session = None

        # Search client, relevance vectorizer and worker thread pool.
        self.ddgs = DDGS()
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        self.thread_pool = ThreadPoolExecutor(max_workers=15)

    async def initialize(self):
        """Start the browser and open the aiohttp session.

        Returns:
            The Playwright driver handle from ``initialize_browser``. It was
            previously discarded, which made it impossible to stop the driver;
            pass it to ``cleanup`` when done.
        """
        playwright = await super().initialize_browser()
        # Reuse a live session instead of overwriting (and leaking) it.
        if self.aiohttp_session is None or self.aiohttp_session.closed:
            self.aiohttp_session = aiohttp.ClientSession(headers=self.headers)
        return playwright

    async def cleanup(self, playwright):
        """Release the browser, the aiohttp session and the thread pool.

        The session and thread pool are released even if shutting down the
        browser raises.

        Args:
            playwright: Handle returned by ``initialize_browser``.
        """
        try:
            await super().cleanup(playwright)
        finally:
            # Detach the session first so a second cleanup call is harmless.
            session, self.aiohttp_session = self.aiohttp_session, None
            try:
                if session is not None:
                    await session.close()
            finally:
                self.thread_pool.shutdown()

    async def discover_seed_urls(self, search_query, num_results=20, max_seeds=10):
        """Find seed URLs for ``search_query`` via DuckDuckGo.

        Each hit is checked against robots.txt and given a domain score; the
        best ``max_seeds`` URLs are returned.

        Both checks need an aiohttp session. This method is called before the
        crawl has created one, in which case the checks used to fail silently
        and fall back to their defaults. A temporary session is now opened
        for the duration of the call when none exists.

        Args:
            search_query: Text to search for.
            num_results: Maximum number of search hits to request.
            max_seeds: Maximum number of URLs to return.

        Returns:
            list[str]: URLs ordered by descending domain score; empty on error.
        """
        owns_session = self.aiohttp_session is None
        try:
            print("trying to discover seed urls")
            search_results = list(self.ddgs.text(
                search_query,
                max_results=num_results
            ))

            # Skip hits without a link instead of aborting the whole discovery.
            urls = [result['href'] for result in search_results if result.get('href')]
            print("urls are ", urls)

            if owns_session:
                self.aiohttp_session = aiohttp.ClientSession(headers=self.headers)

            # Score and filter URLs
            scored_urls = []
            for url in urls:
                if await self.check_robots_txt(url):
                    domain_score = await self.calculate_domain_score(url)
                    scored_urls.append((url, domain_score))

            # Sort by domain score and return top URLs
            scored_urls.sort(key=lambda x: x[1], reverse=True)
            return [url for url, _ in scored_urls[:max_seeds]]

        except Exception as e:
            print(f"Error discovering seed URLs: {str(e)}")
            return []

        finally:
            # Only close a session this call opened itself.
            if owns_session and self.aiohttp_session is not None:
                await self.aiohttp_session.close()
                self.aiohttp_session = None

    async def check_robots_txt(self, url):
        """Return whether robots.txt of the URL's origin permits fetching it.

        The parsed robots.txt is cached per ``scheme://netloc``. A non-200
        response is cached as "no rules". Any failure while fetching or
        parsing results in the URL being allowed.
        """
        try:
            parts = urlparse(url)
            origin = f"{parts.scheme}://{parts.netloc}"

            if origin not in self.robots_cache:
                async with self.aiohttp_session.get(f"{origin}/robots.txt") as response:
                    parser = None
                    if response.status == 200:
                        body = await response.text()
                        parser = RobotFileParser()
                        parser.parse(body.splitlines())
                    self.robots_cache[origin] = parser

            rules = self.robots_cache[origin]
            if rules is None:
                # No usable robots.txt for this origin: nothing is disallowed.
                return True
            return rules.can_fetch(self.headers['User-Agent'], url)

        except Exception:
            return True  # Allow by default if robots.txt check fails

    async def calculate_domain_score(self, url):
        """Estimate an authority score for ``url`` from one GET request.

        Starts at 1.0, is scaled down for slow responses (never below a
        factor of 0.5) and scaled up for structured data, an
        ``<article>``/``<main>`` element and more than 2000 characters of
        text. Any failure yields 0.5.

        The score is cached under the URL and, if not already present, under
        the URL's netloc, which is the key ``get_quality_boost`` looks up.

        Returns:
            float: The (possibly cached) score.
        """
        if url in self.domain_scores:
            return self.domain_scores[url]

        try:
            # aiohttp responses have no ``elapsed`` attribute (that is the
            # ``requests`` API), so time the request ourselves.
            started = time.monotonic()
            async with self.aiohttp_session.get(url) as response:
                response_time = time.monotonic() - started
                score = 1.0

                # Factor 1: Response time
                score *= max(0.5, 1 - (response_time / 5))

                # Factor 2: Content quality indicators
                if response.status == 200:
                    content = await response.text()
                    soup = BeautifulSoup(content, 'html.parser')

                    # Check for structured data
                    if soup.find(type="application/ld+json"):
                        score *= 1.2

                    # Check for proper HTML structure
                    if soup.find('article') or soup.find('main'):
                        score *= 1.1

                    # Check content length
                    text_content = soup.get_text()
                    if len(text_content) > 2000:
                        score *= 1.2

            self.domain_scores[url] = score
            self.domain_scores.setdefault(urlparse(url).netloc, score)
            return score

        except Exception:
            self.domain_scores[url] = 0.5
            return 0.5

    async def start_crawl(self, seed_urls=None, search_query=None):
        """Crawl from ``seed_urls``, discovering seeds from the query if none are given.

        The aiohttp session is opened *before* seed discovery, because the
        robots.txt and domain-score checks need it. The browser is launched
        exactly once; previously it was started twice and the first instance
        was never closed.

        Args:
            seed_urls: Optional list of start URLs.
            search_query: Query used for seed discovery and relevance scoring.

        Returns:
            ``self.results`` sorted by descending ``relevance_score``.

        Raises:
            ValueError: If no seed URLs were given and none could be discovered.
        """
        playwright = None
        try:
            if self.aiohttp_session is None or self.aiohttp_session.closed:
                self.aiohttp_session = aiohttp.ClientSession(headers=self.headers)

            if not seed_urls and search_query:
                seed_urls = await self.discover_seed_urls(search_query)

            if not seed_urls:
                raise ValueError("No seed URLs available for crawling")

            for url in seed_urls:
                await self.url_queue.put((url, 0))

            playwright = await self.initialize_browser()
            workers = [self.crawler_worker(search_query) for _ in range(self.concurrent_requests)]
            await asyncio.gather(*workers)

        finally:
            if playwright is not None:
                await self.cleanup(playwright)
            elif self.aiohttp_session is not None:
                # The browser never started, so only the session needs closing.
                await self.aiohttp_session.close()
                self.aiohttp_session = None

        self.results.sort(key=lambda x: x['relevance_score'], reverse=True)
        return self.results

    def calculate_relevance(self, result, query):
        """Score a result by TF-IDF similarity between its text and the query.

        The text is ``result['content']`` or, if that is empty,
        ``result['ocr_text']``. The similarity is multiplied by the type
        boost and the quality boost.

        Returns:
            The boosted similarity; 0 if there is no text or scoring failed.
        """
        try:
            # Get content based on result type
            content = result.get('content', '') or result.get('ocr_text', '')
            if not content:
                return 0

            # Two-document corpus: row 0 is the page text, row 1 the query.
            tfidf_matrix = self.tfidf_vectorizer.fit_transform([content, query])

            # ``@`` and ``.toarray()`` behave the same on SciPy sparse
            # matrices and sparse arrays; ``*`` and ``.A`` are only valid on
            # the legacy matrix type.
            similarity = (tfidf_matrix @ tfidf_matrix.T).toarray()[0][1]

            # Apply type and quality boosting
            score = similarity * self.get_type_boost(result)
            score *= self.get_quality_boost(result)

            return score

        except Exception as e:
            print(f"Error calculating relevance: {str(e)}")
            return 0

    def get_type_boost(self, result):
        """Return the relevance multiplier for ``result['type']``.

        PDFs are weighted 1.3 and images 0.7; HTML and any other type get 1.0.
        """
        kind = result['type']
        if kind == 'pdf':
            return 1.3
        if kind == 'image':
            return 0.7
        return 1.0

    def get_quality_boost(self, result):
        """Return a multiplier based on content length and domain score.

        * 1.2 for long content. The extractors truncate text to 5000
          characters, so the threshold is inclusive; a strict ``> 5000``
          could never be reached by HTML results.
        * ``1 + domain_score``. Scores are looked up by netloc first and by
          the full URL second, because ``calculate_domain_score`` caches by
          URL. The default is 0.5.
        """
        boost = 1.0

        # Boost based on content length
        content = result.get('content', '') or result.get('ocr_text', '')
        if len(content) >= 5000:
            boost *= 1.2

        # Boost based on URL authority
        url = result['url']
        domain_score = self.domain_scores.get(
            urlparse(url).netloc,
            self.domain_scores.get(url, 0.5)
        )
        boost *= (1 + domain_score)

        return boost

In [17]:
class GodLevelCrawler(EnhancedSuperPoweredCrawler):
    def __init__(self):
        """Load the extra NLP models and raise the crawl limits."""
        super().__init__()

        # Crawl limits (override the inherited defaults) and feature switches.
        self.max_depth = 3
        self.max_pages = 50
        self.enable_knowledge_synthesis = True
        self.enable_multimodal = True

        # PDF text/table extraction backend.
        self.pdf_parser = pdfplumber

        # Hugging Face pipelines: extractive QA, zero-shot topic ranking and
        # free-text generation.
        self.qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
        self.zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
        self.text_generator = pipeline("text-generation", model="gpt2-xl")

        # Knowledge graph of extracted entities plus the spaCy model that
        # finds them.
        self.knowledge_graph = nx.DiGraph()
        self.entity_linker = spacy.load("en_core_web_lg")

    async def process_pdf_content(self, response, url):
        """Extract text, tables, images and metadata from a PDF response.

        The PDF is parsed from memory, so no temporary file is written or
        left behind on errors. Only the first 10 pages are read.

        Metadata comes from an ``extract_pdf_metadata(pdf_data)`` method when
        the instance provides one (none is defined in this notebook);
        otherwise the parser's document metadata is used.

        NLP enhancement is best effort: if it fails, the extracted PDF
        content is still returned.

        Returns:
            dict with ``url``, ``type`` ('pdf'), ``content`` (at most 10000
            characters), ``tables``, ``figures``, ``metadata``, ``timestamp``
            and, when enhancement succeeded, its extra keys; ``None`` if the
            PDF could not be read.
        """
        try:
            pdf_data = await response.body()

            text_parts = []
            tables = []
            figures = []
            with self.pdf_parser.open(io.BytesIO(pdf_data)) as pdf:
                metadata = dict(pdf.metadata or {})
                for page in pdf.pages[:10]:  # First 10 pages
                    text_parts.append(page.extract_text() or "")
                    tables.append(page.extract_tables())
                    figures.append(page.images)
            text_content = "".join(text_parts)

            extract_metadata = getattr(self, 'extract_pdf_metadata', None)
            if extract_metadata is not None:
                metadata = extract_metadata(pdf_data)

            # Process extracted content
            processed_content = {
                'url': url,
                'type': 'pdf',
                'content': text_content[:10000],  # Extended content limit
                'tables': tables,
                'figures': figures,
                'metadata': metadata,
                'timestamp': time.time()
            }

            # Enhance content with NLP
            if self.enable_knowledge_synthesis:
                try:
                    processed_content.update(await self.enhance_content(processed_content))
                except Exception as e:
                    print(f"Error enhancing PDF content for {url}: {str(e)}")

            return processed_content

        except Exception as e:
            print(f"Error processing PDF {url}: {str(e)}")
            return None

    async def enhance_content(self, content):
        """Derive entities, insights, topics and graph connections from text.

        Args:
            content: Result dict; only ``content['content']`` is read.

        Returns:
            dict with ``entities`` (list of (text, label) pairs),
            ``insights``, ``topics`` and ``knowledge_connections``.
        """
        text = content['content']

        # Extract key concepts and entities
        doc = self.entity_linker(text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]

        # Generate insights
        key_insights = await self.generate_insights(text)

        # Classify content topics. No ``classify_topics`` method is defined
        # in this notebook, so treat it as an optional hook rather than
        # failing the whole enhancement with AttributeError.
        classify_topics = getattr(self, 'classify_topics', None)
        topics = await classify_topics(text) if classify_topics is not None else []

        # Update knowledge graph
        self.update_knowledge_graph(entities, key_insights)

        return {
            'entities': entities,
            'insights': key_insights,
            'topics': topics,
            'knowledge_connections': self.find_knowledge_connections(entities)
        }

    async def generate_insights(self, text):
        """Answer up to five analytical questions about ``text``.

        Returns:
            list of dicts with ``question``, ``answer`` and ``confidence``;
            an empty list if question generation or answering fails.
        """
        def answer(question):
            # One extractive QA call over the full text per question.
            reply = self.qa_pipeline(question=question, context=text)
            return {
                'question': question,
                'answer': reply['answer'],
                'confidence': reply['score']
            }

        try:
            top_questions = self.generate_analytical_questions(text)[:5]
            return [answer(question) for question in top_questions]
        except Exception as e:
            print(f"Error generating insights: {str(e)}")
            return []

    def generate_analytical_questions(self, text):
        """Return the analytical questions ordered by relevance to ``text``.

        The zero-shot classifier returns ``labels`` and ``scores`` as
        parallel lists in its own order, not in the order of
        ``candidate_labels``. Each score is therefore mapped back to its
        question through the returned label; indexing ``prompts`` with score
        positions would pair scores with the wrong questions.

        Returns:
            list[str]: All prompts, most relevant first.
        """
        prompts = [
            "What are the main contributions?",
            "What are the key limitations?",
            "What are the novel aspects?",
            "What are the potential applications?",
            "What are the underlying assumptions?"
        ]
        prompt_by_label = {prompt.lower(): prompt for prompt in prompts}

        # Use zero-shot classification to identify most relevant questions
        results = self.zero_shot_classifier(
            text,
            candidate_labels=list(prompt_by_label),
            multi_label=True
        )

        # Sort questions by relevance
        ranked = sorted(
            zip(results['scores'], results['labels']),
            key=lambda pair: pair[0],
            reverse=True
        )
        sorted_questions = [
            prompt_by_label[label] for _, label in ranked if label in prompt_by_label
        ]

        # Keep any prompt the classifier did not report, at the end.
        sorted_questions.extend(p for p in prompts if p not in sorted_questions)
        return sorted_questions

    def update_knowledge_graph(self, entities, insights):
        """Add new entities as nodes and link them using the insights.

        Args:
            entities: Iterable of (entity_text, entity_type) pairs.
            insights: Iterable of insight dicts passed one by one to
                ``add_knowledge_connections``.
        """
        for entity, entity_type in entities:
            if entity not in self.knowledge_graph:
                self.knowledge_graph.add_node(entity, type=entity_type)

        # Add connections based on insights. ``add_knowledge_connections`` is
        # not defined in this notebook; treat it as an optional hook so
        # adding nodes does not end in an AttributeError.
        add_connections = getattr(self, 'add_knowledge_connections', None)
        if add_connections is None:
            return
        for insight in insights:
            add_connections(insight)

    def find_knowledge_connections(self, entities):
        """Collect connections around each known entity in the knowledge graph.

        For every entity present in the graph, its radius-2 neighbourhood is
        passed to ``analyze_subgraph`` and the returned items are
        concatenated.

        Returns:
            list: Combined analysis results; empty when no
            ``analyze_subgraph`` method is available (none is defined in
            this notebook).
        """
        analyze_subgraph = getattr(self, 'analyze_subgraph', None)
        if analyze_subgraph is None:
            return []

        connections = []
        for entity, _ in entities:
            if entity in self.knowledge_graph:
                # Find subgraph around entity
                subgraph = nx.ego_graph(self.knowledge_graph, entity, radius=2)
                connections.extend(analyze_subgraph(subgraph))
        return connections

    async def start_crawl(self, seed_urls=None, search_query=None):
        """Crawl from the given seeds, or discover web and paper seeds from the query.

        When no seeds are supplied and a query is, web search results are
        combined with academic paper links before delegating to the parent
        crawl.
        """
        if search_query and not seed_urls:
            web_seeds = await self.discover_seed_urls(search_query)
            paper_seeds = await self.discover_academic_papers(search_query)
            web_seeds.extend(paper_seeds)
            seed_urls = web_seeds

        return await super().start_crawl(seed_urls, search_query)

    async def discover_academic_papers(self, query):
        """Return PDF links for up to five arXiv papers matching ``query``.

        Uses ``arxiv.Client().results(search)`` when the installed package
        provides it (``Search.results()`` is deprecated) and falls back to
        the old call otherwise. Papers without a PDF link are skipped.

        Returns:
            list[str]: PDF URLs; empty if the lookup fails.
        """
        try:
            search = arxiv.Search(query=query, max_results=5)
            if hasattr(arxiv, 'Client'):
                papers = arxiv.Client().results(search)
            else:
                papers = search.results()
            return [paper.pdf_url for paper in papers if paper.pdf_url]
        except Exception as e:
            print(f"Error discovering papers: {str(e)}")
            return []

In [18]:
# Demo: crawl starting from a single arXiv PDF with the full-featured crawler.
# Constructing GodLevelCrawler downloads several Hugging Face models (see the
# progress bars in the output). In the recorded run the PDF navigation failed
# with net::ERR_ABORTED.
crawler = GodLevelCrawler()
results = await crawler.start_crawl(
    seed_urls=["https://arxiv.org/pdf/1706.03762v6"],
    search_query="explain attention all you need paper"
)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Crawling: https://arxiv.org/pdf/1706.03762v6
page is page <Page url='about:blank'>
Error processing https://arxiv.org/pdf/1706.03762v6: Page.goto: net::ERR_ABORTED at https://arxiv.org/pdf/1706.03762v6
Call log:
navigating to "https://arxiv.org/pdf/1706.03762v6", waiting until "domcontentloaded"



In [7]:
# Demo: crawl starting from a Wikipedia article with the enhanced crawler.
# Alternative (disabled): use the base crawler instead.
# crawler = SuperPoweredCrawler()

seed_urls = [
    # Disabled seed: direct PDF link.
    # "https://arxiv.org/pdf/1706.03762v6",
    "https://en.wikipedia.org/wiki/Attention_Is_All_You_Need"
]

# Alternative (disabled): base-crawler run with a different query.
# results = await crawler.start_crawl(seed_urls, "quantum computing latest developments")

# `results` is consumed by the printing cell below; if this cell fails, that
# cell raises NameError (as in the recorded output).
crawler = EnhancedSuperPoweredCrawler()
results = await crawler.start_crawl(seed_urls = seed_urls, search_query="explain attention all you need paper")
print("Cell is completed")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Crawling: https://en.wikipedia.org/wiki/Attention_Is_All_You_Need
page is page <Page url='about:blank'>
response is  <Response url='https://en.wikipedia.org/wiki/Attention_Is_All_You_Need' request=<Request url='https://en.wikipedia.org/wiki/Attention_Is_All_You_Need' method='GET'>>
page crawled
Crawling: https://en.wikipedia.org/wiki/Attention_Is_All_You_Need#cite_note-:1-9
page is page <Page url='about:blank'>
response is  <Response url='https://en.wikipedia.org/wiki/Attention_Is_All_You_Need' request=<Request url='https://en.wikipedia.org/wiki/Attention_Is_All_You_Need' method='GET'>>
page crawled
Crawling: https://en.wikipedia.org/w/index.php?title=Special:CiteThisPage&page=Attention_Is_All_You_Need&id=1252891245&wpFormIdentifier=titleform
page is page <Page url='about:blank'>
response is  <Response url='https://en.wikipedia.org/w/index.php?title=Special:CiteThisPage&page=Attention_Is_All_You_Need&id=1252891245&wpFormIdentifier=titleform' request=<Request url='https://en.wikipedia.o

Exception: : Connection closed while reading from the driver

In [8]:
# Show the five best results; start_crawl() returns them sorted by
# relevance_score, highest first.
print("\nSearch Results:")
for i, result in enumerate(results[:5], start=1):
    print(f"\n{i}. {result['url']}")
    print(f"Type: {result['type']}")
    print(f"Score: {result['relevance_score']:.2f}")
    # Only HTML results carry a summary, and it may be None.
    if result.get('summary'):
        print(f"Summary: {result['summary']}")


Search Results:


NameError: name 'results' is not defined

In [None]:
# async def run_crawler():
#     """Main function to run the crawler"""
#     crawler = SuperPoweredCrawler()

#     seed_urls = [
#         "https://research.ibm.com/quantum-computing",
#         "https://www.nature.com/subjects/quantum-physics",
#         "https://www.scientificamerican.com/computing/",
#         "https://www.quantum-computing.news/",
#         "https://quantumcomputing.stackexchange.com/"
#     ]

#     results = await crawler.start_crawl(seed_urls, "quantum computing latest developments")

#     print("\nSearch Results:")
#     for i, result in enumerate(results[:10], 1):
#         print(f"\n{i}. {result['url']}")
#         print(f"Type: {result['type']}")
#         print(f"Score: {result['relevance_score']:.2f}")
#         if 'summary' in result and result['summary']:
#             print(f"Summary: {result['summary']}")

# if __name__ == "__main__":
#     asyncio.run(run_crawler())