# Scrape the Columbia Spectator with Spider


In [38]:
# Patch asyncio so event loops can be re-entered (this is what nest_asyncio
# documents `apply()` as doing). Presumably needed because the notebook kernel
# already runs its own loop — confirm before removing.
import nest_asyncio
nest_asyncio.apply()

In [39]:
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

from pprint import pprint
from spider import Spider

import asyncio
from typing import List


In [57]:
# Shared Spider client used by every direct scrape call in this notebook.
# No arguments are passed — presumably the API key is picked up from the
# environment populated by load_dotenv above (TODO confirm).
app = Spider()

# Target pages on columbiaspectator.com. Index 0 appears to be the op-eds
# listing page; indices 1 and 2 are individual opinion articles.
urls = [
    "https://www.columbiaspectator.com/opinion/op-eds/1/",
    "https://www.columbiaspectator.com/opinion/2024/12/11/at-columbia-we-dont-strike-our-ideological-opponents/",
    "https://www.columbiaspectator.com/opinion/2024/12/08/columbias-complicity-in-cop29-the-greenwashing-of-human-rights-abuses/",
]

In [76]:
# Request options shared by every Spider call in this notebook: they are passed
# as `params` to `app.scrape_url` and to both SpiderWebReader instances.
# The meaning of each key is defined by the Spider API, not by this notebook.
# NOTE(review): "limit" is 1, yet the crawl cells below report 6 documents —
# confirm how the API applies the limit in crawl mode.
spider_params = {
    "limit": 1,
    "metadata": True,
    "request": "smart",
    # "smart_mode": True,
    "respect_robots": False,
    "return_format": "markdown",
    "anti_bot": True,
    "stealth": True,
    "fingerprint": True,
    "readability": True,
    "scroll": 500,
}

In [77]:
async def scrape_url(url: str) -> str:
    """Scrape one URL with the shared Spider client without blocking the event loop.

    `app.scrape_url` is an ordinary blocking call (the notebook also invokes it
    directly, without `await`). Calling it inline inside a coroutine would stall
    the event loop, so `asyncio.gather` over several of these coroutines would
    process the URLs one after another. The call is therefore handed to the
    loop's default thread-pool executor.

    Args:
        url: Address of the page to scrape.

    Returns:
        Whatever `app.scrape_url` returns for `url` with `spider_params`.
        NOTE(review): the annotation says `str`; confirm the client's actual
        return type.
    """
    loop = asyncio.get_running_loop()
    # run_in_executor only forwards positional arguments, so the keyword
    # argument is bound with a lambda.
    return await loop.run_in_executor(
        None, lambda: app.scrape_url(url, params=spider_params)
    )


async def scrape_urls(urls: List[str]) -> List[str]:
    """Scrape several URLs by awaiting one `scrape_url` coroutine per entry.

    Args:
        urls: Addresses to scrape.

    Returns:
        The gathered results, in the same order as `urls`.
    """
    pending = (scrape_url(target) for target in urls)
    gathered = await asyncio.gather(*pending)
    return gathered

In [78]:
# One-off synchronous scrape of urls[0]; this calls the client directly rather
# than going through the async helpers defined above.
md = app.scrape_url(urls[0], params=spider_params)

In [None]:
# Show the result's length next to the full payload.
# NOTE(review): this echoes the entire payload into the cell output; consider
# displaying only a slice before sharing the notebook.
len(md), md

## Spider Web Reader with LlamaIndex


In [54]:
import os
from llama_index.readers.web.spider_web.base import (
    SpiderWebReader,
)
# Read the key from the environment (populated by load_dotenv at the top of the
# notebook); os.getenv returns None when the variable is missing.
SPIDER_API_KEY = os.getenv("SPIDER_API_KEY")
# LlamaIndex reader backed by Spider, reusing the shared request options.
# NOTE(review): mode="crawl" makes this identical to `spider_crawler` defined
# further down — if this one was meant to demonstrate single-page scraping,
# confirm whether the mode should be "scrape".
spider_reader = SpiderWebReader(
    api_key=SPIDER_API_KEY,  # Get one at https://spider.cloud
    mode="crawl",
    params=spider_params
)


In [55]:
%%time

# Load documents starting from urls[0] via `spider_reader`; the last expression
# displays how many documents came back.
documents = spider_reader.load_data(url=urls[0])
len(documents)

CPU times: user 9.6 ms, sys: 4.29 ms, total: 13.9 ms
Wall time: 32.8 s


6

In [56]:
# Pretty-print every attribute of the first loaded document (id, metadata,
# embedding fields, ...).
pprint(vars(documents[0]))

{'embedding': None,
 'end_char_idx': None,
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'id_': '32a91511-3e1f-40f1-82dc-55603a00a4db',
 'metadata': {'description': 'Why are we at Columbia, and what is the purpose '
                             'of higher education? Many of us arrived at '
                             'Columbia fueled by curiosity and a yearning for '
                             'knowledge. However, in recent months, it has '
                             'become clear that not all members of our '
                             'community share a vision of open dialogue and '
                             'mutual learning.',
              'domain': 'www.columbiaspectator.com',
              'extracted_data': None,
              'file_size': 21517,
              'keywords': None,
              'pathname': '/opinion/2024/12/11/at-columbia-we-dont-strike-our-ideological-opponents/',
              'resource_type': '.md',
              'title': 'At 

In [58]:
# Second Spider-backed reader in crawl mode.
# NOTE(review): the arguments are identical to those of `spider_reader` created
# earlier in this notebook, so both objects behave the same — confirm whether
# one of them was meant to use a different mode.
spider_crawler = SpiderWebReader(
    api_key=SPIDER_API_KEY,  # read from the environment in an earlier cell
    mode="crawl",
    params=spider_params
)

In [59]:
%%time

# Load documents starting from urls[0] via `spider_crawler`. This overwrites the
# `documents` produced by the earlier `spider_reader` cell.
documents = spider_crawler.load_data(url=urls[0])
len(documents)

CPU times: user 8.49 ms, sys: 4.32 ms, total: 12.8 ms
Wall time: 37.8 s


6

In [None]:
# Pretty-print every attribute of the first document held in `documents`
# (whichever load cell ran last).
pprint(vars(documents[0]))

## Using Selenium


In [60]:
import json
import os
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


In [72]:
class WordPressScraper:
    """Scrape article content from a WordPress-style page using headless Chrome.

    The scraper loads `url`, waits for a typical content container to appear,
    scrolls to trigger lazily loaded content, and extracts text with
    BeautifulSoup. An instance is single-use: `scrape()` always quits the
    browser when it finishes.

    Args:
        url: Page to scrape.
        scroll_pause_time: Seconds to wait after each scroll for new content.
        chromedriver_path: Optional path to a ChromeDriver binary. When omitted,
            the `CHROMEDRIVER_PATH` environment variable is used if set;
            otherwise `webdriver.ChromeService()` is created without a path so
            Selenium locates the driver itself.
    """

    def __init__(self, url, scroll_pause_time=2, chromedriver_path=None):
        self.url = url
        self.scroll_pause_time = scroll_pause_time

        # Run Chrome without a visible window.
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')

        # Avoid baking a machine-specific absolute path into the notebook:
        # explicit argument first, then the environment, then Selenium's default.
        driver_path = chromedriver_path or os.environ.get("CHROMEDRIVER_PATH")
        if driver_path:
            service = webdriver.ChromeService(executable_path=driver_path)
        else:
            service = webdriver.ChromeService()

        self.driver = webdriver.Chrome(options=options, service=service)

    def scroll_to_bottom(self, max_scrolls=None):
        """Scroll to the bottom of the page repeatedly to trigger content loading.

        Args:
            max_scrolls: Maximum number of scroll actions to perform. `None`
                means keep scrolling until the page height stops growing;
                `0` means do not scroll at all.

        Returns:
            The number of scrolls after which the page height grew.
        """
        scrolls = 0
        last_height = self.driver.execute_script("return document.body.scrollHeight")

        # Check the cap *before* scrolling so exactly `max_scrolls` scroll actions
        # are performed at most, and compare against None so 0 is honoured.
        while max_scrolls is None or scrolls < max_scrolls:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Give the page time to load new content.
            time.sleep(self.scroll_pause_time)

            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                # Height unchanged: the bottom has been reached.
                break

            last_height = new_height
            scrolls += 1

        return scrolls

    def extract_main_content(self):
        """Extract article-like content from the currently loaded page.

        Selectors are tried in order; the first one that matches any element
        wins, and later selectors are not consulted.

        Returns:
            A list of dicts with keys `text`, `url` (href of the first link in
            the element, or None) and `title` (text of the first
            `h1.entry-title`, or None). Elements with empty text are skipped.
        """
        # Common WordPress article content selectors, most specific intent first.
        content_selectors = [
            'article',
            '.post-content',
            '.entry-content',
            '.post',
            'main',
            '#main-content'
        ]

        soup = BeautifulSoup(self.driver.page_source, 'html.parser')

        for selector in content_selectors:
            content_elements = soup.select(selector)
            if not content_elements:
                continue

            articles = []
            for element in content_elements:
                text = element.get_text(strip=True, separator=' ')
                if not text:
                    continue
                # Look each child up once instead of once for the test and once
                # for the value.
                link = element.find('a', href=True)
                heading = element.find('h1', class_='entry-title')
                articles.append({
                    'text': text,
                    'url': link['href'] if link else None,
                    'title': heading.text if heading else None
                })
            # Stop at the first selector that matched, even if it yielded nothing.
            return articles

        return []

    def scrape(self, max_scrolls=None):
        """Load the page, scroll, and extract its content.

        Args:
            max_scrolls: Passed through to `scroll_to_bottom`.

        Returns:
            `{'total_scrolls': int, 'articles': list}` on success. If anything
            raises, the error is printed and the dict additionally carries an
            `error` message with zero scrolls and no articles. The browser is
            quit in every case.
        """
        try:
            self.driver.get(self.url)

            # Containers whose presence indicates the content has rendered.
            selectors = [
                (By.TAG_NAME, "article"),
                (By.CLASS_NAME, "post-content"),
                (By.CLASS_NAME, "entry-content"),
                (By.TAG_NAME, "main"),
                (By.ID, "main-content")
            ]

            # Best effort: try each selector in turn (up to 20 s each) and move
            # on when it times out. Only the timeout is swallowed, so Ctrl-C and
            # genuine driver failures still propagate to the handler below.
            for selector in selectors:
                try:
                    WebDriverWait(self.driver, 20).until(
                        EC.presence_of_element_located(selector)
                    )
                    break
                except TimeoutException:
                    continue

            # Small extra delay so late-rendering content can settle.
            time.sleep(3)

            total_scrolls = self.scroll_to_bottom(max_scrolls)
            articles = self.extract_main_content()

            return {
                'total_scrolls': total_scrolls,
                'articles': articles
            }

        except Exception as e:
            print(f"An error occurred: {str(e)}")
            return {
                'error': str(e),
                'total_scrolls': 0,
                'articles': []
            }

        finally:
            # Always release the browser; the instance cannot be reused afterwards.
            self.driver.quit()

In [None]:
# Re-declares the same three target URLs that are defined near the top of the
# notebook, so the Selenium section can run on its own.
# NOTE(review): keep the two lists in sync or drop one of them.
urls = [
    "https://www.columbiaspectator.com/opinion/op-eds/1/",
    "https://www.columbiaspectator.com/opinion/2024/12/11/at-columbia-we-dont-strike-our-ideological-opponents/",
    "https://www.columbiaspectator.com/opinion/2024/12/08/columbias-complicity-in-cop29-the-greenwashing-of-human-rights-abuses/",
]

In [75]:
# Scrape the first individual article (urls[1]) with the Selenium-based scraper.
scraper = WordPressScraper(urls[1])
results = scraper.scrape(max_scrolls=1)  # scrolling is capped via max_scrolls

# Save results to file
# `scrape()` returns a dict even on failure (with an 'error' key), so the file
# is written either way; ensure_ascii=False keeps non-ASCII text readable.
with open('scraped_content.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)