# Infinite Scroll Crawl with Scrapy

In [1]:
# Load variables from the .env file located by find_dotenv() before the
# rest of the notebook runs.
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

import scrapy

In [2]:

import json

class OpinionSpider(scrapy.Spider):
    """Collect article links from the Daily Bruin opinion category.

    The category page loads further articles through an AJAX call as the
    reader scrolls. ``parse`` handles the initial HTML page and
    ``parse_ajax`` handles the JSON pages that follow.

    NOTE(review): the endpoint in ``api_base`` and the ``html`` / ``next``
    keys read from its payload are assumptions about the site's API --
    confirm them against the browser's network tab.
    """

    name = "opinion"
    start_urls = ["https://dailybruin.com/category/opinion"]

    # Presumed pagination endpoint behind the infinite scroll (see NOTE above).
    api_base = "https://dailybruin.com/wp-json/dailybruin/v1/posts"

    def parse(self, response):
        """Yield article URLs from the category page, then request page 2.

        Yields:
            ``{"url": <absolute article URL>}`` items, followed by a request
            for the first AJAX page.
        """
        for href in response.css("h3 a::attr(href)").getall():
            # urljoin leaves absolute links untouched and resolves relative
            # ones against the page URL, so every item carries a usable URL.
            yield {"url": response.urljoin(href)}

        next_page = self.get_next_page_url(response)
        if next_page:
            yield scrapy.Request(next_page, callback=self.parse_ajax)

    def parse_ajax(self, response):
        """Yield article URLs from one AJAX page and follow its ``next`` link.

        Responses that are not JSON objects containing an ``html`` key are
        skipped instead of raising, so one bad page does not produce a
        traceback in the crawl log.
        """
        try:
            data = json.loads(response.text)
        except json.JSONDecodeError:
            self.logger.warning("Skipping non-JSON response from %s", response.url)
            return

        # A JSON list or scalar would make the key lookups below raise or
        # misbehave; only an object can carry the expected keys.
        if not isinstance(data, dict) or "html" not in data:
            return

        fragment = scrapy.Selector(text=data["html"])
        for href in fragment.css("h3 a::attr(href)").getall():
            yield {"url": response.urljoin(href)}

        next_page = data.get("next")
        if next_page:
            # scrapy.Request rejects URLs without a scheme, so resolve a
            # possibly relative "next" link before building the request.
            yield scrapy.Request(
                response.urljoin(next_page), callback=self.parse_ajax
            )

    def get_next_page_url(self, response, page_number=2):
        """Build the URL of an AJAX page of the opinion category.

        Args:
            response: The page that triggered pagination. Currently unused;
                kept so existing callers keep working.
            page_number: Page to request. Defaults to 2 because the HTML
                category page itself serves as page 1.

        Returns:
            The URL of the requested page as a string.
        """
        return f"{self.api_base}?page={page_number}&category=opinion"
