In [None]:
import scrapy
import requests
import asyncio
from scrapy.crawler import Crawler
import json
from urllib.parse import quote
import re

In [None]:
class NetworkManager:
    """Hold default HTTP headers and proxy settings and build sessions from them.

    Args:
        headers: Optional header mapping. Defaults to a browser-like
            ``User-Agent`` / ``Accept`` pair.
        proxies: Optional ``requests``-style proxy mapping (scheme -> proxy).
            Defaults to the single hard-coded HTTP proxy.
    """

    def __init__(self, headers=None, proxies=None):
        default_headers = {
            'User-Agent': 'Mozilla/5.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        # NOTE: only the 'http' scheme is mapped here, so requests to
        # https:// URLs are not routed through this proxy.
        default_proxies = {'http': "108.161.135.118"}

        # Copy the mappings so later mutation of the instance attributes never
        # touches a dict owned by the caller.
        self.headers = dict(default_headers if headers is None else headers)
        self.proxies = dict(default_proxies if proxies is None else proxies)

    def create_session(self) -> "requests.Session":
        """Build a ``requests.Session`` preloaded with the proxies and headers.

        Returns:
            A new session; the caller is responsible for closing it.

        Raises:
            ValueError: If no proxy is configured or ``headers`` is ``None``.
        """
        if not self.proxies:
            raise ValueError('no proxy available')
        if self.headers is None:
            raise ValueError('headers are not valid')

        client = requests.Session()
        client.proxies.update(self.proxies)
        client.headers.update(self.headers)
        return client

    def check_status(self, urls: list, session=None, timeout: float = 10.0) -> dict:
        """Request every URL and report its HTTP status code.

        Args:
            urls: URLs to probe, in order.
            session: Optional session-like object exposing ``get(url, timeout=...)``.
                When omitted, a session is created with ``create_session`` and
                closed again before returning.
            timeout: Per-request timeout in seconds.

        Returns:
            A dict mapping each URL to its status code, or to ``None`` when the
            request itself failed.
        """
        owns_session = session is None
        client = self.create_session() if owns_session else session

        statuses = {}
        try:
            for url in urls:
                try:
                    response = client.get(url, timeout=timeout)
                except requests.exceptions.RequestException as exc:
                    # One unreachable host must not abort the remaining checks.
                    print(f'{url} | Request failed: {exc}')
                    statuses[url] = None
                    continue
                print(f'{url} | Status code: {response.status_code}')
                statuses[url] = response.status_code
        finally:
            # Only close sessions created here; injected ones belong to the caller.
            if owns_session:
                client.close()
        return statuses






class NYTimesSpider(scrapy.Spider):
    """Scrape the NYTimes business/media section through its GraphQL endpoint.

    The spider builds a persisted-query URL, probes it once with a blocking
    ``requests`` session, and then lets Scrapy page through the results by
    following the ``endCursor`` returned with each response.

    Args:
        pages_to_parse: Maximum number of result pages to request. Spider
            arguments passed with ``-a`` arrive as strings, so the value is
            coerced with ``int``.
    """

    # Scrapy configs
    name = "nytimesspider"
    allowed_domains = ["nytimes.com"]
    start_urls = ["https://nytimes.com/section/business/media"]

    # Timeout (seconds) for the blocking ``requests`` calls made outside Scrapy.
    _REQUEST_TIMEOUT = 10

    def __init__(self, *args, pages_to_parse=1, **kwargs):
        super().__init__(*args, **kwargs)

        self.config = NetworkManager()
        self.session: requests.Session = self.config.create_session()
        self.pages_to_parse = int(pages_to_parse)

    def _get_tokens(self):
        """Fetch the NYTimes home page and extract app tokens from its markup.

        Returns:
            A dict with the keys ``nyt_token``, ``nyt-app-type`` and
            ``nyt-app-version`` (value ``None`` for any key not found in the
            page), or ``None`` when the request fails or the response is
            rejected by the validation helpers.
        """
        try:
            r = self.session.get(url="https://nytimes.com", timeout=self._REQUEST_TIMEOUT)
            r.raise_for_status()
        except requests.exceptions.RequestException as e:
            self.logger.error("Request failed: %s", e)
            return None

        if not self._is_valid_response(r) or not self._content_validation(r):
            return None

        # Token extraction: look for  "<name>": "<value>"  with either quote style.
        tokens = {}
        for key in ("nyt_token", "nyt-app-type", "nyt-app-version"):
            pattern = rf'["\']{re.escape(key)}["\']\s*:\s*["\']([^"\']+)["\']'
            match = re.search(pattern, r.text)
            tokens[key] = match.group(1).strip() if match else None
        return tokens

    def _is_valid_response(self, response):
        """Return ``True`` only when the response status is in the 2xx range."""
        if response.status_code == 403:
            # Can try alternate headers here
            return False
        return 200 <= response.status_code < 300

    def _content_validation(self, response):
        """Return ``True`` when the response has a body and is served as HTML."""
        if not response.text:
            return False
        # The media type lives in the Content-Type header *value*; a membership
        # test against ``response.headers`` would only compare header names.
        content_type = response.headers.get('Content-Type', '')
        return 'text/html' in content_type.lower()

    # Update headers dynamically
    def header_update(self):
        """Merge the tokens found by ``_get_tokens`` into the session headers.

        Tokens that were not found (value ``None``) are skipped. Returns
        ``None`` in every case.
        """
        tokens = self._get_tokens()
        if not tokens:
            return None

        found = {key: value for key, value in tokens.items() if value is not None}
        self.session.headers.update(found)
        return None

    # Scrapy config
    def start_requests(self, cursor=None):
        """Yield the first API request if the endpoint answers the probe.

        Args:
            cursor: Optional pagination cursor to start from.
        """
        endpoint = self._request_generator(cursor=cursor)
        if self._check_api_connection(endpoint):
            yield scrapy.Request(
                url=endpoint,
                headers=self.session.headers,
                callback=self.parse,
                # Page counter read back in ``parse`` to enforce pages_to_parse.
                meta={'page': 1},
            )
        else:
            self.logger.error("API Connection Error")

    def _request_generator(self, cursor = None, operation_name: str = 'PersonalizedPackagesQuery') -> str:
        """Build the persisted-query GraphQL URL for one page of the section.

        Args:
            cursor: Pagination cursor (``endCursor`` of the previous page), or
                ``None`` for the first page.
            operation_name: Value sent as the ``operationName`` parameter.

        Returns:
            The fully encoded endpoint URL.
        """
        # NOTE(review): the captured reference request at the end of this file
        # pairs this same sha256Hash with operationName=CollectionsQuery;
        # confirm that the default operation name matches the hash.
        variables = {
            "id": "/section/business/media",
            "first": 10,
            "exclusionMode": "HIGHLIGHTS_AND_EMBEDDED",
            "isFetchMore": False,
            "isTranslatable": False,
            "isEspanol": False,
            "highlightsListUri": "nyt://per/personalized-list/__null__",
            "highlightsListFirst": 0,
            "hasHighlightsList": False,
            "cursor": cursor,
        }
        extensions = {
            "persistedQuery": {
                "version": 1,
                "sha256Hash": "8334262659d77fc2166184bf897e6d139e437af3a9b84d0c020d3dfcb0f177b8",
            }
        }

        # Compact JSON with every reserved character percent-encoded, matching
        # the shape of the captured reference request.
        var_query = quote(json.dumps(variables, separators=(',', ':')), safe='')
        extensions_query = quote(json.dumps(extensions, separators=(',', ':')), safe='')
        name_query = quote(operation_name, safe='')

        return (
            'https://samizdat-graphql.nytimes.com/graphql/v2'
            f'?operationName={name_query}'
            f'&variables={var_query}'
            f'&extensions={extensions_query}'
        )

    def parse(self, response):
        """Yield one item per article, then request the next page if allowed.

        Pagination stops when the response carries no ``endCursor`` or when
        ``pages_to_parse`` pages have been requested.
        """
        data = response.json()

        collection = data['data']['legacyCollection']['collectionsPage']
        stream = collection['stream']

        for edge in stream['edges']:
            node = edge['node']
            yield {
                'headline': node['headline']['default'],  # headline text lives under "default"
                'summary': node['summary'],
                'url': node['url'],
                # GraphQL's type meta-field is spelled ``__typename``; the
                # single-underscore spelling is kept as a fallback.
                # NOTE(review): confirm against a real response.
                'News Source': node.get('__typename', node.get('_typename')),
            }

        # The end cursor of this page is the start cursor of the next one.
        start_cursor = stream['pageInfo']['endCursor']
        # ``response.meta`` is a dict, so the counter must be read with .get().
        current_page = response.meta.get('page', 1)

        if start_cursor and current_page < self.pages_to_parse:
            next_endpoint = self._request_generator(cursor=start_cursor)

            yield scrapy.Request(
                url=next_endpoint,
                headers=self.session.headers,
                callback=self.parse,
                meta={'page': current_page + 1},
            )

    def _check_api_connection(self, url):
        """Probe ``url`` with the blocking session.

        Returns:
            ``True`` for a 2xx answer; ``False`` for any other status or when
            the request itself fails.
        """
        try:
            r = self.session.get(url, timeout=self._REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as e:
            self.logger.error("API Connection Dead! %s", e)
            return False
        return 200 <= r.status_code < 300


class RateLimitManager:
    """Placeholder class: no attributes or methods are defined for it in this file yet."""

In [None]:
def main():
    """Script entry point.

    Placeholder: the crawl start-up has not been written yet, so calling this
    currently does nothing and returns ``None``.
    """
    # TODO: start the crawl for NYTimesSpider here.


# The guard must live at module level; nested inside main() it could never
# trigger the call.
if __name__ == "__main__":
    main()

# Reference request captured for the GraphQL endpoint (note the `operationName` and `extensions` parameter names):
# https://samizdat-graphql.nytimes.com/graphql/v2?operationName=CollectionsQuery&variables=%7B%22id%22%3A%22%2Fsection%2Fbusiness%2Fmedia%22%2C%22first%22%3A10%2C%22exclusionMode%22%3A%22HIGHLIGHTS_AND_EMBEDDED%22%2C%22isFetchMore%22%3Afalse%2C%22isTranslatable%22%3Afalse%2C%22isEspanol%22%3Afalse%2C%22highlightsListUri%22%3A%22nyt%3A%2F%2Fper%2Fpersonalized-list%2F__null__%22%2C%22highlightsListFirst%22%3A0%2C%22hasHighlightsList%22%3Afalse%2C%22cursor%22%3A%22YXJyYXljb25uZWN0aW9uOjIwOQ%3D%3D%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%228334262659d77fc2166184bf897e6d139e437af3a9b84d0c020d3dfcb0f177b8%22%7D%7D