In [None]:
import scrapy
import requests
import asyncio
from scrapy.crawler import Crawler
import json
from urllib.parse import quote
import re

In [None]:
class NetworkManager:
    """Holds default request headers/proxies and builds `requests` sessions from them."""

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        }

        # requests selects a proxy by URL scheme, so this entry is only
        # consulted for http:// URLs; https:// requests do not go through it.
        self.proxies = {'http': "108.161.135.118"}

    def create_session(self) -> "requests.Session":
        """Create a session pre-configured with this manager's proxies and headers.

        Returns:
            A new ``requests.Session`` carrying ``self.proxies`` and ``self.headers``.

        Raises:
            ValueError: if no proxies are configured or ``self.headers`` is None.
        """
        if not self.proxies:
            raise ValueError('no proxy available')
        if self.headers is None:
            raise ValueError('headers are not valid')

        client = requests.Session()
        client.proxies.update(self.proxies)
        client.headers.update(self.headers)
        return client

    def check_status(self, urls: list, timeout: float = 10) -> dict:
        """Request every URL in ``urls`` and report its HTTP status code.

        The previous version returned from inside the loop, so only the first
        URL was ever checked, and it read ``self.session`` although this class
        never created one.

        Args:
            urls: URLs to request, in order.
            timeout: Seconds to wait for each response before giving up.

        Returns:
            A dict mapping each URL to the status code it returned
            (empty when ``urls`` is empty).
        """
        # Reuse a session a subclass/caller already attached; otherwise build
        # one lazily so the method works on a plain NetworkManager too.
        session = getattr(self, 'session', None)
        if session is None:
            session = self.session = self.create_session()

        status_codes = {}
        for url in urls:
            response = session.get(url, timeout=timeout)
            print(f'{url} | Status code: {response.status_code}')
            status_codes[url] = response.status_code
        return status_codes






class NYTimesSpider(scrapy.Spider, NetworkManager):
    """Scrapy spider that pages through the NYTimes business/media section.

    Articles are read from the GraphQL endpoint at
    ``samizdat-graphql.nytimes.com``; each response's ``endCursor`` is fed
    back in as the cursor of the next request.
    """

    # Scrapy configs
    name = "nytimesspider"
    allowed_domains = ["nytimes.com"]
    start_urls = ["https://nytimes.com/section/business/media"]

    # Header names whose values are scraped from the homepage HTML.
    TOKEN_HEADER_NAMES = ("nyt-token", "nyt-app-type", "nyt-app-version")
    # Seconds to wait for the blocking `requests` call in _get_tokens.
    REQUEST_TIMEOUT = 10

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.config = NetworkManager()
        self.session: requests.Session = self.config.create_session()

        # Copy the defaults from the config object so self.headers/self.proxies
        # exist regardless of whether Spider.__init__ chains to
        # NetworkManager.__init__.
        self.headers = dict(self.config.headers)
        self.proxies = dict(self.config.proxies)

    def _get_tokens(self):
        """Fetch the homepage and extract the nyt-* header values from its HTML.

        Returns:
            A dict keyed by each name in ``TOKEN_HEADER_NAMES`` whose value is
            the extracted string, or None when that name was not found.
            Returns None when the request fails or the response is not a
            usable HTML page.
        """
        try:
            r = self.session.get(url="https://nytimes.com", timeout=self.REQUEST_TIMEOUT)
            r.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Logger.log() requires a level argument; use a level method instead.
            self.logger.error(f"Request failed: {e}")
            return None

        if not self._is_valid_response(r):
            return None

        if not self._content_validation(r):
            return None

        # Token extraction: look for  "<name>": "<value>"  with either quote style.
        tokens = {}
        for header_name in self.TOKEN_HEADER_NAMES:
            pattern = (
                r'["\']' + re.escape(header_name) + r'["\']\s*:\s*["\']([^"\']+)["\']'
            )
            match = re.search(pattern, r.text)
            tokens[header_name] = match.group(1).strip() if match else None

        return tokens

    def _is_valid_response(self, response):
        """Return True only for 2xx responses."""
        if response.status_code == 403:
            # TODO: could retry with alternate headers here.
            return False
        return 200 <= response.status_code < 300

    def _content_validation(self, response):
        """Return True when the response has a body and an HTML Content-Type."""
        if not response.text:
            return False
        # Inspect the Content-Type *value*; `in response.headers` would only
        # test header names.
        content_type = response.headers.get('Content-Type', '')
        return 'text/html' in content_type

    #Update headers dynamically
    def header_update(self):
        """Merge freshly scraped nyt-* tokens into ``self.headers`` and the session.

        Does nothing when the tokens could not be fetched. Always returns None.
        """
        header_extension = self._get_tokens()
        if not header_extension:
            return None

        # Skip names that were not found so no header is set to None.
        found = {key: value for key, value in header_extension.items() if value is not None}
        if not found:
            return None

        # dict.update() returns None, so its result must not be assigned back.
        self.headers.update(found)

        #Update session headers
        self.session.headers.update(self.headers)
        return None

    def _build_request(self, cursor=None):
        """Build the GraphQL request for the page starting at ``cursor``."""
        return scrapy.Request(
            url=self._request_generator(cursor),
            headers=dict(self.session.headers),
            callback=self.parse,
        )

    #Scrapy config
    def start_requests(self, cursor=None):
        """Yield the first GraphQL request (optionally starting at ``cursor``)."""
        # NOTE(review): header_update() is never invoked, so the nyt-* token
        # headers are not sent; confirm whether the endpoint requires them.
        yield self._build_request(cursor)

    def _request_generator(self, cursor: str = None, operation_name: str = 'PersonalizedPackagesQuery') -> str:
        """Build the GraphQL GET URL for one page of the business/media section.

        Args:
            cursor: Pagination cursor; None requests the first page.
            operation_name: Value sent as the ``operationName`` query parameter.

        Returns:
            The fully encoded endpoint URL.
        """
        # NOTE(review): the request captured in this notebook's notes pairs this
        # sha256Hash with operationName=CollectionsQuery; confirm the default
        # operation name is accepted with this persisted query.
        variables = {
            "id": "/section/business/media",
            "first": 10,
            "exclusionMode": "HIGHLIGHTS_AND_EMBEDDED",
            "isFetchMore": False,
            "isTranslatable": False,
            "isEspanol": False,
            "highlightsListUri": "nyt://per/personalized-list/__null__",
            "highlightsListFirst": 0,
            "hasHighlightsList": False,
            "cursor": cursor
        }
        extension = {"persistedQuery": {"version": 1, "sha256Hash": "8334262659d77fc2166184bf897e6d139e437af3a9b84d0c020d3dfcb0f177b8"}}

        #formatting
        var_query = quote(json.dumps(variables))
        extension_query = quote(json.dumps(extension))

        # The parameter names are `operationName` and `extensions`, as in the
        # captured working request.
        api_endpoint = (
            'https://samizdat-graphql.nytimes.com/graphql/v2'
            f'?operationName={operation_name}'
            f'&variables={var_query}'
            f'&extensions={extension_query}'
        )

        return api_endpoint

    def parse(self, response):
        """Yield one item per article edge, then the request for the next page."""
        data = response.json()

        try:
            collection = data['data']['legacyCollection']['collectionsPage']
            stream = collection['stream']
        except (KeyError, TypeError):
            self.logger.error(f"Unexpected response structure from {response.url}")
            return

        for article in stream.get('edges') or []:
            node = article.get('node') or {}
            yield {
                # The headline text lives under the "default" key.
                'headline': (node.get('headline') or {}).get('default'),
                'summary': node.get('summary'),
                'url': node.get('url'),
                # GraphQL's type field is spelled with two leading underscores.
                'News Source': node.get('__typename')
            }

        #Now we need to parse through the pages
        #The end parameter will be the new start parameter
        page_info = stream.get('pageInfo') or {}
        start_cursor = page_info.get('endCursor')
        # presumably pageInfo carries hasNextPage; when absent, fall back to
        # paging for as long as a cursor is returned.
        if start_cursor and page_info.get('hasNextPage', True):
            yield self._make_next_request(start_cursor)

    def _make_next_request(self, new_cursor):
        """Return the request for the page that starts at ``new_cursor``."""
        return self._build_request(new_cursor)
        
        
        

Building an API query:

https://samizdat-graphql.nytimes.com/graphql/v2?
operationName=CollectionsQuery&
variables={
    "id":"/section/business/media",
    "first":10,
    "exclusionMode":"HIGHLIGHTS_AND_EMBEDDED",
    "isFetchMore":false,
    "isTranslatable":false,
    "isEspanol":false,
    "highlightsListUri":"nyt://per/personalized-list/__null__",
    "highlightsListFirst":0,
    "hasHighlightsList":false,
    "cursor":"YXJyYXljb25uZWN0aW9uOjIwOQ=="}
    &extensions=
    {"persistedQuery":{"version":1,"sha256Hash":"8334262659d77fc2166184bf897e6d139e437af3a9b84d0c020d3dfcb0f177b8"}}

https://samizdat-graphql.nytimes.com/graphql/v2?operationName=CollectionsQuery&variables=%7B%22id%22%3A%22%2Fsection%2Fbusiness%2Fmedia%22%2C%22first%22%3A10%2C%22exclusionMode%22%3A%22HIGHLIGHTS_AND_EMBEDDED%22%2C%22isFetchMore%22%3Afalse%2C%22isTranslatable%22%3Afalse%2C%22isEspanol%22%3Afalse%2C%22highlightsListUri%22%3A%22nyt%3A%2F%2Fper%2Fpersonalized-list%2F__null__%22%2C%22highlightsListFirst%22%3A0%2C%22hasHighlightsList%22%3Afalse%2C%22cursor%22%3A%22YXJyYXljb25uZWN0aW9uOjIwOQ%3D%3D%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%228334262659d77fc2166184bf897e6d139e437af3a9b84d0c020d3dfcb0f177b8%22%7D%7D

In [None]:
import requests
from urllib.parse import quote
import json

def scrape_nytimes_with_session(nyt_token=None, timeout=10):
    """Fetch one page of the business/media collection from the GraphQL API.

    Args:
        nyt_token: Value for the ``nyt-token`` request header. When None, the
            token is extracted from the section page fetched in step 1 (the
            function previously read an undefined name here and only worked
            if a ``nyt_token`` global had leaked from another cell).
        timeout: Seconds to wait for each HTTP response.

    Returns:
        The decoded JSON payload on HTTP 200, otherwise None.
    """
    session = requests.Session()

    # Step 1: Get the main page to establish cookies and get the nyt-token
    print("Getting main page to establish session...")
    main_response = session.get("https://www.nytimes.com/section/business/media", timeout=timeout)

    # Step 2: Extract the nyt-token from the page unless the caller supplied one.
    # Assumes the section page embeds the token the same way the homepage
    # does ("nyt-token":"...") - TODO confirm.
    if nyt_token is None:
        token_match = re.search(
            r'["\']nyt-token["\']\s*:\s*["\']([^"\']+)["\']', main_response.text
        )
        if token_match is None:
            print("Error: could not find nyt-token in the main page")
            return None
        nyt_token = token_match.group(1).strip()

    # Step 3: Set up headers exactly like the browser
    headers = {
        "accept": "*/*",
        "accept-language": "en-US,en;q=0.9",
        "content-type": "application/json",
        "nyt-app-type": "project-vi",
        "nyt-app-version": "0.0.5",
        "nyt-token": nyt_token,
        "origin": "https://www.nytimes.com",
        "priority": "u=1, i",
        "referer": "https://www.nytimes.com/",
        "sec-ch-ua": '"Google Chrome";v="134", "Chromium";v="134", "Not_A Brand";v="24"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"macOS"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-site",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.6998.205 Safari/537.36",
        "x-nyt-internal-meter-override": "undefined"
    }

    # Step 4: Make the GraphQL request
    url = "https://samizdat-graphql.nytimes.com/graphql/v2?operationName=CollectionsQuery&variables=%7B%22id%22%3A%22%2Fsection%2Fbusiness%2Fmedia%22%2C%22first%22%3A10%2C%22exclusionMode%22%3A%22HIGHLIGHTS_AND_EMBEDDED%22%2C%22isFetchMore%22%3Afalse%2C%22isTranslatable%22%3Afalse%2C%22isEspanol%22%3Afalse%2C%22highlightsListUri%22%3A%22nyt%3A%2F%2Fper%2Fpersonalized-list%2F__null__%22%2C%22highlightsListFirst%22%3A0%2C%22hasHighlightsList%22%3Afalse%2C%22cursor%22%3A%22YXJyYXljb25uZWN0aW9uOjk%3D%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%228334262659d77fc2166184bf897e6d139e437af3a9b84d0c020d3dfcb0f177b8%22%7D%7D"

    response = session.get(url, headers=headers, timeout=timeout)
    print(f"Status: {response.status_code}")

    if response.status_code != 200:
        print(f"Error: {response.text}")
        return None

    try:
        return response.json()
    except ValueError:
        # A 200 with a non-JSON body (e.g. an HTML challenge page).
        print(f"Error: {response.text}")
        return None

# Smoke test: run the scraper once and preview the start of the payload.
result = scrape_nytimes_with_session()
if result:
    preview = json.dumps(result, indent=2)[:500]
    print("Success!")
    print(preview)

Getting main page to establish session...
Status: 200
Success!
{
  "data": {
    "legacyCollection": {
      "__typename": "LegacyCollection",
      "active": true,
      "adTargetingParams": [
        {
          "__typename": "AdTargetingParam",
          "key": "tt",
          "value": ""
        },
        {
          "__typename": "AdTargetingParam",
          "key": "is_viral",
          "value": ""
        },
        {
          "__typename": "AdTargetingParam",
          "key": "prop",
          "value": "nyt"
        },
        {
          "__typen


In [None]:
import re



#This url is an api endpoint
url = "https://samizdat-graphql.nytimes.com/graphql/v2?operationName=CollectionsQuery&variables=%7B%22id%22%3A%22%2Fsection%2Fbusiness%2Fmedia%22%2C%22first%22%3A10%2C%22exclusionMode%22%3A%22HIGHLIGHTS_AND_EMBEDDED%22%2C%22isFetchMore%22%3Afalse%2C%22isTranslatable%22%3Afalse%2C%22isEspanol%22%3Afalse%2C%22highlightsListUri%22%3A%22nyt%3A%2F%2Fper%2Fpersonalized-list%2F__null__%22%2C%22highlightsListFirst%22%3A0%2C%22hasHighlightsList%22%3Afalse%2C%22cursor%22%3A%22YXJyYXljb25uZWN0aW9uOjI5%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%228334262659d77fc2166184bf897e6d139e437af3a9b84d0c020d3dfcb0f177b8%22%7D%7D"
home_url = "https://nytimes.com"

#Proxies and headers that work for nytimes
# requests selects a proxy by URL scheme, so this "http" entry is only
# consulted for http:// URLs; the https:// request below does not use it.
proxies = {"http": "108.161.135.118"}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",}


session = requests.Session()
session.proxies.update(proxies)
session.headers.update(headers)

# Fetch the homepage; the nyt-* header values are embedded in its HTML.
r = session.get(url=home_url, headers=headers, timeout=10)


#Need to extract the three nyt-* values, written in the page as "<name>":"<value>"
token_search = re.search(r'["\']nyt-token["\']\s*:\s*["\']([^"\']+)["\']', r.text)

app_type_search = re.search(r'["\']nyt-app-type["\']\s*:\s*["\']([^"\']+)["\']', r.text)

app_type_version = re.search(r'["\']nyt-app-version["\']\s*:\s*["\']([^"\']+)["\']', r.text)

print(f"Status: {r.status_code}")
print(f"Response: {r.text[:200]}...")
print()
print()
# re.search returns None when nothing matches; guard before calling .group()
# so a page without the tokens reports "not found" instead of crashing.
print(f'Token search: {token_search.group() if token_search else "not found"}')
print(f'App type: {app_type_search.group() if app_type_search else "not found"}')
print(f'App version: {app_type_version.group() if app_type_version else "not found"}')




Status: 200
Response: <!DOCTYPE html>
<html lang="en" class=" nytapp-vi-homepage "  data-nyt-compute-assignment="fallback" xmlns:og="http://opengraphprotocol.org/schema/">
  <head>
    
    
    <meta charset="utf-8" />
  ...


Token search: "nyt-token":"MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAs+/oUCTBmD/cLdmcecrnBMHiU/pxQCn2DDyaPKUOXxi4p0uUSZQzsuq1pJ1m5z1i0YGPd1U1OeGHAChWtqoxC7bFMCXcwnE1oyui9G1uobgpm1GdhtwkR7ta7akVTcsF8zxiXx7DNXIPd2nIJFH83rmkZueKrC4JVaNzjvD+Z03piLn5bHWU6+w+rA+kyJtGgZNTXKyPh6EC6o5N+rknNMG5+CdTq35p8f99WjFawSvYgP9V64kgckbTbtdJ6YhVP58TnuYgr12urtwnIqWP9KSJ1e5vmgf3tunMqWNm6+AnsqNj8mCLdCuc5cEB74CwUeQcP2HQQmbCddBy2y0mEwIDAQAB"
App type: "nyt-app-type":"project-vi"
App version: "nyt-app-version":"0.0.5"
