In [1]:
import requests
import json
import websocket
import threading
import time
import tls_client

In [17]:
class WSJSpider:
    """Client for paging through article listings on the Dow Jones GraphQL gateway.

    Holds a ``requests.Session`` used to POST GraphQL payloads to ``endpoint``
    and provides helpers for probing candidate WebSocket URLs and for picking
    authentication-related headers out of a captured header mapping.
    """

    # Headers sent with every POST unless the caller overrides them.
    # NOTE(review): advertising "br"/"zstd" assumes the local HTTP stack can
    # decode those encodings if the server picks them - confirm.
    DEFAULT_HEADERS = {
        "accept": "*/*",
        "accept-encoding": "gzip, deflate, br, zstd",
        "accept-language": "en-US,en;q=0.9",
        "apollographql-client-name": "WSJ Web",  # Common Apollo client name
        "content-type": "application/json",
        "origin": "https://www.wsj.com",
        "referer": "https://www.wsj.com/",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "cross-site",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 OPR/119.0.0.0"
    }

    def __init__(self, pages_to_parse):
        """Create the spider.

        Args:
            pages_to_parse: Number of consecutive pages requested by
                ``pagination``.
        """
        self.endpoint = "https://shared-data.dowjones.io/gateway/graphql"
        # Placeholders: neither attribute is read by the methods of this class.
        self.ws_endpoint = None
        self.captured_requests = []
        self.session = requests.Session()
        self.pages_to_parse = pages_to_parse

    def discover_websocket_endpoint(self):
        """Return the list of candidate WebSocket URLs to probe."""
        return [
            "wss://shared-data.dowjones.io/gateway/graphql",
            "wss://shared-data.dowjones.io/graphql-ws",
        ]

    def analyze_websocket_connection(self, ws_url):
        """Open ``ws_url`` on a daemon thread and log every event it produces.

        Args:
            ws_url: WebSocket URL to connect to.

        Returns:
            ``(ws, thread)`` for the started connection, or ``(None, None)``
            if creating the app or starting the thread raised.
        """

        def on_message(ws, message):
            print(f"websocket Message: {message}")
            try:
                parsed = json.loads(message)
            except (ValueError, TypeError):
                # Frame is not JSON; the raw message was already printed.
                return
            print(f"Parsed: {json.dumps(parsed, indent=2)}")

        def on_error(ws, error):
            print(f"Websocket Error: {error}") 

        def on_close(ws, close_status_code, close_msg):
            print("WebSocket connection closed")

        def on_open(ws):
            print("Websocket connection opened")
            # Announce the client with an empty "connection_init" message.
            ws.send(json.dumps({"type": "connection_init", "payload": {}}))

        try:
            ws = websocket.WebSocketApp(ws_url,
                                        on_message=on_message,
                                        on_error=on_error,
                                        on_close=on_close,
                                        on_open=on_open)

            # Daemon thread so a lingering socket never blocks interpreter exit.
            ws_thread = threading.Thread(target=ws.run_forever, daemon=True)
            ws_thread.start()
            return ws, ws_thread
        except Exception as e:
            print(f"WebSocket connection failed: {e}")
            return None, None

    def extract_auth_headers(self, browser_headers):
        """Return the entries of ``browser_headers`` whose name looks auth-related.

        A header is kept when its lower-cased name contains one of the known
        patterns as a substring (so ``Set-Cookie`` matches ``cookie``).

        Args:
            browser_headers: Mapping of header name to value.

        Returns:
            A new dict with the matching headers, original names preserved.
        """
        auth_patterns = (
            'authorization',
            'x-api-key',
            'x-auth-token',
            'cookie',
            'x-csrf-token',
        )

        auth_headers = {}
        for pattern in auth_patterns:
            for key, value in browser_headers.items():
                if pattern in key.lower():
                    auth_headers[key] = value
        return auth_headers

    @staticmethod
    def _count_articles(result):
        """Count ``data.articlesByContentType`` entries, treating null/missing as zero."""
        data = result.get('data') or {}
        return len(data.get('articlesByContentType') or [])

    def pagination(self, base_query, start_page=1, delay=4):
        """Fetch ``pages_to_parse`` consecutive pages starting at ``start_page``.

        ``base_query`` is never modified: each request gets its own copy of
        the payload with ``variables.page`` replaced. Fetching stops at the
        first page for which ``capture_post_request`` returns nothing.

        Args:
            base_query: GraphQL payload dict; ``variables.page`` is overridden
                per request when a ``variables`` entry is present.
            start_page: First page number to request.
            delay: Seconds to wait between consecutive requests.

        Returns:
            List of ``{'page', 'data', 'article_count'}`` dicts, one per page
            that returned data.
        """
        results = []
        last_page = start_page + self.pages_to_parse - 1

        for page in range(start_page, last_page + 1):
            print(f"\n--- Testing Page {page} ---")

            # Copy the outer dict AND the variables dict so the caller's
            # query keeps its original page number.
            query_copy = dict(base_query)
            if 'variables' in query_copy:
                query_copy['variables'] = {**query_copy['variables'], 'page': page}

            result = self.capture_post_request(query_copy)
            if not result:
                print(f"Failed to get data for page {page}")
                break

            results.append({
                'page': page,
                'data': result,
                'article_count': self._count_articles(result),
            })

            # Be respectful to the server, but don't stall after the last page.
            if delay and page != last_page:
                time.sleep(delay)

        return results

    def capture_post_request(self, query_payload, headers=None, timeout=30):
        """POST a GraphQL payload to ``endpoint`` and return the decoded JSON.

        Prints the response status and headers as a side effect.

        Args:
            query_payload: JSON-serialisable GraphQL request body.
            headers: Optional headers merged over ``DEFAULT_HEADERS``.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The parsed JSON body on HTTP 200; ``None`` on any other status or
            if the request or JSON decoding raised.
        """
        # Merge into a fresh dict so the class-level defaults stay untouched.
        request_headers = {**self.DEFAULT_HEADERS, **(headers or {})}

        try:
            print("Trying response")
            response = self.session.post(
                self.endpoint,
                headers=request_headers,
                json=query_payload,
                timeout=timeout,
            )

            print(f"POST Response Status: {response.status_code}")
            print("Response Headers:")
            for key, value in response.headers.items():
                print(f"  {key}: {value}")

            if response.status_code == 200:
                return response.json()

            print(f"Error response: {response.text}")
            return None

        except Exception as e:
            print(f"POST request failed: {e}")
            return None
    


In [18]:
def main():
    """Exercise WSJSpider end to end: one query, three paginated pages, then WebSocket probes."""
    spider = WSJSpider(3)

    article_query = """
        query ArticlesByContentType($searchQuery: SearchQuery!, $contentType: [SearchContentType], $page: Int) {
          articlesByContentType(searchQuery: $searchQuery, contentType: $contentType, page: $page) {
            headline {
              text
            }
            publishedDateTimeUtc
            seoPath {
              value
            }
          }
        }
        """

    # Restrict results to WSJ.com articles, newest first.
    product_filter = {"terms": {"key": "Product", "value": ["WSJ.com"]}}
    newest_first = {"key": "LiveDate", "order": "desc"}
    payload = {
        "query": article_query,
        "variables": {
            "contentType": ["ARTICLE"],
            "page": 1,
            "searchQuery": {"and": [product_filter], "sort": [newest_first]},
        },
    }

    print("\n=== Testing GraphQL Query ===")
    first_response = spider.capture_post_request(payload)
    if not first_response:
        print("Query failed to load")
    else:
        print(f"Query Result: {json.dumps(first_response, indent=2)}")

    print("\n=== Testing Load More Pagination ===")
    for entry in spider.pagination(payload):
        print(f"Page {entry['page']}: {entry['article_count']} articles")
        print(entry)

    # Probe each candidate WebSocket URL for five seconds, then hang up.
    for candidate_url in spider.discover_websocket_endpoint():
        print(f"Testing WebSocket: {candidate_url}")
        socket_app, _worker = spider.analyze_websocket_connection(candidate_url)
        if not socket_app:
            continue
        time.sleep(5)
        socket_app.close()

# Entry point: call main() when this code runs with __name__ set to "__main__".
if __name__ == "__main__":
    main()
    


=== Testing GraphQL Query ===
Trying response
POST Response Status: 200
Response Headers:
  Content-Type: application/json
  Transfer-Encoding: chunked
  Connection: keep-alive
  Date: Wed, 23 Jul 2025 17:57:27 GMT
  content-encoding: gzip
  access-control-allow-origin: https://www.wsj.com
  cache-control: max-age=14,public,stale-while-revalidate=14,stale-if-error=14
  apollo-trace-id: 84705987aef4d23a60610aa990ddc925
  strict-transport-security: max-age=31536000; includeSubdomains; preload
  x-content-type-options: nosniff
  vary: origin
  X-Cache: Miss from cloudfront
  Via: 1.1 3368064fd81368cfc0c47fab23e1aa0e.cloudfront.net (CloudFront)
  X-Amz-Cf-Pop: DFW56-P9
  X-Amz-Cf-Id: M63hcmm4RdTL9pTR-372N408wQJ0RLWPmW_hDCgctHsLPwCd2XOJXQ==
Query Result: {
  "data": {
    "articlesByContentType": [
      {
        "headline": {
          "text": "Tech, Media & Telecom Roundup: Market Talk"
        },
        "publishedDateTimeUtc": "2025-07-23T10:28:00Z",
        "seoPath": {
          "va

In [2]:
import requests
import json
import websocket
import threading
import time

class WSJGraphQLAnalyzer:
    """Exploration helper for the Dow Jones GraphQL gateway.

    Holds a ``requests.Session`` and offers methods to send OPTIONS and POST
    requests to ``endpoint``, page through a query, probe candidate WebSocket
    URLs, and pick CORS or auth-related entries out of header mappings.
    """

    # Headers sent with every POST unless the caller overrides them.
    # NOTE(review): advertising "br"/"zstd" assumes the local HTTP stack can
    # decode those encodings if the server picks them - confirm.
    DEFAULT_HEADERS = {
        "accept": "*/*",
        "accept-encoding": "gzip, deflate, br, zstd",
        "accept-language": "en-US,en;q=0.9",
        "apollographql-client-name": "WSJ Web",  # Common Apollo client name
        "content-type": "application/json",
        "origin": "https://www.wsj.com",
        "referer": "https://www.wsj.com/",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "cross-site",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 OPR/119.0.0.0"
    }

    def __init__(self):
        """Set the gateway endpoint and open an HTTP session."""
        self.endpoint = "https://shared-data.dowjones.io/gateway/graphql"
        # Placeholders: neither attribute is read by the methods of this class.
        self.ws_endpoint = None  # We'll need to discover this
        self.captured_requests = []
        self.session = requests.Session()

    def capture_options_request(self, timeout=30):
        """Send an OPTIONS request to ``endpoint`` and print status and headers.

        Args:
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response object, or ``None`` if the request raised.
        """
        try:
            response = self.session.options(self.endpoint, timeout=timeout)
            print("OPTIONS Response:")
            print(f"Status: {response.status_code}")
            print("Headers:")
            for key, value in response.headers.items():
                print(f"  {key}: {value}")
            return response
        except Exception as e:
            print(f"OPTIONS request failed: {e}")
            return None

    def analyze_cors_headers(self, options_response):
        """Return the ``Access-Control-*`` headers of an OPTIONS response.

        Args:
            options_response: Response object exposing ``.headers``, or
                ``None`` when the request failed.

        Returns:
            Dict of headers whose lower-cased name starts with
            ``access-control``; empty when ``options_response`` is ``None``.
        """
        # Compare against None explicitly: a requests.Response is falsy for
        # 4xx/5xx statuses, yet such a response still carries headers.
        if options_response is None:
            return {}

        return {
            key: value
            for key, value in options_response.headers.items()
            if key.lower().startswith('access-control')
        }

    def capture_post_request(self, query_payload, headers=None, timeout=30):
        """POST a GraphQL payload to ``endpoint`` and return the decoded JSON.

        Prints the response status and headers as a side effect.

        Args:
            query_payload: JSON-serialisable GraphQL request body.
            headers: Optional headers merged over ``DEFAULT_HEADERS``.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The parsed JSON body on HTTP 200; ``None`` on any other status or
            if the request or JSON decoding raised.
        """
        # Merge into a fresh dict so the class-level defaults stay untouched.
        request_headers = {**self.DEFAULT_HEADERS, **(headers or {})}

        try:
            response = self.session.post(
                self.endpoint,
                headers=request_headers,
                json=query_payload,
                timeout=timeout,
            )

            print(f"POST Response Status: {response.status_code}")
            print("Response Headers:")
            for key, value in response.headers.items():
                print(f"  {key}: {value}")

            if response.status_code == 200:
                return response.json()

            print(f"Error response: {response.text}")
            return None

        except Exception as e:
            print(f"POST request failed: {e}")
            return None

    @staticmethod
    def _count_articles(result):
        """Count ``data.articlesByContentType`` entries, treating null/missing as zero."""
        data = result.get('data') or {}
        return len(data.get('articlesByContentType') or [])

    def test_load_more_pagination(self, base_query, start_page=1, max_pages=3, delay=4):
        """Test the load more functionality by incrementing page numbers.

        ``base_query`` is never modified: each request gets its own copy of
        the payload with ``variables.page`` replaced. For every page an
        OPTIONS request is sent first, then the POST; fetching stops at the
        first page for which the POST returns nothing.

        Args:
            base_query: GraphQL payload dict; ``variables.page`` is overridden
                per request when a ``variables`` entry is present.
            start_page: First page number to request.
            max_pages: Maximum number of pages to request.
            delay: Seconds to wait between consecutive requests.

        Returns:
            List of ``{'page', 'data', 'article_count'}`` dicts, one per page
            that returned data.
        """
        results = []
        last_page = start_page + max_pages - 1

        for page in range(start_page, last_page + 1):
            print(f"\n--- Testing Page {page} ---")

            # Copy the outer dict AND the variables dict so the caller's
            # query keeps its original page number.
            query_copy = dict(base_query)
            if 'variables' in query_copy:
                query_copy['variables'] = {**query_copy['variables'], 'page': page}

            # First do OPTIONS request (its response is only logged).
            self.capture_options_request()

            # Then do POST request
            result = self.capture_post_request(query_copy)
            if not result:
                print(f"Failed to get data for page {page}")
                break

            results.append({
                'page': page,
                'data': result,
                'article_count': self._count_articles(result),
            })

            # Be respectful to the server, but don't stall after the last page.
            if delay and page != last_page:
                time.sleep(delay)

        return results

    def discover_websocket_endpoint(self):
        """Return the list of candidate WebSocket URLs to probe."""
        # Common WebSocket endpoint patterns for GraphQL
        return [
            "wss://shared-data.dowjones.io/gateway/graphql",
            "wss://shared-data.dowjones.io/graphql-ws",
        ]

    def analyze_websocket_connection(self, ws_url):
        """Open ``ws_url`` on a daemon thread and log every event it produces.

        Args:
            ws_url: WebSocket URL to connect to.

        Returns:
            ``(ws, thread)`` for the started connection, or ``(None, None)``
            if creating the app or starting the thread raised.
        """

        def on_message(ws, message):
            print(f"WebSocket Message: {message}")
            try:
                parsed = json.loads(message)
            except (ValueError, TypeError):
                # Frame is not JSON; the raw message was already printed.
                return
            print(f"Parsed: {json.dumps(parsed, indent=2)}")

        def on_error(ws, error):
            print(f"WebSocket Error: {error}")

        def on_close(ws, close_status_code, close_msg):
            print("WebSocket connection closed")

        def on_open(ws):
            print("WebSocket connection opened")
            # Send connection init message (common for GraphQL subscriptions)
            ws.send(json.dumps({"type": "connection_init", "payload": {}}))

        try:
            ws = websocket.WebSocketApp(ws_url,
                                      on_message=on_message,
                                      on_error=on_error,
                                      on_close=on_close,
                                      on_open=on_open)

            # Run WebSocket in a separate daemon thread so it never blocks exit.
            ws_thread = threading.Thread(target=ws.run_forever, daemon=True)
            ws_thread.start()

            return ws, ws_thread
        except Exception as e:
            print(f"WebSocket connection failed: {e}")
            return None, None

    def extract_auth_headers(self, browser_headers):
        """Return the entries of ``browser_headers`` whose name looks auth-related.

        A header is kept when its lower-cased name contains one of the known
        patterns as a substring (so ``Set-Cookie`` matches ``cookie``).

        Args:
            browser_headers: Mapping of header name to value.

        Returns:
            A new dict with the matching headers, original names preserved.
        """
        # Look for common auth header patterns
        auth_patterns = (
            'authorization',
            'x-api-key',
            'x-auth-token',
            'cookie',
            'x-csrf-token',
        )

        auth_headers = {}
        for pattern in auth_patterns:
            for key, value in browser_headers.items():
                if pattern in key.lower():
                    auth_headers[key] = value
        return auth_headers

# Usage example
def main():
    """Exercise WSJGraphQLAnalyzer: one query, three paginated pages, then WebSocket probes."""
    graphql_client = WSJGraphQLAnalyzer()

    # The ArticlesByContentType query document.
    article_query = """
        query ArticlesByContentType($searchQuery: SearchQuery!, $contentType: [SearchContentType], $page: Int) {
          articlesByContentType(searchQuery: $searchQuery, contentType: $contentType, page: $page) {
            headline {
              text
            }
            publishedDateTimeUtc
            seoPath {
              value
            }
          }
        }
        """

    # Restrict results to WSJ.com articles, newest first.
    product_filter = {"terms": {"key": "Product", "value": ["WSJ.com"]}}
    newest_first = {"key": "LiveDate", "order": "desc"}
    payload = {
        "query": article_query,
        "variables": {
            "contentType": ["ARTICLE"],
            "page": 1,
            "searchQuery": {"and": [product_filter], "sort": [newest_first]},
        },
    }

    print("\n=== Testing GraphQL Query ===")
    # The response is only logged by the client; nothing is done with it here.
    graphql_client.capture_post_request(payload)

    print("\n=== Testing Load More Pagination ===")
    for entry in graphql_client.test_load_more_pagination(payload, 1, 3):
        print(f"Page {entry['page']}: {entry['article_count']} articles")
        print(entry)

    print("\n=== WebSocket Endpoint Discovery ===")
    for candidate_url in graphql_client.discover_websocket_endpoint():
        print(f"Testing WebSocket: {candidate_url}")
        socket_app, _worker = graphql_client.analyze_websocket_connection(candidate_url)
        if not socket_app:
            continue
        time.sleep(5)  # Let it run for a bit
        socket_app.close()

# Entry point: call main() when this code runs with __name__ set to "__main__".
if __name__ == "__main__":
    main()


=== Testing GraphQL Query ===
POST Response Status: 200
Response Headers:
  Content-Type: application/json
  Transfer-Encoding: chunked
  Connection: keep-alive
  Date: Wed, 23 Jul 2025 16:34:08 GMT
  content-encoding: gzip
  access-control-allow-origin: https://www.wsj.com
  cache-control: max-age=30,public,stale-while-revalidate=30,stale-if-error=30
  apollo-trace-id: bef9c63347bb9ffb60cc1b00092f8b13
  strict-transport-security: max-age=31536000; includeSubdomains; preload
  x-content-type-options: nosniff
  vary: origin
  X-Cache: Miss from cloudfront
  Via: 1.1 0484828cb1a561e34a3abb035d623f78.cloudfront.net (CloudFront)
  X-Amz-Cf-Pop: DFW56-P9
  X-Amz-Cf-Id: 7vP_U3tvAaynGUF7CR_ALBakuev9hrvKrOxmR2obp7QxBcZDxNqMbw==

=== Testing Load More Pagination ===

--- Testing Page 1 ---
OPTIONS Response:
Status: 403
Headers:
  Server: CloudFront
  Date: Wed, 23 Jul 2025 16:34:08 GMT
  Content-Type: text/html
  Content-Length: 919
  Connection: keep-alive
  X-Cache: Error from cloudfront
  V