# Do AI Crawlers Respect Robots.txt

## Part 1: Passive Measurement
For this part, we track whether any AI crawler visits any page of our websites (e.g., the homepage) between September 2024 and March 2025.
We set up two websites, both of which block all AI crawlers in their robots.txt (one using `*` and the other naming each crawler individually).
We passively wait for them to visit our websites.
Any visit (except for fetching robots.txt) signals that an AI crawler does not respect robots.txt.
Our website URLs are anonymized.

In [1]:
from imc25_lib import read_log_with_path, parse_log, load_agents_from_file, is_ip_official
from collections import defaultdict
import datetime
import json


# Verbosity switch for Part 1: when True, the loaded agent list and a
# per-website header line are printed while the logs are processed.
PRINT_DEBUG_INFORMATION = True

In [2]:
# Load the AI user agents studied in this paper from ../data/user-agents.txt.
# The loader returns the full list plus four per-category lists (data, search,
# assistant and undocumented crawlers). process_website_logs matches every name
# in ALL_AI_USER_AGENTS against the logged User-Agent header and uses the
# data/search/undocumented lists to decide which agents count as crawlers.
ALL_AI_USER_AGENTS, ai_data_crawler, ai_search_crawler, ai_assistant_crawler, ai_undocumented_crawler = load_agents_from_file('../data/user-agents.txt')
if PRINT_DEBUG_INFORMATION:
    print('AI Agents Loaded', ALL_AI_USER_AGENTS, '\n')

AI Agents Loaded ['Amazonbot', 'AI2Bot', 'anthropic-ai', 'Applebot', 'Applebot-Extended', 'Bytespider', 'CCBot', 'ChatGPT-User', 'Claude-Web', 'ClaudeBot', 'cohere-ai', 'Diffbot', 'FacebookBot', 'Google-Extended', 'GPTBot', 'Kangaroo Bot', 'Meta-ExternalAgent', 'Meta-ExternalFetcher', 'OAI-SearchBot', 'omgili', 'PerplexityBot', 'Timpibot', 'Webzio-Extended', 'YouBot'] 



In [3]:
# Process an apache log file given: its path and the time our robots.txt file started to be effective
# (see process_website_logs below).
# AI_DATA_CRAWLER_IDENTIFIED accumulates, across every website processed, the
# names of the data/search/undocumented AI crawlers that appear in the logs.
AI_DATA_CRAWLER_IDENTIFIED = set()
def process_website_logs(website_log_dir, robots_effective_time):
    """Print, per AI user agent, what one website's access log shows it fetched.

    The log is read with read_log_with_path and each non-empty line is parsed
    with parse_log; lines that do not parse are dropped. Entries older than
    ``robots_effective_time`` are ignored. The remaining entries are grouped by
    every name in ALL_AI_USER_AGENTS that occurs (case-insensitively, as a
    substring) in the logged User-Agent, and for each agent the function prints
    how many requests hit robots.txt versus any other path, followed by the
    other requests (at most 20 of them, evenly spaced, when there are more).

    Args:
        website_log_dir: Path of the access log file. The text after the last
            '/' is used as the website name in the printed header.
        robots_effective_time: Moment the blocking robots.txt became effective,
            in the same format as the log timestamps
            ('%d/%b/%Y:%H:%M:%S %z', e.g. '20/Sep/2024:12:16:00 -0800').

    Side effects:
        Prints the report to stdout and adds every matched agent that belongs
        to the data/search/undocumented crawler lists to the module-level set
        AI_DATA_CRAWLER_IDENTIFIED.

    Raises:
        ValueError: if ``robots_effective_time`` or a log timestamp does not
            match the timestamp format.
    """
    timestamp_format = '%d/%b/%Y:%H:%M:%S %z'
    sample_size = 20

    # Get website name from the filename
    website_name = website_log_dir.split('/')[-1]
    if PRINT_DEBUG_INFORMATION:
        print('\n\n----- Processing Website Logs : {} -----' .format(website_name))

    # Keep only the non-empty lines that parse_log could parse. For each log,
    # parse_log returns a dict with things like ip, path, protocol and user-agent.
    robust_logs = []
    for log in read_log_with_path(website_log_dir):
        log = log.strip()
        if not log:
            continue
        parsed = parse_log(log)
        if parsed:
            robust_logs.append((parsed, log))

    # The deployment time and the lower-cased agent names do not depend on the
    # log entry, so compute them once instead of once per entry.
    deploytime_obj = datetime.datetime.strptime(robots_effective_time, timestamp_format)
    agent_needles = [(ai_agent, ai_agent.lower().strip()) for ai_agent in ALL_AI_USER_AGENTS]

    ### If Each AI User Agent Visits the Website ###
    ##### First, group by AI user agents
    ai_agent_to_logs = defaultdict(list)
    for parsed, raw in robust_logs:
        datetime_obj = datetime.datetime.strptime(parsed['timestamp'], timestamp_format)
        # Ignore anything that happened before robots.txt was deployed
        if datetime_obj < deploytime_obj:
            continue

        # Agent names are matched case-insensitively as substrings, so one
        # entry can be filed under several agents (e.g. an 'Applebot-Extended'
        # User-Agent also contains 'Applebot').
        user_agent = parsed['user_agent'].lower().strip()
        for ai_agent, needle in agent_needles:
            if needle in user_agent:
                ai_agent_to_logs[ai_agent].append((parsed, raw))

    ##### Next, process and print visits per user agent. A parsed entry looks like:
    # {'ip': '34.96.46.4', 'timestamp': '01/Nov/2024:10:51:51 -0700', 'method': 'GET',
    #  'path': '/chatgpt_110111.html', 'protocol': 'HTTP/1.1', 'status': '200',
    #  'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'}
    tracked_crawlers = ai_data_crawler + ai_search_crawler + ai_undocumented_crawler
    for ai_agent, v in ai_agent_to_logs.items():
        # Track which crawlers (AI assistants excluded) visited our website
        if ai_agent.lower() in tracked_crawlers or ai_agent in tracked_crawlers:
            AI_DATA_CRAWLER_IDENTIFIED.add(ai_agent)

        robots_txt_access = []
        other_access = []
        for parsed, _raw in v:
            if 'robots.txt' in parsed['path']:
                robots_txt_access.append(parsed)
            else:
                other_access.append(parsed)
        print("User Agent: {}, Total Access Count: {}, Robots.txt Access Count:{}, Other Access Count:{}".format(ai_agent,len(v), len(robots_txt_access), len(other_access)))

        # Don't print everything if it's too long: take evenly spaced entries
        # and cap them at sample_size. Without the cap, a stride of
        # len // sample_size yields up to 2 * sample_size - 1 rows while the
        # message below announces sample_size.
        if len(other_access) > sample_size:
            step = len(other_access) // sample_size
            other_access_print = other_access[::step][:sample_size]
            print('\t[INFO] Sampling {} Access out of {} accesses'.format(sample_size, len(other_access)))
        else:
            other_access_print = other_access

        for parsed in other_access_print:
            # A request whose IP is_ip_official does not accept for this agent
            # is reported as a spoofed User-Agent.
            if is_ip_official(parsed['ip'], ai_agent):
                print('\tIP {}\taccessed path "{}"\tUA (short): {}\tTimestamp: {}'.format(parsed['ip'], parsed['path'], ai_agent, parsed['timestamp']))
            else:
                print('\t[Spoofed UA] IP {}\taccessed path "{}"\tUA (short): {}\tTimestamp: {}'.format(parsed['ip'], parsed['path'], ai_agent, parsed['timestamp']))
        print()
    print('------------------------\n\n\n')


## Read the access logs of website A and website B (website names anonymized).
## The second argument is the time the blocking robots.txt became effective on
## that site, written in the log timestamp format; earlier entries are ignored.
WEBSITE_A_LOG_DIR = '../data/access_websitea-com.log'
process_website_logs(WEBSITE_A_LOG_DIR, '20/Sep/2024:0:0:00 -0800')

WEBSITE_B_LOG_DIR = '../data/access_websiteb-com.log'
process_website_logs(WEBSITE_B_LOG_DIR, '20/Sep/2024:12:16:00 -0800')
# Summary over both sites: every crawler name collected by process_website_logs.
# NOTE(review): the message below spells "Assitants"; it is a runtime string and
# is left unchanged here.
print('\nAI Agents that Visited Our Websites (excluding AI Assitants): {}'.format(sorted(AI_DATA_CRAWLER_IDENTIFIED)))




----- Processing Website Logs : access_websitea-com.log -----
User Agent: GPTBot, Total Access Count: 163, Robots.txt Access Count:160, Other Access Count:3
	[Spoofed UA] IP 64.226.111.159	accessed path "/"	UA (short): GPTBot	Timestamp: 12/Nov/2024:19:00:34 -0800
	[Spoofed UA] IP 162.158.90.101	accessed path "/"	UA (short): GPTBot	Timestamp: 19/Jan/2025:17:15:56 -0800
	[Spoofed UA] IP 138.197.150.3	accessed path "/"	UA (short): GPTBot	Timestamp: 19/Jan/2025:17:38:25 -0800

User Agent: Amazonbot, Total Access Count: 10, Robots.txt Access Count:10, Other Access Count:0

User Agent: Meta-ExternalAgent, Total Access Count: 13, Robots.txt Access Count:13, Other Access Count:0

User Agent: CCBot, Total Access Count: 12, Robots.txt Access Count:12, Other Access Count:0

User Agent: Applebot, Total Access Count: 19, Robots.txt Access Count:18, Other Access Count:1
	[Spoofed UA] IP 78.153.140.218	accessed path "/infos/"	UA (short): Applebot	Timestamp: 13/Dec/2024:15:27:18 -0800

User Agent: C

## Part 2: Active Measurement
For this part, we track if any AI crawler respects robots.txt by requesting them to visit our website (website-c). Each crawler is given a unique URL for the purpose of identification.
This website (website-c) is distinct from the two websites above in that it has a robots.txt but allows all crawlers. 
As we show later, most of the crawlers do not fetch robots.txt and hence do not respect robots.txt. For the few that do fetch robots.txt (e.g., ChatGPT-User), testing whether they respect robots.txt is trivial and is not shown here. The data shown below focuses exclusively on third-party GPT Apps (i.e., they visit our website through a third-party infrastructure).

In [4]:
import json
from imc25_lib import read_log_with_path, parse_log, is_ip_in_openai, is_ua_from_openai, domain_grouping_given_domain_to_ip_dict, generate_ip_to_group_and_domain_to_group
import ipwhois
import time
import os
import re
from collections import defaultdict

# Input files for Part 2. LIST_OF_APPS_WITH_ACTION_FILE is read by
# load_list_of_apps_can_trigger_action.
ALL_GPT_APPS_BY_RANK_FILE = '../data/gpt_list.json'
LIST_OF_APPS_WITH_ACTION_FILE = '../data/apps-with-action.json'
# Matches dotted host names made of lower-case letters, digits and hyphens.
# No re.IGNORECASE flag is set, so upper-case characters are not matched.
DOMAIN_REGEX = r'(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z0-9][a-z0-9-]{0,61}[a-z0-9]'
# The name is rebound from the pattern string to the compiled pattern object.
DOMAIN_REGEX = re.compile(DOMAIN_REGEX)

PRINT_VALIDATION = False

# Used to identify unique experiment ID for each GPT app.
# Each value is the prefix of an experiment ID; map_exp_with_domain_contacted
# builds IDs as <date digits> + '1' + <rank> (presumably month and day of the
# experiment run -- confirm against the action logs).
UNIQUE_VISIT_IDENTIFIER = ['10291', '10301', '10311', '11011']
# Path fragments ("chatgpt_<prefix>") that mark a request as coming from a GPT app.
UNIQUE_VISIT_IDENTIFIER_FORMATTED = ["chatgpt_{}".format(i) for i in UNIQUE_VISIT_IDENTIFIER]

# Loads the list of apps that might be able to visit a page (false positives
# are possible). Every app listed could at least be made to trigger an action
# with the two prompts defined in the paper; whether it then actually visits
# the page is a separate question.
def load_list_of_apps_can_trigger_action():
    """Return the parsed JSON content of LIST_OF_APPS_WITH_ACTION_FILE."""
    with open(LIST_OF_APPS_WITH_ACTION_FILE, 'r') as apps_file:
        return json.load(apps_file)

def map_exp_with_domain_contacted(all_actionable_gpts_objs, action_logs_path='../data/apps-action-logs.json'):
    """Map each experiment ID to the domain the GPT app asked to talk to.

    Args:
        all_actionable_gpts_objs: Iterable of dicts with the keys 'test_page'
            and 'rank_among_actionable_gpts'.
        action_logs_path: JSON file holding a list of action-log dicts.
            Defaults to the path that was previously hard-coded.

    Returns:
        dict mapping experiment ID -> first domain found by DOMAIN_REGEX in the
        entry's 'gpt_app_approval_talking_to_bar' text. The ID is
        characters 5-9 of the entry's 'time' with '-' removed, then '1', then
        the app's rank. Entries without that key, or without a domain in it,
        are skipped. If several entries produce the same ID, the last one wins.

    Raises:
        KeyError: if an entry's 'test_page' is not among
            ``all_actionable_gpts_objs``.
    """
    with open(action_logs_path, 'r') as f:
        logs = json.load(f)

    # map test_page to rank
    map_test_page_to_rank = {
        gpt['test_page']: gpt['rank_among_actionable_gpts']
        for gpt in all_actionable_gpts_objs
    }

    map_exp_id_to_domain = {}
    for log in logs:
        if 'gpt_app_approval_talking_to_bar' not in log:
            continue

        domains = DOMAIN_REGEX.findall(log['gpt_app_approval_talking_to_bar'])
        if not domains:
            continue

        rank = map_test_page_to_rank[log['test_page']]
        # 'time'[5:10] is assumed to be the 'MM-DD' part of an ISO-style
        # timestamp -- TODO confirm against the log producer.
        current_date = log['time'][5:10].replace('-', '')
        exp_id = "{}1{}".format(current_date, rank)
        map_exp_id_to_domain[exp_id] = domains[0]
    return map_exp_id_to_domain
        
    

# The actual prompting step is skipped in this notebook (it is done by the
# crawler-related code); this function only prints what was sent.
def send_visit_requests():
    """Print the two prompts used to ask each actionable GPT app to visit our page."""
    banner_lines = (
        '\n[INFO] For each actionable GPT app, we send them requests to',
        '[INFO] visit our page using the following two prompts.',
        '[INFO] Prompt 1: Get web page content: [URL].',
        '[INFO] Prompt 2: Start action, fetch page: [URL].\n',
    )
    for banner_line in banner_lines:
        print(banner_line)





# This function processes the server logs.
# GPT Apps are prompted to visit page: https://<website-c>/chatgpt_{10291/10301}{their-relative-rank}.html
# We requested a one-time visit from Meta for page: https://<website-c>/additional_art.html
# (the domain is redacted here to match the anonymized data files).
def identify_unique_gpt_provider_using_server_logs(all_actionable_gpts_objs, exp_id_to_domain, server_file_path = "../data/access_websitec-com.log"):
    """Group the GPT-app backends seen in the server log into providers.

    Every parsed log entry whose path contains one of
    UNIQUE_VISIT_IDENTIFIER_FORMATTED is treated as a visit triggered by a GPT
    app. Entries for which both is_ua_from_openai and is_ip_in_openai hold are
    skipped. For the rest, the experiment ID is taken from the path, looked up
    in ``exp_id_to_domain``, and the client IP is recorded under that domain.
    The domain -> IPs mapping is then merged into provider groups by
    domain_grouping_given_domain_to_ip_dict, and the groups are printed.

    Args:
        all_actionable_gpts_objs: Not used by this function; kept so existing
            callers keep working.
        exp_id_to_domain: dict mapping experiment ID -> domain contacted, as
            produced by map_exp_with_domain_contacted.
        server_file_path: Access log of the test website.

    Returns:
        The mapping returned by domain_grouping_given_domain_to_ip_dict; it is
        iterated here as (group of domains) -> IPs.

    Raises:
        KeyError: if a matching path carries an experiment ID that is missing
            from ``exp_id_to_domain``.
    """
    # Read server logs
    server_logs = read_log_with_path(server_file_path)

    domain_to_ip_assciated = defaultdict(set)
    for raw_log in server_logs:
        parsed = parse_log(raw_log)
        if parsed is None:
            continue
        path = parsed['path']
        # this signals that it is a visit from a gpt app
        if not any(identifier in path for identifier in UNIQUE_VISIT_IDENTIFIER_FORMATTED):
            continue
        if is_ua_from_openai(parsed) and is_ip_in_openai(parsed['ip']):
            continue

        # example path would be: /chatgpt_1029115.html; where 15 is the rank
        exp_id = path.split('_')[-1].replace('.html', '').replace('/robots.txt', '').replace('&page=1', '')
        domain_contacted = exp_id_to_domain[exp_id]
        parsed['domain_contacted'] = domain_contacted
        domain_to_ip_assciated[domain_contacted].add(parsed['ip'])

    provider_domains_to_ips = domain_grouping_given_domain_to_ip_dict(domain_to_ip_assciated)
    print('Total Unique GPT Providers: ', len(provider_domains_to_ips), '\n')
    print('--- [INFO] Unique GPT Providers and Their Infra ---')
    # For each group, print all domains and 3 IPs
    for idx, (group, ips) in enumerate(provider_domains_to_ips.items()):
        print('Group: {}\n\tDomains: {}\n\tIPs: {}\n'.format(idx, group, tuple(ips)[:3]))
    print('---\n\n')
    # Get the infrastructure associated with each group
    return provider_domains_to_ips
            

def is_ip_associated_with_a_gpt_provider(ip, ip_to_provider_domains):
    """Return True when the lower-cased ``ip`` is a key of ``ip_to_provider_domains``."""
    return ip.lower() in ip_to_provider_domains

def is_domain_name_associated_with_a_gpt_provider(domain, merged_mapping):
    """Return True when the lower-cased ``domain`` is contained in any group key of ``merged_mapping``."""
    return any(domain.lower() in group for group, _ips in merged_mapping.items())

def _gpt_log_chronological_key(log):
    """Sort key that orders parsed log entries by their actual point in time.

    Timestamps look like '01/Nov/2024:10:51:51 -0700'; comparing them as plain
    strings orders by day-of-month first and is not chronological. Entries
    whose timestamp does not match that format sort after all others and keep
    their relative order.
    """
    import datetime as _datetime  # local import: this cell does not import datetime itself
    try:
        return (0, _datetime.datetime.strptime(log['timestamp'], '%d/%b/%Y:%H:%M:%S %z'))
    except ValueError:
        return (1, _datetime.datetime.max.replace(tzinfo=_datetime.timezone.utc))


def does_gpt_group_respect_robots_txt(all_actionable_gpts_objs, provider_domains_to_ips, server_file_path = "../data/access_websitec-com.log"):
    """Print, per GPT provider group, whether its infrastructure fetched robots.txt.

    Every parsed log entry whose path contains one of
    UNIQUE_VISIT_IDENTIFIER_FORMATTED is treated as a visit triggered by a GPT
    app (entries for which both is_ua_from_openai and is_ip_in_openai hold are
    skipped) and is filed under the provider group of its client IP. The up to
    three log entries immediately preceding such a visit are also inspected:
    a robots.txt request from an IP of the same provider group is added to
    that group's logs. For every group the function prints the number of
    collected entries, whether any of them requested robots.txt, and the five
    earliest entries.

    Args:
        all_actionable_gpts_objs: Not used by this function; kept so existing
            callers keep working.
        provider_domains_to_ips: Provider grouping as returned by
            identify_unique_gpt_provider_using_server_logs.
        server_file_path: Access log of the test website.

    Returns:
        None. The result is only printed.

    Raises:
        KeyError: if a GPT-app visit comes from an IP that is not part of
            ``provider_domains_to_ips``.
    """
    ip_to_provider_domains, _domain_to_provider_domains = generate_ip_to_group_and_domain_to_group(provider_domains_to_ips)

    # Read and parse server logs
    server_logs = read_log_with_path(server_file_path)
    parsed_server_logs = [parse_log(log) for log in server_logs]

    lookback = 3
    gpt_group_to_logs = defaultdict(list)

    for idx, parsed in enumerate(parsed_server_logs):
        if parsed is None:
            continue

        # this signals that it is a visit from a gpt app
        if not any(identifier in parsed['path'] for identifier in UNIQUE_VISIT_IDENTIFIER_FORMATTED):
            continue
        if is_ua_from_openai(parsed) and is_ip_in_openai(parsed['ip']):
            continue

        group = tuple(sorted(ip_to_provider_domains[parsed['ip'].lower()]))
        gpt_group_to_logs[group].append(parsed)

        # Look back `lookback` logs. The start index is clamped at 0: a
        # negative start would count from the end of the list and make the
        # slice select the wrong (usually empty) window for the first entries.
        for back_track_log in parsed_server_logs[max(0, idx - lookback):idx]:
            if back_track_log is None:
                continue
            # only robots.txt requests are of interest here
            if 'robots.txt' not in back_track_log['path']:
                continue
            if not is_ip_associated_with_a_gpt_provider(back_track_log['ip'], ip_to_provider_domains):
                continue
            this_group = tuple(sorted(ip_to_provider_domains[back_track_log['ip'].lower()]))
            # add the robots.txt request once, and only to the visiting group
            if this_group == group and back_track_log not in gpt_group_to_logs[this_group]:
                gpt_group_to_logs[this_group].append(back_track_log)

    print('\n\n--- [INFO] Third-party Infra Retrieves Robots.txt ---')
    sample_threashold = 5
    for k, v in gpt_group_to_logs.items():
        # sort chronologically (by parsed timestamp, not by the raw string)
        v.sort(key=_gpt_log_chronological_key)
        # compute if robots.txt was retrieved
        is_retrieved_robots_txt = any('robots.txt' in log['path'] for log in v)
        print('Group: {}\tLogs: {}\tIs Robots.txt Retrieved: {}'.format(k, len(v), is_retrieved_robots_txt))
        # print only the first `sample_threashold` logs, each starting with \t
        if len(v) > sample_threashold:
            print('\t[INFO] Sampled {} logs out of {} logs'.format(sample_threashold, len(v)))
        for log in v[:sample_threashold]:
            print('\tIP {}\taccessed path "{}"\tTimestamp: {}\tUA: {}'.format(log['ip'], log['path'], log['timestamp'], log['user_agent']))
        print('')
    print('---\n')
    return

# First, we load a list of apps for which we can trigger an action
# This list of apps is generated by running a crawler that automatically
# interacts with the GPT apps
all_actionable_gpts_objs = load_list_of_apps_can_trigger_action()
print('Total # of Actionable GPTs: ', len(all_actionable_gpts_objs))

# We send them requests to visit our page using two prompts
# (this call only prints the prompts; the requests themselves are not sent here)
send_visit_requests()

# Map each experiment ID to the domain the GPT app asked to talk to
exp_id_to_domain = map_exp_with_domain_contacted(all_actionable_gpts_objs)

# Group the visiting infrastructure into providers using the server logs
provider_domains_to_ips = identify_unique_gpt_provider_using_server_logs(
    all_actionable_gpts_objs,
    exp_id_to_domain
)

# Report, per provider group, whether robots.txt was fetched around its visits
does_gpt_group_respect_robots_txt(
    all_actionable_gpts_objs,
    provider_domains_to_ips, 
)

Total # of Actionable GPTs:  136

[INFO] For each actionable GPT app, we send them requests to
[INFO] visit our page using the following two prompts.
[INFO] Prompt 1: Get web page content: [URL].
[INFO] Prompt 2: Start action, fetch page: [URL].

Total Unique GPT Providers:  23 

--- [INFO] Unique GPT Providers and Their Infra ---
Group: 0
	Domains: ('r.1lm.io',)
	IPs: ('34.96.45.73', '34.96.44.183', '34.96.44.133')

Group: 1
	Domains: ('actions.sider.ai', 'sider.ai')
	IPs: ('44.201.93.99', '44.203.172.68', '3.84.82.109')

Group: 2
	Domains: ('ad.adintelli.ai', 'api.adzedek.com', 'gpts.webpilot.ai')
	IPs: ('192.234.79.193',)

Group: 3
	Domains: ('content-rewriter.orrenprunckun.com', 'copywriter.orrenprunckun.com', 'seo-plugin.orrenprunckun.com', 'summary.orrenprunckun.com')
	IPs: ('162.241.24.83',)

Group: 4
	Domains: ('scraper.gafo.tech',)
	IPs: ('34.96.46.108', '34.96.46.53', '34.96.46.34')

Group: 5
	Domains: ('websearchg.mixerbox.com',)
	IPs: ('198.46.148.245', '149.20.246.34', '34