In [1]:
import pandas as pd
from tqdm import tqdm
from datetime import date

In [2]:
# Input locations, relative to the notebook's working directory:
#   input_file - CSV that is loaded with pd.read_csv further down
#   list_file  - header-less text file whose first column is used as the
#                list of AI bot names
input_file = "../data/june_2025/disallowed_bots.csv"
list_file = "../data/list_ai.txt"

In [3]:
# Load the disallow records; the default 0..n-1 row index is kept.
raw_file = pd.read_csv(input_file)

# Load the AI bot list as a tab-separated file without a header row, so the
# names end up in the column labelled 0.
list_ai = pd.read_csv(list_file, sep='\t', header=None)

In [4]:
# Flag every row whose 'Disallowed Bot' value appears in the AI bot list
# (column 0 of list_ai): 1 = listed AI bot, 0 = anything else.
#
# Series.isin performs the membership test in one vectorized pass and the
# result is assigned by row alignment, so this neither rebuilds the lookup
# list for every row nor relies on the index labels being 0..n-1 (which the
# previous positional `.at[pos, 'ai']` write silently assumed).
raw_file['ai'] = raw_file['Disallowed Bot'].isin(list_ai[0]).astype('int64')

In [5]:
# Number of distinct sites present in the dataset.
total_websites = raw_file['Website'].nunique()

# Number of distinct sites that have at least one row flagged ai == 1.
websites_with_ai = raw_file.loc[raw_file['ai'].eq(1), 'Website'].nunique()

# Report both counts and the share of sites blocking an AI bot.
print(
    f"Total unique websites: {total_websites}",
    f"Websites with ≥1 AI bot disallowed: {websites_with_ai}",
    f"Percentage: {(websites_with_ai / total_websites) * 100:.2f}%",
    sep="\n",
)

Total unique websites: 54
Websites with ≥1 AI bot disallowed: 45
Percentage: 83.33%


In [6]:
# Count how often each flagged (ai == 1) bot name occurs and keep the ten
# most frequent; value_counts already sorts in descending order.
top_ai_bots = (
    raw_file.loc[raw_file['ai'].eq(1), 'Disallowed Bot']
    .value_counts()
    .head(10)
)
print("Top 10 Disallowed AI Bots:\n", top_ai_bots)

Top 10 Disallowed AI Bots:
 Disallowed Bot
GPTBot             41
CCBot              36
Google-Extended    30
anthropic-ai       29
ClaudeBot          27
ChatGPT-User       26
Claude-Web         25
Bytespider         24
cohere-ai          22
PerplexityBot      18
Name: count, dtype: int64


In [7]:
# Count the flagged (ai == 1) rows per site and keep the ten sites with the
# most such rows; value_counts already sorts in descending order.
top_websites_ai = (
    raw_file.loc[raw_file['ai'].eq(1), 'Website']
    .value_counts()
    .head(10)
)
print("Top 10 Websites by AI Bot Disallowances:\n", top_websites_ai)

Top 10 Websites by AI Bot Disallowances:
 Website
nypost.com      23
nytimes.com     23
iltalehti.fi    23
yahoo.com       20
ndr.de          20
aol.com         20
nbcnews.com     20
bbc.com         19
bbc.co.uk       19
lemonde.fr      18
Name: count, dtype: int64


In [8]:
# Every row of raw_file is one disallow record, so the row count is the
# denominator and the sum of the 0/1 'ai' flag is the number of AI records.
total_bots = raw_file.shape[0]
ai_bots = raw_file['ai'].sum()

# Share of flagged rows among all rows, expressed as a percentage.
ai_share = ai_bots / total_bots * 100

print(f"AI bots: {ai_bots}/{total_bots} ({ai_share:.1f}% of all disallowed bots)")

AI bots: 477/1371 (34.8% of all disallowed bots)
