# ADsBot Verification
Use crawl log data to verify crawling of AdsBot using Google IP ranges

In [42]:
import pandas as pd
import requests

## Load Crawl Data

In [52]:
data = pd.read_csv('data/AdsBot_logs.csv')
df = data.copy()
df.head()

Unnamed: 0,userAgent,remoteIp,count
0,AdsBot-Google (+http://www.google.com/adsbot.h...,72.14.199.98,515
1,AdsBot-Google (+http://www.google.com/adsbot.h...,72.14.199.101,510
2,AdsBot-Google (+http://www.google.com/adsbot.h...,72.14.199.99,508
3,AdsBot-Google (+http://www.google.com/adsbot.h...,72.14.199.100,499
4,AdsBot-Google (+http://www.google.com/adsbot.h...,72.14.199.96,494


## Fetch Google IP Ranges

In [44]:
url = 'https://developers.google.com/static/search/apis/ipranges/special-crawlers.json'
response = requests.get(url)

def check_response(response):
    if response.status_code == 200:
        print('Successful Request')
        return response.json()
    else:
        print('Request Failed')

content = check_response(response)

Successful Request


## Extract the IP ranges

In [45]:
# Extract the IP ranges
google_ips = content['prefixes']

# create a list of IP ranges
google_ip_ranges = [ip for ip in google_ips]

# create ip ranges for ipv6Prefix
google_ipv6_ranges = [ip for ip in google_ips if 'ipv6Prefix' in ip]
google_ipv4_ranges = [ip for ip in google_ips if 'ipv4Prefix' in ip]

# create dataframes
google_ipv4_df = pd.DataFrame(google_ipv4_ranges)
google_ipv6_df = pd.DataFrame(google_ipv6_ranges)

In [46]:
import ipaddress

# Function to compute the range
def compute_range(cidr):
    net = ipaddress.ip_network(cidr)
    return str(net[0]), str(net[-1])

# Apply the function to compute start and end of each range
google_ipv4_df['Start IP'], google_ipv4_df['End IP'] = zip(*google_ipv4_df['ipv4Prefix'].apply(compute_range))

google_ipv6_df['Start IP'], google_ipv6_df['End IP'] = zip(*google_ipv6_df['ipv6Prefix'].apply(compute_range))

# combine the dataframes
google_ip_df = google_ipv4_df[['Start IP', 'End IP']]
google_ip_df

Unnamed: 0,Start IP,End IP
0,108.177.2.0,108.177.2.31
1,192.178.17.0,192.178.17.31
2,209.85.238.0,209.85.238.31
3,209.85.238.128,209.85.238.159
4,209.85.238.160,209.85.238.191
...,...,...
99,74.125.218.32,74.125.218.63
100,74.125.218.64,74.125.218.95
101,74.125.218.96,74.125.218.127
102,74.125.219.0,74.125.219.31


## Verify the IP addresses

In [47]:
# Check if an IP address is in the range
def check_ip(ip, start, end):
    ip_int = int(ipaddress.ip_address(ip))
    start_int = int(ipaddress.ip_address(start))
    end_int = int(ipaddress.ip_address(end))
    result = start_int <= ip_int <= end_int
    return result

# Check if the IP address is in the range for the first 4 rows of df
df['is_google'] = False
for df_index, df_row in df.iterrows():  # Loop through the first 4 rows
    is_google = False
    for google_index, google_row in google_ip_df.iterrows():
        if check_ip(df_row['remoteIp'], google_row['Start IP'], google_row['End IP']):
            is_google = True
            break
    df.at[df_index, 'is_google'] = is_google

# print summary
df['is_google'].value_counts(normalize=True)

is_google
True     0.99376
False    0.00624
Name: proportion, dtype: float64

In [48]:
# Summarize the data
df.groupby('is_google').agg({'is_google': 'count',
                             'count': 'sum',
                             }).rename(columns={'is_google': 'count',
                                                'count': 'total_requests',})

Unnamed: 0_level_0,count,total_requests
is_google,Unnamed: 1_level_1,Unnamed: 2_level_1
False,4,5
True,637,16412


In [49]:
# Example: return False rows
df[df['is_google'] == False]

Unnamed: 0,userAgent,remoteIp,count,is_google
519,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,66.102.8.64,2,False
562,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,64.233.172.137,1,False
565,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,64.233.172.135,1,False
582,Mozilla/5.0 (Linux; Android 13; Pixel 7) Apple...,64.233.172.137,1,False


## Export the data

In [50]:
import time 
datetime = time.strftime("%Y%m%d")
df.to_csv(f'export/{datetime}AdsBot_Verification.csv', index=False)
print('Data Exported Successfully!')

Data Exported Successfully
