# Cost of querying the Common Crawl CC-MAIN index with Athena

In [1]:
import subprocess
import datetime

In [2]:
def get_ccmain_batches(years):
    """
    Return list[string] of all CC-Main batches in the given years.

    Lists the Common Crawl columnar index prefix with ``aws s3 ls`` and keeps
    the ``crawl=`` entries whose name carries one of the requested years.

    Parameters
    ----------
    years : iterable of str or int, or a single str/int
        Years to keep, e.g. ``["2020", "2021"]``. Integers are accepted and
        compared by their string form.

    Returns
    -------
    list[str]
        Batch names in listing order, each keeping the trailing slash printed
        by ``aws s3 ls`` (e.g. ``"CC-MAIN-2020-05/"``).

    Raises
    ------
    subprocess.CalledProcessError
        If the ``aws`` command exits with a non-zero status.
    """
    print(f"Fetching CCMAIN URLs for years: {years}")
    listing = subprocess.check_output(["aws", "s3", "ls", "--no-sign-request",
                                       "s3://commoncrawl/cc-index/table/cc-main/warc/"])

    # A lone year would otherwise be matched as a substring (str) or fail to
    # support the membership test at all (int).
    if isinstance(years, (str, int)):
        years = [years]
    wanted_years = {str(year) for year in years}

    # Keep only lines that name a crawl partition; blank lines or any other
    # entry under the prefix are skipped instead of raising IndexError.
    index = [line.split("crawl=")[1].strip()
             for line in listing.decode().splitlines()
             if "crawl=" in line]

    # Subselect based on years provided ("CC-MAIN-2020-05/" -> "2020")
    index_subset = []
    for batch in index:
        name_parts = batch.split('-')
        if len(name_parts) > 2 and name_parts[2] in wanted_years:
            index_subset.append(batch)

    return index_subset

In [3]:
# Restrict the analysis to the crawls published in 2020 and 2021.
batches = get_ccmain_batches(["2020", "2021"])
print("Batches:", batches, sep="\n")

Fetching CCMAIN URLs for years: ['2020', '2021']
Batches:
['CC-MAIN-2020-05/', 'CC-MAIN-2020-10/', 'CC-MAIN-2020-16/', 'CC-MAIN-2020-24/', 'CC-MAIN-2020-29/', 'CC-MAIN-2020-34/', 'CC-MAIN-2020-40/', 'CC-MAIN-2020-45/', 'CC-MAIN-2020-50/', 'CC-MAIN-2021-04/', 'CC-MAIN-2021-10/']


In [4]:
def get_total_size(batch):
    """
    Return the "Total Size: ..." summary line for one CC-MAIN batch.

    Runs ``aws s3 ls --recursive --human-readable --summarize`` on the batch's
    ``subset=warc/`` prefix and extracts the total-size line from the output.

    Parameters
    ----------
    batch : str
        Batch name including its trailing slash, e.g. ``"CC-MAIN-2020-05/"``;
        it is spliced directly in front of ``subset=warc/``.

    Returns
    -------
    str
        The summary line with trailing whitespace removed, e.g.
        ``"Total Size: 230.3 GiB"``. The unit is whatever the AWS CLI printed.

    Raises
    ------
    ValueError
        If the command output contains no "Total Size:" line.
    subprocess.CalledProcessError
        If the ``aws`` command exits with a non-zero status.
    """
    terminal = subprocess.check_output(["aws", "s3", "ls",
                                        f"s3://commoncrawl/cc-index/table/cc-main/warc/crawl={batch}subset=warc/",
                                        "--recursive", "--human-readable", "--summarize", "--no-sign-request"]).decode()
    idx = terminal.find('Total Size:')
    if idx == -1:
        # str.find signals a miss with -1; slicing from -1 would silently
        # return the last character of the output as the "size".
        raise ValueError(f"No 'Total Size:' line in aws s3 ls output for batch {batch!r}")

    # Keep only the remainder of that line, not everything up to end of output.
    total_size_str = terminal[idx:].splitlines()[0].rstrip()

    return total_size_str

In [5]:
# Map each selected batch to its "Total Size: ..." summary string
# (get_total_size is called once per batch, in listing order).
total_sizes = dict(zip(batches, map(get_total_size, batches)))
total_sizes

{'CC-MAIN-2020-05/': 'Total Size: 230.3 GiB',
 'CC-MAIN-2020-10/': 'Total Size: 198.4 GiB',
 'CC-MAIN-2020-16/': 'Total Size: 213.7 GiB',
 'CC-MAIN-2020-24/': 'Total Size: 201.0 GiB',
 'CC-MAIN-2020-29/': 'Total Size: 227.1 GiB',
 'CC-MAIN-2020-34/': 'Total Size: 178.2 GiB',
 'CC-MAIN-2020-40/': 'Total Size: 264.5 GiB',
 'CC-MAIN-2020-45/': 'Total Size: 200.9 GiB',
 'CC-MAIN-2020-50/': 'Total Size: 195.6 GiB',
 'CC-MAIN-2021-04/': 'Total Size: 259.6 GiB',
 'CC-MAIN-2021-10/': 'Total Size: 202.8 GiB'}

In [6]:
# `aws s3 ls --human-readable` picks the unit by magnitude, so convert each
# size through a lookup table instead of assuming every batch is in GiB.
GIB_PER_UNIT = {
    "Byte": 1 / 1024 ** 3,
    "Bytes": 1 / 1024 ** 3,
    "KiB": 1 / 1024 ** 2,
    "MiB": 1 / 1024,
    "GiB": 1,
    "TiB": 1024,
    "PiB": 1024 ** 2,
    "EiB": 1024 ** 3,
}

# Per-batch sizes in GiB, in the same order as `batches`.
# "Total Size: 230.3 GiB" -> ("230.3", "GiB") -> 230.3
total_sizes_float = [
    float(value) * GIB_PER_UNIT[unit]
    for value, unit in (
        total_sizes[batch].replace('Total Size: ', '').split()
        for batch in batches
    )
]
total_GiB = sum(total_sizes_float)
# GiB (2**30 bytes) -> GB (10**9 bytes) -> TB (10**12 bytes)
total_GB = total_GiB * (1024 ** 3 / 1000 ** 3)
total_TB = total_GB / 1000

print(f"Total size: {round(total_GB, 2)} GB or {round(total_TB, 2)} TB")

# Price in USD per TB, taken from https://aws.amazon.com/athena/pricing/
athena_price_per_TB = 5

# Note that this cost only includes the currently released 2021 batches.
# More will be released (thus higher cost) as time goes on, so the figure
# is stamped with the date on which it was computed.
# The cost below is for scanning all of the listed data once.
current_date = datetime.datetime.now().strftime("%d/%m/%Y")
print(f"Cost of querying 2020/2021: ${round(total_TB * athena_price_per_TB, 4)}"
      "\n\n"
      f"as at {current_date}")

Total size: 2547.02 GB or 2.55 TB
Cost of querying 2020/2021: $12.7351

as at 23/04/2021


**Repeat the same calculation for all years.**

In [7]:
# List every crawl partition of the index (no year filter this time).
index = subprocess.check_output(["aws", "s3", "ls", "--no-sign-request",
                                 "s3://commoncrawl/cc-index/table/cc-main/warc/"])
# Skip blank or non-partition lines instead of failing with IndexError on them.
all_batches = [x.split("crawl=")[1] for x in index.decode().splitlines() if "crawl=" in x]
all_total_sizes = {batch: get_total_size(batch) for batch in all_batches}

# `aws s3 ls --human-readable` picks the unit by magnitude, so convert each
# size through a lookup table instead of assuming every batch is in GiB.
GIB_PER_UNIT = {
    "Byte": 1 / 1024 ** 3,
    "Bytes": 1 / 1024 ** 3,
    "KiB": 1 / 1024 ** 2,
    "MiB": 1 / 1024,
    "GiB": 1,
    "TiB": 1024,
    "PiB": 1024 ** 2,
    "EiB": 1024 ** 3,
}

# Per-batch sizes in GiB, in the same order as `all_batches`.
all_total_sizes_float = [
    float(value) * GIB_PER_UNIT[unit]
    for value, unit in (
        all_total_sizes[batch].replace('Total Size: ', '').split()
        for batch in all_batches
    )
]
# GiB (2**30 bytes) -> GB (10**9 bytes) -> TB (10**12 bytes)
all_total_TB = sum(all_total_sizes_float) * (1024 ** 3 / 1000 ** 3) / 1000
# `athena_price_per_TB` and `current_date` come from the 2020/2021 cell above.
# `[:-1]` drops the trailing slash from the first and last batch names.
print(f"FOR ALL YEARS ({all_batches[0][:-1]} till {all_batches[-1][:-1]})\n\n"
      f"Total size: {round(all_total_TB, 2)} TB\n"
      f"Cost of querying all years : ${round(all_total_TB * athena_price_per_TB, 4)}"
      "\n\n"
      f"as at {current_date}")

FOR ALL YEARS (CC-MAIN-2013-20 till CC-MAIN-2021-10)

Total size: 15.34 TB
Cost of querying all years : $76.6985

as at 23/04/2021
