In [1]:
import pandas as pd
import requests
import time
from datetime import datetime
import json
import logging
import os

# Progress messages from the query loop are appended to this log file.
logging.basicConfig(filename='10_nbconvert.log', level=logging.INFO)

# API limits noted for this key:
#   Request rate    4 lookups / min
#   Daily quota     500 lookups / day
#   Monthly quota   15.5 K lookups / month
#
# The daily quota is the binding constraint: 86400 s / 500 lookups = 172.8 s
# per lookup, so pausing 200 s after each call stays under it.
sleep = 200  # sleep 200 seconds after each call

headers = {
    "accept": "application/json",
    # The key is shown at https://www.virustotal.com/gui/user/lsys/apikey
    # It is read from the VT_API_KEY environment variable so that it never has
    # to be pasted into (and committed with) the notebook; when the variable
    # is unset this falls back to the empty string.
    "x-apikey": os.environ.get("VT_API_KEY", ""),
}

In [None]:
chunk = "data/chunk0.csv"
df = pd.read_csv(chunk)

# One JSON payload is written per domain; create the folder up front so the
# first successful lookup does not fail on a missing directory.
os.makedirs("payloads_json", exist_ok=True)

# Seconds to wait for the API before giving up on a single request, so a
# stalled connection cannot hang the whole run.
request_timeout = 30

total = len(df)

# The try/finally guarantees the CSV is written back even when the loop is
# interrupted (KeyboardInterrupt, unexpected error), so recorded return codes
# are not lost and a re-run resumes where this one stopped.
try:
    for ix, row in df.iterrows():
        now = datetime.now().strftime("%d %b %Y %H:%M:%S")
        domain = row["private_domain"]
        return_code = row["return_code"]

        ## Move on if already queried
        if return_code == 200:
            logging.info(f"({1+ix}/{total}, {now}): Return code: {int(return_code)} - {domain}")
            continue

        # A payload file on disk also counts as "already queried", whatever
        # the return_code column currently holds.
        payload_path = f"payloads_json/{domain}.json"
        if os.path.exists(payload_path):
            logging.info(f"({1+ix}/{total}, {now}): Return code: {return_code} - {domain}")
            continue

        ## Query
        # https://developers.virustotal.com/reference/domain-info
        vt_url = f"https://www.virustotal.com/api/v3/domains/{domain}"
        try:
            response = requests.get(vt_url, headers=headers, timeout=request_timeout)
        except requests.RequestException as exc:
            # Transient network failure: leave the row untouched so a later
            # run retries it, and still pause before the next lookup.
            logging.error(f"({1+ix}/{total}, {now}): Request failed - {domain}: {exc}")
            time.sleep(sleep)
            continue

        ## Save payload
        return_code = response.status_code
        if return_code == 200:
            with open(payload_path, "wb") as f:
                f.write(response.content)

        ## Update csv to indicate row is already queried
        df.at[ix, "return_code"] = return_code

        logging.info(f"({1+ix}/{total}, {now}): Return code: {return_code} - {domain}")

        time.sleep(sleep)
finally:
    df.to_csv(chunk, index=False)

(1/8009, 28 Apr 2023 12:20:30): Return code: 200 - google.com
(2/8009, 28 Apr 2023 12:20:30): Return code: 200 - facebook.com
(3/8009, 28 Apr 2023 12:20:30): Return code: 200 - yahoo.com
(4/8009, 28 Apr 2023 12:20:30): Return code: 200 - bing.com
(5/8009, 28 Apr 2023 12:20:30): Return code: 200 - youtube.com
(6/8009, 28 Apr 2023 12:20:30): Return code: 200 - amazon.com
(7/8009, 28 Apr 2023 12:20:30): Return code: 200 - twitter.com
(8/8009, 28 Apr 2023 12:20:30): Return code: 200 - decipherinc.com
(9/8009, 28 Apr 2023 12:20:30): Return code: 200 - live.com
(10/8009, 28 Apr 2023 12:20:30): Return code: 200 - reddit.com
(11/8009, 28 Apr 2023 12:20:30): Return code: 200 - instagram.com
(12/8009, 28 Apr 2023 12:20:30): Return code: 200 - aol.com
(13/8009, 28 Apr 2023 12:23:51): Return code: 200 - msn.com
(14/8009, 28 Apr 2023 12:27:11): Return code: 200 - clarity.ms
(15/8009, 28 Apr 2023 12:30:32): Return code: 200 - microsoftonline.com
(16/8009, 28 Apr 2023 12:33:53): Return code: 200 - yo

In [None]:
# Write the in-memory frame back over the chunk CSV, omitting the index column.
df.to_csv(path_or_buf=chunk, index=False)

In [None]:
# Display the rows whose recorded return code is 200.
df.query(expr="return_code==200")