In [8]:
import os
import json
import time
from pathlib import Path

import pandas as pd
import requests
from dotenv import load_dotenv
from tqdm import tqdm

In [9]:
# Locate the project relative to the current working directory, which is
# expected to be the notebooks folder sitting directly inside the project root.
NOTEBOOKS_DIR = Path('.').resolve()
PROJECT_DIR = NOTEBOOKS_DIR.parent
DATA_DIR = PROJECT_DIR.joinpath('data')

# Dataset root and the folder where the downloaded reports are kept; the
# reports folder (and any missing parents) is created on first run.
DATASET_DIR = DATA_DIR.joinpath('pe-machine-learning-dataset')
REPORTS_DIR = DATASET_DIR.joinpath('reports')
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

In [10]:
# Load environment variables from the project's .env file into the process
# environment; the call's return value is shown as the cell output.
env_file = PROJECT_DIR / '.env'
load_dotenv(dotenv_path=env_file)

True

In [11]:
# Read the sample index of the dataset and show its (rows, columns) size.
samples_csv = DATASET_DIR / 'samples.csv'
df = pd.read_csv(samples_csv)
df.shape

(201549, 12)

In [12]:
# Collect the stem (file name without its final suffix) of every entry in the
# reports folder; these are compared against the sample hashes afterwards.
report_stems = [entry.stem for entry in REPORTS_DIR.glob('*')]
reports = pd.Series(report_stems)
reports.shape

(3307,)

In [13]:
# Keep only the samples whose sha256 does not match any existing report stem.
has_report = df['sha256'].isin(reports)
hashes_without_reports = df.loc[~has_report]
hashes_without_reports.shape

(198242, 12)

In [None]:
# Download VirusTotal data for every sample that has no report yet.
# For each sha256 two endpoints are queried (the file object and its
# behaviours) and the combined payload is stored as `<sha256>.json` in
# REPORTS_DIR under the keys 'files' and 'files_behaviours'.

VT_FILES_URL = 'https://www.virustotal.com/api/v3/files'
REQUEST_TIMEOUT = 30        # seconds; without a timeout a stalled connection blocks the loop forever
PAUSE_AFTER_FILES = 8       # seconds slept between the two requests of one sample
PAUSE_AFTER_BEHAVIOURS = 7  # seconds slept before moving on to the next sample
# NOTE(review): the pauses presumably keep each API key within its request
# quota (one call per key every 15 s) -- confirm against the account limits.


def _fetch_vt_json(url, api_key):
    """Send a GET request to `url` and return the decoded JSON body.

    Args:
        url: Full endpoint URL.
        api_key: Value sent in the `x-apikey` header.

    Returns:
        The parsed JSON response.

    Raises:
        RuntimeError: If the response status code is not 200. The message
            carries the status code and the raw response text.
    """
    response = requests.get(
        url,
        headers={"accept": "application/json", "x-apikey": api_key},
        timeout=REQUEST_TIMEOUT,
    )
    if response.status_code != 200:
        raise RuntimeError(f'GET {url} returned {response.status_code}: {response.text}')
    return response.json()


# Read the keys once and fail fast: os.environ raises KeyError for a missing
# variable, whereas os.getenv would silently hand None to the request headers.
files_api_key = os.environ['VT7_API']
behaviours_api_key = os.environ['VT8_API']

# Iterating the column (instead of iterrows) gives tqdm a known length, so the
# progress bar shows a total and an ETA.
for sha256 in tqdm(hashes_without_reports['sha256']):
    report_path = REPORTS_DIR / f'{sha256}.json'
    # Skip hashes that already have a report (duplicate rows, or a report that
    # was written after `hashes_without_reports` was computed).
    if report_path.exists():
        continue

    report = {}
    report['files'] = _fetch_vt_json(f'{VT_FILES_URL}/{sha256}', files_api_key)

    time.sleep(PAUSE_AFTER_FILES)

    report['files_behaviours'] = _fetch_vt_json(
        f'{VT_FILES_URL}/{sha256}/behaviours', behaviours_api_key
    )

    # Write to a temporary file and rename it afterwards, so an interrupted
    # run never leaves a truncated `<sha256>.json` that looks like a finished
    # report on the next run.
    tmp_path = REPORTS_DIR / f'{sha256}.json.tmp'
    with tmp_path.open('w') as file:
        json.dump(report, file, indent=4)
    tmp_path.replace(report_path)

    time.sleep(PAUSE_AFTER_BEHAVIOURS)

366it [1:48:51, 17.27s/it]