In [12]:
import os
import json
import time
from pathlib import Path

import pandas as pd
import requests
from dotenv import load_dotenv
from tqdm import tqdm

In [13]:
NOTEBOOKS_DIR = Path().resolve()
PROJECT_DIR = NOTEBOOKS_DIR.parent
DATA_DIR = PROJECT_DIR / 'data'

DATASET_DIR = DATA_DIR / 'pe-machine-learning-dataset'
REPORTS_DIR = DATASET_DIR / 'reports'
REPORTS_DIR.mkdir(exist_ok=True, parents=True)

In [14]:
load_dotenv(PROJECT_DIR / '.env')

True

In [15]:
df = pd.read_csv(DATASET_DIR / 'samples.csv')
df.shape

(201549, 12)

In [16]:
for _ in range(10):
    df = df.sample(frac=1)

In [35]:
reports = pd.Series(map(lambda x: x.stem, REPORTS_DIR.glob('*')))
reports.shape

(17151,)

In [36]:
df[df.sha256.isin(reports)].list.value_counts()

list
Blacklist    8923
Whitelist    8228
Name: count, dtype: int64

In [37]:
hashes_without_reports = df[~df.sha256.isin(reports)]
hashes_without_reports.shape

(184398, 12)

In [38]:
for idx, row in tqdm(hashes_without_reports.iterrows()):
    report = {}

    url = f'https://www.virustotal.com/api/v3/files/{row.sha256}'
    headers = {
        "accept": "application/json",
        "x-apikey": os.getenv('VT1_API'),
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        report['files'] = response.json()
    else:
        raise Exception(response.text)

    time.sleep(8)

    url = f'https://www.virustotal.com/api/v3/files/{row.sha256}/behaviours'
    headers = {
        "accept": "application/json",
        "x-apikey": os.getenv('VT2_API'),
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        report['files_behaviours'] = response.json()
    else:
        raise Exception(response.text)

    report_path = REPORTS_DIR / f'{row.sha256}.json'
    with report_path.open('w') as file:
        json.dump(report, file, indent=4)

    time.sleep(8)

47it [13:57, 17.82s/it]


Exception: {
    "error": {
        "message": "Quota exceeded",
        "code": "QuotaExceededError"
    }
}

In [20]:
df_batch = hashes_without_reports.iloc[-5000:]

In [21]:
df_batch.to_parquet(DATA_DIR / 'df_batch.parquet')