In [1]:
import os
import shutil
import json
import time
from pathlib import Path

import pandas as pd
import requests
from dotenv import load_dotenv
from tqdm import tqdm

In [2]:
# Project layout, resolved from the directory the kernel was started in.
NOTEBOOKS_DIR = Path('.').resolve()
PROJECT_DIR = NOTEBOOKS_DIR.parent
DATA_DIR = PROJECT_DIR.joinpath('data')

# Dataset root and the folder holding one report file per sample.
DATASET_DIR = DATA_DIR.joinpath('pe-machine-learning-dataset')
REPORTS_DIR = DATASET_DIR.joinpath('reports')
# Create the reports folder (and any missing ancestors); no error if it exists.
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

In [7]:
# Load environment variables from the project's `.env` file; returns True
# when at least one variable was set.
# NOTE(review): presumably this is where VT1_API / VT2_API come from — confirm.
load_dotenv(dotenv_path=PROJECT_DIR / '.env')

True

In [8]:
# Read the sample index shipped with the dataset and show its (rows, columns).
df = pd.read_csv(DATASET_DIR.joinpath('samples.csv'))
df.shape

(201549, 12)

In [9]:
# Shuffle the rows once with a fixed seed. A single permutation is already
# uniformly random, so repeating it adds nothing, and the seed makes the row
# order (and therefore any positional slice taken later) reproducible after
# Restart & Run All.
df = df.sample(frac=1, random_state=42)

In [10]:
# Collect the sha256 of every sample that already has a saved report.
# Reports are written as `<sha256>.json`, so only `*.json` entries are counted;
# stray files or sub-directories in REPORTS_DIR are not mistaken for reports.
# The explicit dtype keeps the Series well-defined when the folder is empty.
reports = pd.Series(
    sorted(report_file.stem for report_file in REPORTS_DIR.glob('*.json')),
    dtype='object',
)
reports.shape

(40126,)

In [11]:
# Distribution of the `list` column among samples that already have a report.
df.loc[df['sha256'].isin(reports), 'list'].value_counts()

list
Blacklist    22199
Whitelist    17927
Name: count, dtype: int64

In [8]:
# Rows whose sha256 has no report file yet — these are still to be fetched.
hashes_without_reports = df.loc[~df['sha256'].isin(reports)]
hashes_without_reports.shape

(161423, 12)

In [9]:
VT_FILES_URL = 'https://www.virustotal.com/api/v3/files'
VT_REQUEST_TIMEOUT = 60  # seconds before a stalled HTTP request is abandoned
VT_REQUEST_DELAY = 8     # seconds to pause after each request
                         # NOTE(review): presumably chosen to respect the API rate limit — confirm


def fetch_vt_json(url, api_key_env):
    """Fetch one VirusTotal endpoint and return its decoded JSON body.

    Parameters
    ----------
    url : str
        Full endpoint URL to GET.
    api_key_env : str
        Name of the environment variable that holds the API key to send
        in the ``x-apikey`` header.

    Returns
    -------
    The parsed JSON payload of the response.

    Raises
    ------
    RuntimeError
        If the API key variable is unset/empty, or the response status is
        not 200 (the status code, URL and response text are included).
    requests.exceptions.RequestException
        On connection problems or when the timeout expires.
    """
    api_key = os.getenv(api_key_env)
    if not api_key:
        # Fail early with a clear message instead of sending a missing key.
        raise RuntimeError(f'Environment variable {api_key_env} is not set')

    headers = {
        "accept": "application/json",
        "x-apikey": api_key,
    }
    # Without a timeout a hung connection would block the whole run forever.
    response = requests.get(url, headers=headers, timeout=VT_REQUEST_TIMEOUT)
    if response.status_code != 200:
        raise RuntimeError(f'HTTP {response.status_code} for {url}: {response.text}')
    return response.json()


# Download the file report and the behaviour report for every sample that has
# no report on disk yet, and store both in a single `<sha256>.json` file.
for sha256 in tqdm(hashes_without_reports['sha256'], total=len(hashes_without_reports)):
    report = {}

    # Each endpoint is queried with its own API key.
    report['files'] = fetch_vt_json(f'{VT_FILES_URL}/{sha256}', 'VT1_API')
    time.sleep(VT_REQUEST_DELAY)

    report['files_behaviours'] = fetch_vt_json(f'{VT_FILES_URL}/{sha256}/behaviours', 'VT2_API')

    # Write to a temporary name first and rename afterwards, so interrupting
    # the cell can never leave a truncated `<sha256>.json` behind that a later
    # run would count as a finished report.
    report_path = REPORTS_DIR / f'{sha256}.json'
    tmp_path = REPORTS_DIR / f'{sha256}.json.tmp'
    with tmp_path.open('w') as file:
        json.dump(report, file, indent=4)
    tmp_path.replace(report_path)

    time.sleep(VT_REQUEST_DELAY)

437it [2:09:42, 17.81s/it]


KeyboardInterrupt: 

In [19]:
# Take the last 5000 rows of the not-yet-fetched samples as a separate batch.
df_batch = hashes_without_reports.tail(5000)

In [22]:
# Persist the batch as a parquet file in the data directory.
df_batch.to_parquet(DATA_DIR.joinpath('df_batch.parquet'))

In [12]:
# Rows whose sha256 appears among the report files found on disk.
df_with_reports = df.loc[df['sha256'].isin(reports)]

In [13]:
# Show (rows, columns) of the subset of samples that have a report.
df_with_reports.shape

(40126, 12)

In [15]:
# Copy the file of every sample that has a report into its own folder.
src_samples_dir = DATASET_DIR / 'samples'
dst_samples_dir = DATASET_DIR / 'samples_with_reports'
# shutil.copyfile does not create missing directories, so make sure the
# destination exists before copying (no error if it is already there).
dst_samples_dir.mkdir(exist_ok=True, parents=True)

# Only the `id` column is needed: each sample file is named after its id.
for sample_id in tqdm(df_with_reports['id'], total=df_with_reports.shape[0]):
    sample_name = str(sample_id)
    shutil.copyfile(src_samples_dir / sample_name, dst_samples_dir / sample_name)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 40126/40126 [02:39<00:00, 251.97it/s]


In [16]:
# Persist the metadata of all samples that have a report as parquet.
df_with_reports.to_parquet(DATA_DIR.joinpath('dataset_with_reports.parquet'))