In [1]:
import os
import pandas as pd
import janitor
from IPython.display import display
from inequality.gini import Gini
from scipy.stats import circmean
import warnings

warnings.filterwarnings("ignore")

import numpy as np

# Input files used by this notebook (relative paths).
FP_IND_DOMAIN = "../data/yg_ind_domain.csv.gz"
FP_VISITS_DATA = "../data/6.3m-ind-domain-data.csv.gz"
FP_VT_LABELS = "../data/yg_virustotal_dat.csv"
FP_IND_DEMO = "../data/profile.csv"

# Gather every module-level FP_* constant so newly added paths are checked too.
FILEPATHS = [value for name, value in globals().items() if name.startswith("FP_")]

# Warn (rather than raise) about missing inputs. A plain `if` is used instead of
# assert + except AssertionError: assert statements are removed when Python runs
# with -O, which would silently disable this check.
for file_path in FILEPATHS:
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}.")

In [20]:
# Survey which counters appear under "last_analysis_stats" across all saved
# VT payloads, so we know which label columns can exist.
import json

json_dir = "./payloads_json/"


def collect_stat_keys(directory):
    """Return the union of ``last_analysis_stats`` keys over the JSON files in ``directory``.

    Only files whose name ends in ``.json`` are read (non-recursive). A file that
    cannot be decoded or parsed, or that lacks the
    ``data -> attributes -> last_analysis_stats`` path, is reported on stdout
    and skipped.

    Parameters
    ----------
    directory : str
        Folder containing one JSON payload per file.

    Returns
    -------
    set of str
        Every key seen in any readable file's ``last_analysis_stats`` mapping.

    Raises
    ------
    OSError
        If ``directory`` cannot be listed or a file cannot be opened.
    """
    keys = set()
    # sorted(): os.listdir order is arbitrary; sorting keeps the skip log reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(directory, filename)
        try:
            # Explicit UTF-8: the platform default encoding is locale-dependent.
            with open(filepath, "r", encoding="utf-8") as f:
                # Local name on purpose: the notebook-level `data` variable is
                # inspected by other cells and must not be overwritten here.
                payload = json.load(f)
            keys.update(payload["data"]["attributes"]["last_analysis_stats"].keys())
        except (
            json.JSONDecodeError,
            UnicodeDecodeError,  # e.g. a file truncated in the middle of a multi-byte character
            KeyError,
            TypeError,
            AttributeError,  # last_analysis_stats present but not a mapping
        ) as e:
            print(f"Skipping {filename}: {e}")
    return keys


# Warn instead of raising when the payload folder is absent, matching how
# missing input files are reported elsewhere in this notebook.
if os.path.isdir(json_dir):
    all_keys = collect_stat_keys(json_dir)
else:
    print(f"Directory not found: {json_dir}.")
    all_keys = set()

# Output the result (sorted: set iteration order is not stable between runs)
print("All unique keys in last_analysis_stats across all files:")
print(sorted(all_keys))

Skipping harris-interactive.fr.json: Unterminated string starting at: line 68 column 22 (char 2398)
Skipping contactsdirect.com.json: Unterminated string starting at: line 38 column 30 (char 1197)
Skipping survivalinternational.org.json: Expecting ',' delimiter: line 78 column 13 (char 2623)
Skipping conservativenewsdaily.net.json: Expecting property name enclosed in double quotes: line 41 column 21 (char 1215)
Skipping carsformoms.org.json: Expecting ',' delimiter: line 555 column 15 (char 22355)
Skipping minimalistquotes.com.json: Unterminated string starting at: line 38 column 21 (char 1214)
Skipping porsche.com.json: Expecting property name enclosed in double quotes: line 280 column 2 (char 11606)
Skipping westshorehome.com.json: Expecting property name enclosed in double quotes: line 37 column 3 (char 1215)
Skipping minify.mobi.json: Unterminated string starting at: line 53 column 13 (char 3153)
Skipping shareowneronline.com.json: Unterminated string starting at: line 85 column 13

Skipping minecraften.net.json: Expecting property name enclosed in double quotes: line 37 column 18 (char 1215)
Skipping minecrafteduservices.com.json: Unterminated string starting at: line 37 column 22 (char 1140)
Skipping 5280.com.json: Unterminated string starting at: line 83 column 22 (char 2755)
Skipping myhighperformancecoaching.com.json: Expecting value: line 39 column 29 (char 1215)
Skipping minim.com.json: Unterminated string starting at: line 32 column 30 (char 1140)
Skipping celebdirtylaundry.com.json: Expecting ',' delimiter: line 69 column 45 (char 3156)
Skipping trackyourdividends.com.json: Expecting property name enclosed in double quotes: line 37 column 13 (char 1215)
Skipping calltothepen.com.json: Expecting property name enclosed in double quotes: line 402 column 18 (char 15446)
Skipping aeroflot.ru.json: Unterminated string starting at: line 625 column 17 (char 23762)
Skipping allthatsinteresting.com.json: Expecting property name enclosed in double quotes: line 82 co

In [3]:
# Minimum value of the `malicious` count (apparently the number of engines
# flagging the domain — confirm against the labelling script) for a domain to
# be treated as malicious.
MALICIOUS_MIN_ENGINES = 2

# Load the per-domain labels and derive boolean flags from the detection counts.
# NOTE(review): visit/duration-weighted variants of these flags were previously
# sketched here, but this table has no visits/duration columns; compute them
# after joining with the visits data.
df_vt = pd.read_csv(FP_VT_LABELS).assign(
    # The comparison already yields a boolean column; wrapping it in
    # np.where(..., True, False) added nothing.
    malicious_bool=lambda df_: df_["malicious"] >= MALICIOUS_MIN_ENGINES,
    # `> 0` rather than astype("bool"): a missing count (NaN) casts to True,
    # which would wrongly flag rows without a count as suspicious.
    suspicious_bool=lambda df_: df_["suspicious"] > 0,
)
df_vt

Unnamed: 0,filename,harmless,malicious,suspicious,undetected,timeout,forcepoint,sophos,bitdefender,comodo,alphamnt,malicious_bool,suspicious_bool
0,teatroporno.com,68,0,0,20,0,sex,sexually explicit,porn,,,False,False
1,commissionsoup.com,68,0,0,20,0,financial data and services,,financial,,Business/Economy,False,False
2,filesanywhere.com,67,0,0,20,0,personal network storage and backup,personal network storage,computersandsoftware,,"File Sharing/Storage, Information Technology",False,False
3,lasc.org,67,0,0,20,0,,,,,Government/Legal,False,False
4,faron.com,67,0,0,21,0,,,,,"Business/Economy, Health",False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
64015,givex.com,68,0,0,19,0,,,,,Business/Economy,False,False
64016,diocesemo.org,65,0,0,22,0,widely-known religions,,education,,Religion,False,False
64017,karatemart.com,67,0,0,20,0,,,,,"Shopping, Weapons",False,False
64018,inthegardenradio.com,66,1,0,21,0,shopping,,onlineshop,media sharing,,False,False


In [4]:
# Show only the domains whose malicious_bool flag is set.
df_vt.loc[df_vt["malicious_bool"]]

Unnamed: 0,filename,harmless,malicious,suspicious,undetected,timeout,forcepoint,sophos,bitdefender,comodo,alphamnt,malicious_bool,suspicious_bool
7,spacecloudstore.com,63,2,1,21,0,,,parked,,Unrated,True,True
24,ptistyvymi.com,64,2,0,21,0,,,parked,,,True,False
143,coreopsisnet.ru,59,7,0,23,0,,,,,"Phishing, Scam/Illegal/Unethical",True,False
182,ospotify.com,62,3,1,22,0,,,,,"Email, Productivity Applications, Suspicious",True,True
325,findpaidfocusgroup.com,66,2,0,19,0,pay-to-surf,,business,media sharing,,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
63703,ecstaticlane.com,53,11,0,24,0,,phishing and fraud,parked,,Phishing,True,False
63750,fmuniv.edu,65,2,0,20,0,educational institutions,educational institutions,education,,Education,True,False
63788,crackedmod.com,55,11,0,22,0,,spyware and malware,,,"Hacking, Malicious",True,False
63908,downloadph.com,62,2,2,22,0,compromised websites,spyware and malware,parked,,Suspicious,True,True


In [9]:
# Load one saved payload for manual inspection.
# Explicit UTF-8: JSON files are UTF-8, while open()'s default encoding depends
# on the platform locale.
with open("payloads_json/spacecloudstore.com.json", "r", encoding="utf-8") as f:
    data = json.load(f)
data

{'data': {'attributes': {'last_dns_records': [{'type': 'A',
     'value': '149.248.3.79',
     'ttl': 60},
    {'type': 'NS', 'value': 'ns-240.awsdns-30.com', 'ttl': 21600},
    {'type': 'NS', 'value': 'ns-1912.awsdns-47.co.uk', 'ttl': 21600},
    {'type': 'NS', 'value': 'ns-1293.awsdns-33.org', 'ttl': 21600},
    {'type': 'NS', 'value': 'ns-932.awsdns-52.net', 'ttl': 21600},
    {'rname': 'awsdns-hostmaster.amazon.com',
     'retry': 900,
     'refresh': 7200,
     'minimum': 86400,
     'value': 'ns-1293.awsdns-33.org',
     'expire': 1209600,
     'ttl': 900,
     'serial': 1,
     'type': 'SOA'}],
   'jarm': '3fd3fd0003fd3fd21c3fd3fd3fd3fdc2ddcfd203d071c45b4b0ffe3d7b4b89',
   'whois': 'Administrative city: Phoenix\nAdministrative country: United States\nAdministrative email: 49724bce0f18a05bs@privacyguardian.org\nAdministrative state: AZ\nCreate date: 2022-06-02\nDomain name: spacecloudstore.com\nDomain registrar id: 1479\nDomain registrar url: http://www.namesilo.com\nExpiry date:

In [10]:
# Display the current `data` payload again.
# NOTE(review): execution counts in this notebook are non-sequential, so confirm
# on a fresh Restart & Run All that `data` still holds the intended payload here.
data

{'data': {'attributes': {'last_dns_records': [{'type': 'A',
     'value': '149.248.3.79',
     'ttl': 60},
    {'type': 'NS', 'value': 'ns-240.awsdns-30.com', 'ttl': 21600},
    {'type': 'NS', 'value': 'ns-1912.awsdns-47.co.uk', 'ttl': 21600},
    {'type': 'NS', 'value': 'ns-1293.awsdns-33.org', 'ttl': 21600},
    {'type': 'NS', 'value': 'ns-932.awsdns-52.net', 'ttl': 21600},
    {'rname': 'awsdns-hostmaster.amazon.com',
     'retry': 900,
     'refresh': 7200,
     'minimum': 86400,
     'value': 'ns-1293.awsdns-33.org',
     'expire': 1209600,
     'ttl': 900,
     'serial': 1,
     'type': 'SOA'}],
   'jarm': '3fd3fd0003fd3fd21c3fd3fd3fd3fdc2ddcfd203d071c45b4b0ffe3d7b4b89',
   'whois': 'Administrative city: Phoenix\nAdministrative country: United States\nAdministrative email: 49724bce0f18a05bs@privacyguardian.org\nAdministrative state: AZ\nCreate date: 2022-06-02\nDomain name: spacecloudstore.com\nDomain registrar id: 1479\nDomain registrar url: http://www.namesilo.com\nExpiry date:

In [11]:
# Top level of the payload: a single "data" wrapper key.
data.keys()

dict_keys(['data'])

In [13]:
# Keys one level down, inside the "data" wrapper.
data["data"].keys()

dict_keys(['attributes', 'type', 'id', 'links'])

In [14]:
# The "attributes" mapping holds the domain details (DNS records, whois, analysis results, ...).
data["data"]["attributes"]

{'last_dns_records': [{'type': 'A', 'value': '149.248.3.79', 'ttl': 60},
  {'type': 'NS', 'value': 'ns-240.awsdns-30.com', 'ttl': 21600},
  {'type': 'NS', 'value': 'ns-1912.awsdns-47.co.uk', 'ttl': 21600},
  {'type': 'NS', 'value': 'ns-1293.awsdns-33.org', 'ttl': 21600},
  {'type': 'NS', 'value': 'ns-932.awsdns-52.net', 'ttl': 21600},
  {'rname': 'awsdns-hostmaster.amazon.com',
   'retry': 900,
   'refresh': 7200,
   'minimum': 86400,
   'value': 'ns-1293.awsdns-33.org',
   'expire': 1209600,
   'ttl': 900,
   'serial': 1,
   'type': 'SOA'}],
 'jarm': '3fd3fd0003fd3fd21c3fd3fd3fd3fdc2ddcfd203d071c45b4b0ffe3d7b4b89',
 'whois': 'Administrative city: Phoenix\nAdministrative country: United States\nAdministrative email: 49724bce0f18a05bs@privacyguardian.org\nAdministrative state: AZ\nCreate date: 2022-06-02\nDomain name: spacecloudstore.com\nDomain registrar id: 1479\nDomain registrar url: http://www.namesilo.com\nExpiry date: 2023-06-02\nQuery time: 2022-06-05 16:04:51\nRegistrant address

In [17]:
# Per-engine verdicts: one entry per engine with its category, result, method and engine_name.
data["data"]["attributes"]["last_analysis_results"]

{'Bkav': {'category': 'undetected',
  'result': 'unrated',
  'method': 'blacklist',
  'engine_name': 'Bkav'},
 'CMC Threat Intelligence': {'category': 'harmless',
  'result': 'clean',
  'method': 'blacklist',
  'engine_name': 'CMC Threat Intelligence'},
 'Snort IP sample list': {'category': 'harmless',
  'result': 'clean',
  'method': 'blacklist',
  'engine_name': 'Snort IP sample list'},
 '0xSI_f33d': {'category': 'undetected',
  'result': 'unrated',
  'method': 'blacklist',
  'engine_name': '0xSI_f33d'},
 'ViriBack': {'category': 'harmless',
  'result': 'clean',
  'method': 'blacklist',
  'engine_name': 'ViriBack'},
 'PhishLabs': {'category': 'undetected',
  'result': 'unrated',
  'method': 'blacklist',
  'engine_name': 'PhishLabs'},
 'K7AntiVirus': {'category': 'harmless',
  'result': 'clean',
  'method': 'blacklist',
  'engine_name': 'K7AntiVirus'},
 'CINS Army': {'category': 'harmless',
  'result': 'clean',
  'method': 'blacklist',
  'engine_name': 'CINS Army'},
 'Quttera': {'cate

In [19]:
# Aggregate counts per verdict category (harmless / malicious / suspicious / undetected / timeout);
# these have the same names as the count columns in df_vt.
data["data"]["attributes"]["last_analysis_stats"]

{'harmless': 63,
 'malicious': 2,
 'suspicious': 1,
 'undetected': 21,
 'timeout': 0}