In [1]:
import os
import pandas as pd
import janitor
import warnings
from IPython.display import display

warnings.filterwarnings("ignore")

import numpy as np

FP_VISITS_DATA = "../data/6.3m-ind-domain-data.csv.gz"
FP_VT_LABELS = "../data/yg_virustotal_dat.csv"
FILEPATHS = [value for name, value in globals().items() if name.startswith("FP_")]
for file_path in FILEPATHS:
    try:
        assert os.path.exists(file_path), f"File not found: {file_path}"
    except AssertionError:
        print(f"File not found: {file_path}.")

from tqdm.notebook import tqdm


import time
import random
import requests

In [3]:
df_vt_visits = (
    pd.read_csv(FP_VT_LABELS)
    # ===============================================
    # Defining labels
    .assign(
        malicious_bool=lambda df_: np.where(df_["malicious"] >= 2, True, False),
        suspicious_bool=lambda df_: df_["suspicious"].astype("bool"),
    ).rename_column("filename", "domain")
    # ===============================================
    # Get visit weights
    .merge(
        (
            pd.read_csv(FP_VISITS_DATA)
            .rename_column("private_domain", "domain")
            .groupby("domain", as_index=False)
            .agg(visits=("domain", "size"), visit_duration=("visit_duration", "sum"))
        ),
        how="left",
        on="domain",
        validate="1:1",
    )
    # ===============================================
    # retrieve YG category for completeness
    .merge(
        (
            pd.read_csv(FP_VISITS_DATA)
            .rename_column("private_domain", "domain")
            .groupby(["domain", "category"])
            .size()
            .reset_index(name="visits")
            .sort_values("visits", ascending=False, ignore_index=True)
            .remove_columns("visits")
            .rename_column("category", "yougov")
            .drop_duplicates("domain", keep="first")
        ),
        how="left",
        on="domain",
        validate="1:1",
    )
    # ===============================================
    .reorder_columns(
        [
            "domain",
            "malicious",
            "forcepoint",
            "sophos",
            "bitdefender",
            "comodo",
            "alphamnt",
            "yougov",
        ]
    )
)
display(df_vt_visits)
df_vt_visits.info()

Unnamed: 0,domain,malicious,forcepoint,sophos,bitdefender,comodo,alphamnt,yougov,harmless,suspicious,undetected,timeout,malicious_bool,suspicious_bool,visits,visit_duration
0,teatroporno.com,0,sex,sexually explicit,porn,,,Adult,68,0,20,0,False,False,3,8
1,commissionsoup.com,0,financial data and services,,financial,,Business/Economy,Business,68,0,20,0,False,False,85,248
2,filesanywhere.com,0,personal network storage and backup,personal network storage,computersandsoftware,,"File Sharing/Storage, Information Technology",,67,0,20,0,False,False,10,76
3,lasc.org,0,,,,,Government/Legal,,67,0,20,0,False,False,2,122
4,faron.com,0,,,,,"Business/Economy, Health",,67,0,21,0,False,False,4,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64015,givex.com,0,,,,,Business/Economy,Business,68,0,19,0,False,False,2,156
64016,diocesemo.org,0,widely-known religions,,education,,Religion,,65,0,22,0,False,False,11,782
64017,karatemart.com,0,,,,,"Shopping, Weapons",,67,0,20,0,False,False,2,12
64018,inthegardenradio.com,1,shopping,,onlineshop,media sharing,,,66,0,21,0,False,False,1,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64020 entries, 0 to 64019
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   domain           64020 non-null  object
 1   malicious        64020 non-null  int64 
 2   forcepoint       47931 non-null  object
 3   sophos           27111 non-null  object
 4   bitdefender      43373 non-null  object
 5   comodo           13121 non-null  object
 6   alphamnt         36054 non-null  object
 7   yougov           25611 non-null  object
 8   harmless         64020 non-null  int64 
 9   suspicious       64020 non-null  int64 
 10  undetected       64020 non-null  int64 
 11  timeout          64020 non-null  int64 
 12  malicious_bool   64020 non-null  bool  
 13  suspicious_bool  64020 non-null  bool  
 14  visits           64020 non-null  int64 
 15  visit_duration   64020 non-null  int64 
dtypes: bool(2), int64(7), object(7)
memory usage: 7.0+ MB


In [4]:
domain_list = df_vt_visits["domain"].tolist()

domain_list.append("http://testsafebrowsing.appspot.com/s/malware.html")
domain_list.append("testsafebrowsing.appspot.com/s/malware.html")
domain_list.append("http://testsafebrowsing.appspot.com")
domain_list.append("testsafebrowsing.appspot.com")
domain_list[:10]

['teatroporno.com',
 'commissionsoup.com',
 'filesanywhere.com',
 'lasc.org',
 'faron.com',
 'usjobsource.com',
 'flyelite.com',
 'spacecloudstore.com',
 'witchculttranslation.com',
 'pornamigo.com',
 'nestleusa.com',
 'resellerratings.com',
 'sumday.com',
 'typeit.org',
 'wkyc.com',
 'activeapis.com',
 'mrklaff.com',
 'triofitnesstraining.com',
 'fileformat.info',
 'tolunastart.com',
 'xxxchina.net',
 'livingbode.com',
 'motominer.com',
 'mynorthnews.org',
 'ptistyvymi.com',
 'gobarrybucs.com',
 'sflix.se',
 'nordisdirect.net',
 'naturalwellnessroutine.com',
 'arzelzoning.com',
 'pickalender.com',
 'kathyhochul.com',
 'wavebrowser.co',
 'adultdeals.com',
 'keyboardkraze.com',
 'bark.us',
 'polinews.org',
 'actuallysnake.com',
 'kcsm.org',
 'independent.com',
 'hardreset.info',
 'lasertron.us',
 'smtsmartusa.com',
 'fakeddoors.com',
 'preferredstockchannel.com',
 'paisly.com',
 'betahustler.com.ng',
 'highpointscientific.com',
 'directionsforme.org',
 'nwwahome.com',
 'thetechout.com',

In [2]:
with open("../gsb_api_key.txt", "r") as f:
    API_KEY = f.read().strip()

ENDPOINT_URL = (
    f"https://safebrowsing.googleapis.com/v4/threatMatches:find?key={API_KEY}"
)

In [5]:
def chunks(lst, n):
    """Yield successive n-sized chunks."""
    for i in range(0, len(lst), n):
        yield lst[i : i + n]


all_results = []

# for batch in chunks(domain_list, 500):
for batch in tqdm(
    chunks(domain_list, 500),
    total=(len(domain_list) + 499) // 500,
    desc="Checking batches",
):
    # ------------------------------------------------------------
    # init payload
    payload = {
        "client": {"clientId": "xx", "clientVersion": "0"},
        "threatInfo": {
            "threatTypes": [
                "MALWARE",
                "SOCIAL_ENGINEERING",
                "POTENTIALLY_HARMFUL_APPLICATION",
                "UNWANTED_SOFTWARE",
            ],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [{"url": url} for url in batch],
        },
    }

    response = requests.post(ENDPOINT_URL, json=payload)

    # ------------------------------------------------------------
    # post
    try:
        response = requests.post(ENDPOINT_URL, json=payload, timeout=10)

        if response.status_code == 200:
            result = response.json()

            if "matches" in result:
                for match in result["matches"]:
                    all_results.append(
                        {
                            "domain": match["threat"]["url"],
                            "threatType": match["threatType"],
                            "platformType": match["platformType"],
                        }
                    )
            else:
                for url in batch:
                    all_results.append(
                        {"domain": url, "threatType": None, "platformType": None}
                    )
        else:
            print(f"Error {response.status_code}: {response.text}")

    except Exception as e:
        print(f"Request failed: {e}")

    time.sleep(random.uniform(0.1, 1.5))

df_results = pd.DataFrame(all_results)
df_results.to_csv("..data/gsb_results_domains.csv", index=False)

Checking batches:   0%|          | 0/129 [00:00<?, ?it/s]

In [9]:
df_results

Unnamed: 0,domain,threatType,platformType
0,teatroporno.com,,
1,commissionsoup.com,,
2,filesanywhere.com,,
3,lasc.org,,
4,faron.com,,
...,...,...,...
53521,mhanational.org,,
53522,trendpornvids.com,,
53523,lmclakers.org,,
53524,http://testsafebrowsing.appspot.com/s/malware....,MALWARE,ANY_PLATFORM


In [10]:
df_results.query("threatType==threatType").reset_index()

Unnamed: 0,index,domain,threatType,platformType
0,2500,people-wet.com,SOCIAL_ENGINEERING,ANY_PLATFORM
1,7501,alertsx.com,SOCIAL_ENGINEERING,ANY_PLATFORM
2,7502,card.co,SOCIAL_ENGINEERING,ANY_PLATFORM
3,17503,22-now.com,SOCIAL_ENGINEERING,ANY_PLATFORM
4,18504,antezuumer.com,SOCIAL_ENGINEERING,ANY_PLATFORM
5,22505,prelandappslab.com,SOCIAL_ENGINEERING,ANY_PLATFORM
6,27006,latenightlovers.com,SOCIAL_ENGINEERING,ANY_PLATFORM
7,28007,join-the-update.com,SOCIAL_ENGINEERING,ANY_PLATFORM
8,28508,goforandroid.com,UNWANTED_SOFTWARE,ANY_PLATFORM
9,30509,dariwholesales.com,SOCIAL_ENGINEERING,ANY_PLATFORM


In [12]:
df_results.query("threatType==threatType").merge(df_vt_visits, on="domain", how="left")

Unnamed: 0,domain,threatType,platformType,malicious,forcepoint,sophos,bitdefender,comodo,alphamnt,yougov,harmless,suspicious,undetected,timeout,malicious_bool,suspicious_bool,visits,visit_duration
0,people-wet.com,SOCIAL_ENGINEERING,ANY_PLATFORM,0.0,adult content,sexually explicit,onlinedating,,,"Adult, Business",68.0,0.0,20.0,0.0,False,False,4.0,82.0
1,alertsx.com,SOCIAL_ENGINEERING,ANY_PLATFORM,0.0,financial data and services,,blogs,,,"Business, Education",67.0,0.0,20.0,0.0,False,False,3.0,2.0
2,card.co,SOCIAL_ENGINEERING,ANY_PLATFORM,0.0,,,misc,,Business/Economy,,64.0,1.0,23.0,0.0,False,True,1.0,0.0
3,22-now.com,SOCIAL_ENGINEERING,ANY_PLATFORM,0.0,,,,media sharing,,,67.0,0.0,21.0,0.0,False,False,3.0,0.0
4,antezuumer.com,SOCIAL_ENGINEERING,ANY_PLATFORM,0.0,,,,media sharing,,,65.0,0.0,22.0,0.0,False,False,4.0,12.0
5,prelandappslab.com,SOCIAL_ENGINEERING,ANY_PLATFORM,5.0,information technology,content delivery,parked,,Malicious,,64.0,0.0,19.0,0.0,True,False,65.0,414.0
6,latenightlovers.com,SOCIAL_ENGINEERING,ANY_PLATFORM,0.0,web infrastructure,,tabloids,,,,67.0,0.0,21.0,0.0,False,False,4.0,2.0
7,join-the-update.com,SOCIAL_ENGINEERING,ANY_PLATFORM,1.0,web infrastructure,,misc,,,,66.0,0.0,21.0,0.0,False,False,4.0,86.0
8,goforandroid.com,UNWANTED_SOFTWARE,ANY_PLATFORM,0.0,information technology,parked domains,onlineshop,,"Information Technology, Shopping, Software Dow...",,69.0,0.0,19.0,0.0,False,False,5.0,16.0
9,dariwholesales.com,SOCIAL_ENGINEERING,ANY_PLATFORM,0.0,shopping,,,media sharing,,,67.0,0.0,20.0,0.0,False,False,1.0,0.0


## Subdomains

In [20]:
FP_WEB = "../data/yg/realityMine_web_2022-06-01_2022-06-30.csv"
df_web = pd.read_csv(FP_WEB)
df_web.head()

Unnamed: 0,caseid,group_name,client_id,client_key,os_name,os_version,device_manufacturer,device_model,device_type,session_start_time,...,ref_domain,content_type,content_length,search_term,page_duration,private_domain,category,page_url_anonymized,predecessor_url_anonymized,succesor_url_anonymized
0,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:32:42.634,...,,,,,5,heb.com,"Food and Recipes, Shopping",https://www.heb.com/,,https://www.heb.com/weekly-ads/weekly-deals/
1,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:32:47.937,...,,,,,9,heb.com,"Food and Recipes, Shopping",https://www.heb.com/weekly-ads/weekly-deals/,https://www.heb.com/,https://www.kroger.com/savings/weeklyad/
2,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:35:00.322,...,,,,,40,kroger.com,"Business, Shopping",https://www.kroger.com/savings/weeklyad/,https://www.heb.com/weekly-ads/weekly-deals/,https://www.google.com/search?ANONYMIZED
3,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:35:40.594,...,,,,smas,9,google.com,Search Engines and Portals,https://www.google.com/search?ANONYMIZED,https://www.kroger.com/savings/weeklyad/,https://www.google.com/search?ANONYMIZED
4,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:35:49.116,...,,,,samsclub,10,google.com,Search Engines and Portals,https://www.google.com/search?ANONYMIZED,https://www.google.com/search?ANONYMIZED,https://www.samsclub.com/


In [39]:
import re


def clean_url_regex(url):
    if not url:
        return None

    # 1. Remove anything starting with "?ANONYMIZED" or "&ANONYMIZED"
    url = re.sub(r"[\?&]ANONYMIZED.*$", "", url)

    # 2. Remove ":443" at the end
    url = re.sub(r":443$", "", url)

    return url


df_web["page_url_anonymized"] = df_web["page_url_anonymized"].apply(clean_url_regex)

In [40]:
pd.set_option("display.max_colwidth", None)
df_web.head(20)

Unnamed: 0,caseid,group_name,client_id,client_key,os_name,os_version,device_manufacturer,device_model,device_type,session_start_time,...,ref_domain,content_type,content_length,search_term,page_duration,private_domain,category,page_url_anonymized,predecessor_url_anonymized,succesor_url_anonymized
0,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:32:42.634,...,,,,,5,heb.com,"Food and Recipes, Shopping",https://www.heb.com/,,https://www.heb.com/weekly-ads/weekly-deals/
1,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:32:47.937,...,,,,,9,heb.com,"Food and Recipes, Shopping",https://www.heb.com/weekly-ads/weekly-deals/,https://www.heb.com/,https://www.kroger.com/savings/weeklyad/
2,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:35:00.322,...,,,,,40,kroger.com,"Business, Shopping",https://www.kroger.com/savings/weeklyad/,https://www.heb.com/weekly-ads/weekly-deals/,https://www.google.com/search?ANONYMIZED
3,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:35:40.594,...,,,,smas,9,google.com,Search Engines and Portals,https://www.google.com/search,https://www.kroger.com/savings/weeklyad/,https://www.google.com/search?ANONYMIZED
4,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:35:49.116,...,,,,samsclub,10,google.com,Search Engines and Portals,https://www.google.com/search,https://www.google.com/search?ANONYMIZED,https://www.samsclub.com/
5,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:35:59.666,...,,,,,16,samsclub.com,Shopping,https://www.samsclub.com/,https://www.google.com/search?ANONYMIZED,https://www.samsclub.com/s/soda
6,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:36:15.647,...,,,,,29,samsclub.com,Shopping,https://www.samsclub.com/s/soda,https://www.samsclub.com/,https://www.samsclub.com/club/friendswood-tx-sams-club/4702?ANONYMIZED
7,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:36:44.528,...,,,,,5,samsclub.com,Shopping,https://www.samsclub.com/club/friendswood-tx-sams-club/4702,https://www.samsclub.com/s/soda,https://www.samsclub.com/s/soda
8,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:36:49.050,...,,,,,65,samsclub.com,Shopping,https://www.samsclub.com/s/soda,https://www.samsclub.com/club/friendswood-tx-sams-club/4702?ANONYMIZED,https://www.samsclub.com/locator?ANONYMIZED
9,262543201,YouGov.USA,3939812436,lvni7yj5xjdehlaizs7y7bo3pi,Windows,10,ASUSTeK COMPUTER INC.,ROG Strix G712LWS_G712LWS,Laptop/Desktop,2022-06-07 19:37:54.841,...,,,,,20,samsclub.com,Shopping,https://www.samsclub.com/locator,https://www.samsclub.com/s/soda,https://www.samsclub.com/club/friendswood-tx-sams-club/4702?ANONYMIZED


In [41]:
df_web["page_url_anonymized"].nunique()

688418

In [44]:
subdomain_list = list(set(df_web["page_url_anonymized"]))
subdomain_list[:40]

['https://mail.yahoo.com/d/folders/44/messages/ANN2ePpTb9CEYoo56AibIFMPJBk',
 'https://twitter.com/transscribe/status/1541757189735456777/photo/2',
 'https://gleam.io/auth/instagram/callback',
 'https://www.yougov.chat/tasks/18c6d836b2f54e21b8e2d7b4a927770e',
 'https://www.gocomics.com/frazz/2022/06/06',
 'https://www.google.com/maps/place/New+Oxford,+PA+17350/@39.8675387,-77.1987232,11.71z/data=!4m5!3m4!1s0x89c85701bb695e51:0xad7bb740cf73c75!8m2!3d39.8637086!4d-77.0558143',
 'https://www.youtube.com/c/WeBelievemusic',
 'https://www.bestbuy.com/site/fitbit-versa-3-health-fitness-smartwatch-soft-gold/6425999.p',
 'https://portal.viewers-voice.com/logout',
 'https://store.steampowered.com/app/1994380/',
 'https://www.cbsnews.com/newyork/pictures/75th-annual-tony-awards-red-carpet/42/',
 'http://www.bubblegame.org',
 'https://www.myheritage.com/research/record-10147-137461179-/maria-cadman-in-billiongraves',
 'https://www.google.com/maps/@37.6026977,-99.292796,3a,75y,8.01h,80.44t/data=!3m

In [46]:
all_results_subdomain = []

# for batch in chunks(domain_list, 500):
for batch in tqdm(
    chunks(subdomain_list, 500),
    total=(len(subdomain_list) + 499) // 500,
    desc="Checking batches",
):
    # ------------------------------------------------------------
    # init payload
    payload = {
        "client": {"clientId": "xx", "clientVersion": "0"},
        "threatInfo": {
            "threatTypes": [
                "MALWARE",
                "SOCIAL_ENGINEERING",
                "POTENTIALLY_HARMFUL_APPLICATION",
                "UNWANTED_SOFTWARE",
            ],
            "platformTypes": ["ANY_PLATFORM"],
            "threatEntryTypes": ["URL"],
            "threatEntries": [{"url": url} for url in batch],
        },
    }

    response = requests.post(ENDPOINT_URL, json=payload)

    # ------------------------------------------------------------
    # post
    try:
        response = requests.post(ENDPOINT_URL, json=payload, timeout=10)

        if response.status_code == 200:
            result = response.json()

            if "matches" in result:
                for match in result["matches"]:
                    all_results_subdomain.append(
                        {
                            "domain": match["threat"]["url"],
                            "threatType": match["threatType"],
                            "platformType": match["platformType"],
                        }
                    )
            else:
                for url in batch:
                    all_results_subdomain.append(
                        {"domain": url, "threatType": None, "platformType": None}
                    )
        else:
            print(f"Error {response.status_code}: {response.text}")

    except Exception as e:
        print(f"Request failed: {e}")

    time.sleep(random.uniform(0.1, 1.5))

df_results_subdomain = pd.DataFrame(all_results_subdomain)
df_results_subdomain.to_csv("..data/gsb_results_subdomains.csv", index=False)

Checking batches:   0%|          | 0/1377 [00:00<?, ?it/s]

Error 400: {
  "error": {
    "code": 400,
    "message": "Invalid URL: http:/go2crowd.com/",
    "status": "INVALID_ARGUMENT"
  }
}

Error 400: {
  "error": {
    "code": 400,
    "message": "Invalid URL: https:/.reddit.com/r/LivestreamFail/comments/v6y03r/fitness_streamer_makes_sure_chat_has_optimal/",
    "status": "INVALID_ARGUMENT"
  }
}

Error 400: {
  "error": {
    "code": 400,
    "message": "Invalid threat entry type or missing threat entry itself.",
    "status": "INVALID_ARGUMENT"
  }
}

Error 400: {
  "error": {
    "code": 400,
    "message": "Invalid URL: https:/eddit.com/r/cumsluts/top",
    "status": "INVALID_ARGUMENT"
  }
}

Error 400: {
  "error": {
    "code": 400,
    "message": "Invalid URL: https:/reddit.com/r/nsfw_gif/top",
    "status": "INVALID_ARGUMENT"
  }
}

Error 400: {
  "error": {
    "code": 400,
    "message": "Invalid URL: https:/reddit.com/r/manga/comments/v9axwu/disc_athletic_girls_by_pandacorya_oneshot/",
    "status": "INVALID_ARGUMENT"
  }
}

Err

In [47]:
df_results_subdomain

Unnamed: 0,domain,threatType,platformType
0,https://mail.yahoo.com/d/folders/44/messages/ANN2ePpTb9CEYoo56AibIFMPJBk,,
1,https://twitter.com/transscribe/status/1541757189735456777/photo/2,,
2,https://gleam.io/auth/instagram/callback,,
3,https://www.yougov.chat/tasks/18c6d836b2f54e21b8e2d7b4a927770e,,
4,https://www.gocomics.com/frazz/2022/06/06,,
...,...,...,...
656969,https://www.facebook.com/messages/t/5027123227409780,,
656970,https://www.infinitesweeps.com/sweepstake/239633-Grey-Goose-In-Bloom-Sweepstakes.html,,
656971,https://mail.google.com/mail/u/0/#inbox/FMfcgzGpGdmqfDMgFCnVFcKbLjQcngzk,,
656972,http://secure.foodandwateraction.org,,


In [49]:
df_results_subdomain.query("threatType==threatType").reset_index()

Unnamed: 0,index,domain,threatType,platformType
0,4000,https://www.americanhopesprograms.com/score/,SOCIAL_ENGINEERING,ANY_PLATFORM
1,4001,https://www.americanhopesprograms.com/score/,SOCIAL_ENGINEERING,ANY_PLATFORM
2,33002,https://erebelfare.xyz/Download%20WITHOUT%20torrent%20(dstudio).iso,UNWANTED_SOFTWARE,ANY_PLATFORM
3,36503,https://www.wholeeshopping.com/items/21248944,SOCIAL_ENGINEERING,ANY_PLATFORM
4,37004,http://push-fbfgefb-3210.boustahe.com,SOCIAL_ENGINEERING,ANY_PLATFORM
5,44005,https://www.wholeeshopping.com/items/22019280,SOCIAL_ENGINEERING,ANY_PLATFORM
6,47006,https://www.wholeeshopping.com/items/22299209,SOCIAL_ENGINEERING,ANY_PLATFORM
7,47507,https://www.wholeeshopping.com/items/20929500,SOCIAL_ENGINEERING,ANY_PLATFORM
8,53508,https://www.wholeeshopping.com/items/20723969,SOCIAL_ENGINEERING,ANY_PLATFORM
9,87509,https://fallbox.icu/5f132ae1ccc6ac4e,UNWANTED_SOFTWARE,ANY_PLATFORM
