In [2]:

import json
import os

import pandas as pd

domain_path = "domains/US"


def load_json_files(base_dir):
    """Load every ``*.json`` file found under ``base_dir``.

    The tree is walked in sorted order (directories and file names), so the
    returned list has the same order on every run and every filesystem.
    Anything derived from row order later on (e.g. seeded sampling) is then
    reproducible.

    Parameters
    ----------
    base_dir : str
        Root directory to search recursively.

    Returns
    -------
    list
        The parsed content of each JSON file, in traversal order. Empty when
        ``base_dir`` does not exist or contains no ``.json`` files.

    Raises
    ------
    json.JSONDecodeError
        If a ``.json`` file does not contain valid JSON.
    """
    loaded = []
    for root, dirs, files in os.walk(base_dir):
        # Sorting in place makes os.walk descend into sub-directories in
        # sorted order instead of the arbitrary order the OS reports.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith('.json'):
                continue
            file_path = os.path.join(root, file_name)
            with open(file_path, 'r', encoding='utf-8') as handle:
                loaded.append(json.load(handle))
    return loaded


# Collect all JSON documents below the configured domain directory
json_files = load_json_files(domain_path)



In [3]:
# Number of JSON documents that were loaded from under `domain_path`
len(json_files)


24497

In [4]:
# Flatten the loaded JSON documents into two lists of row dicts:
#   cnames     - one row per entry of a resource's 'cnames' list
#   non_cnames - one row per resource that has no 'cnames' entries
# Both share the keys domain / rule / fingerprint / original / resolved;
# 'fingerprint' holds the resource's 'fingerprinting' value.
cnames = []
non_cnames = []

# Process each JSON file
for json_file in json_files:
    domain = json_file.get('domain')

    # `or []` covers a missing key *and* an explicit JSON null; a plain
    # `.get(key, [])` would hand back None for the latter and break the loop.
    resources = json_file.get('resources') or []
    for resource in resources:
        rule = resource.get('rule')
        fingerprint = resource.get('fingerprinting')
        cname_entries = resource.get('cnames') or []

        if not cname_entries:
            # Resource without CNAME information: keep it, with empty
            # original/resolved fields.
            non_cnames.append({
                'domain': domain,
                'rule': rule,
                'fingerprint': fingerprint,
                'original': None,
                'resolved': None
            })
            continue

        for entry in cname_entries:
            # 'original' and 'resolved' are read strictly: an entry lacking
            # either key raises KeyError rather than being silently padded.
            cnames.append({
                'domain': domain,
                'rule': rule,
                'fingerprint': fingerprint,
                'original': entry['original'],
                'resolved': entry['resolved']
            })


In [5]:
# Report the size of each record list, one per line: CNAME rows first,
# then the resources without CNAME entries.
print(len(cnames), len(non_cnames), sep="\n")


85387
460051


In [6]:
# Convert the extracted records to pandas DataFrames.
# The column list is pinned explicitly so that an empty record list still
# yields a frame with the expected columns (later cells index 'fingerprint').
record_columns = ['domain', 'rule', 'fingerprint', 'original', 'resolved']

df1 = pd.DataFrame(cnames, columns=record_columns)      # rows that carry a CNAME entry
df2 = pd.DataFrame(non_cnames, columns=record_columns)  # resources without CNAME entries


In [7]:
# Preview the first rows of the CNAME records (one row per 'cnames' entry of a resource)
df1.head()


Unnamed: 0,domain,rule,fingerprint,original,resolved
0,08bca1adcd937acee5c029a1cec0e4b9.edgekey.net,08bca1adcd937acee5c029a1cec0e4b9\.edgekey\.net...,0,sli.simonandschuster.com,08bca1adcd937acee5c029a1cec0e4b9.edgekey.net
1,0d2aa2d5db16270946d663b492be0815.edgekey.net,0d2aa2d5db16270946d663b492be0815\.edgekey\.net...,0,sli.timescall.com,0d2aa2d5db16270946d663b492be0815.edgekey.net
2,0i0i0i0.com,0i0i0i0\.com\/QgKhd137\.js,2,x9ner.anygay.com,0i0i0i0.com
3,0i0i0i0.com,0i0i0i0\.com\/QgKhd137\.js,2,u1zga.letsporn.com,0i0i0i0.com
4,0i0i0i0.com,0i0i0i0\.com\/F3s5noc\.js,2,a.fapcat.com,0i0i0i0.com


In [8]:
# Preview the first rows of the resources that had no 'cnames' entries
df2.head()


Unnamed: 0,domain,rule,fingerprint,original,resolved
0,.org,\.org\/widget\/42788_1697572386\.js,0,,
1,02d917pe-firstalert-prod-slotb.vercel.app,02d917pe-firstalert-prod-slotb\.vercel\.app\/u...,0,,
2,02d917pe-firstalert-prod-slotb.vercel.app,02d917pe-firstalert-prod-slotb\.vercel\.app\/u...,0,,
3,02d917pe-firstalert-prod-slotb.vercel.app,02d917pe-firstalert-prod-slotb\.vercel\.app\/u...,0,,
4,02d917pe-firstalert-prod-slotb.vercel.app,02d917pe-firstalert-prod-slotb\.vercel\.app\/u...,0,,


In [9]:
# Write both frames to disk as CSV, leaving out the pandas row index.
for frame, csv_name in ((df1, 'cnames.csv'), (df2, 'non_cnames.csv')):
    frame.to_csv(csv_name, index=False)


In [10]:
# Tally the rows whose fingerprint value is at least 2 in each frame.
# Summing the boolean mask counts the True entries; int() keeps the result
# a plain Python integer.
high_fingerprint_count_cnames = int((df1['fingerprint'] >= 2).sum())
high_fingerprint_count_non_cnames = int((df2['fingerprint'] >= 2).sum())

print(
    f"Number of CNAME entries with fingerprint >= 2: {high_fingerprint_count_cnames}",
    f"Number of non-CNAME entries with fingerprint >= 2: {high_fingerprint_count_non_cnames}",
    f"Total entries with fingerprint >= 2: {high_fingerprint_count_cnames + high_fingerprint_count_non_cnames}",
    sep="\n",
)


Number of CNAME entries with fingerprint >= 2: 2465
Number of non-CNAME entries with fingerprint >= 2: 19539
Total entries with fingerprint >= 2: 22004


In [11]:
# Split the CNAME rows on the fingerprint value: at least 2 versus below 2.
high_fp = df1.loc[df1['fingerprint'] >= 2]
low_fp = df1.loc[df1['fingerprint'] < 2]

# The smaller group sets the per-class sample size, so both classes end up
# with the same number of rows.
min_count = min(map(len, (high_fp, low_fp)))

# Draw the same number of rows from each group with a fixed seed (high
# group first, then low).
balanced_high, balanced_low = (
    group.sample(n=min_count, random_state=42) for group in (high_fp, low_fp)
)

# Stack the two samples and shuffle the combined rows, again with a fixed seed.
stacked = pd.concat([balanced_high, balanced_low])
balanced_df = stacked.sample(frac=1, random_state=42)

# Persist the balanced set without the row index.
balanced_df.to_csv('cnames_balanced.csv', index=False)
