In [None]:
import pandas as pd
import json
import os
from tqdm import tqdm
import re

In [None]:
# Root directory holding the pipeline inputs (result/, middle/) and the analysis outputs (analyze/).
BASE_DIR = './output/'
# Directory containing the protective-resolution detection output (alookup_check_protect.json).
GLOBAL_DEFENSE_DETECT_RES_DIR = './detect_protecting_records/'
# NOTE(review): not referenced in the visible cells -- confirm whether it is still needed.
HOSTING_DOMAIN_LIMIT = 50
# Minimum share (a fraction, so 0.01 == 1%) of a domain's resolver answers that a value
# (IP, ASN, city, PTR or certificate hash) must exceed for its records to be accepted as correct.
# "VAILD" is a typo for "VALID"; the name is kept because later cells reference it.
VAILD_RECORD_ANNOTATION_FLOOR_PERCENT = 0.01

In [None]:
def cmd(command):
    """Run ``command`` through the shell and return everything it wrote to stdout.

    Args:
        command: Shell command line to execute. It is passed to the shell
            verbatim, so it must never be built from untrusted input.

    Returns:
        The command's standard output as a single string.
    """
    # Use the pipe as a context manager so it is closed (and the child process
    # reaped) deterministically instead of whenever the object is collected.
    with os.popen(command) as pipe:
        return pipe.read()

# Plotting libraries used by plot_hist below.
import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's "ticks" style as the theme for subsequent plots.
sns.set_theme(style="ticks")
def plot_hist(data, title='test'):
    """Plot the distribution of ``data`` and save it as ``<ANALYSIS_DIR><title>.pdf``.

    The title is printed, the plot is drawn with a linear x axis and a
    log-scaled y axis, written to disk and then shown.

    Args:
        data: Values accepted by ``seaborn.displot``.
        title: Printed label, also used as the PDF file name (without
            extension) inside ``ANALYSIS_DIR``.

    Note:
        ``ANALYSIS_DIR`` is a notebook-level variable defined in a later cell;
        that cell must have been run before this function is called.
    """
    print(title)
    grid = sns.displot(
        data,
        log_scale=(False, True),
    )
    # Save BEFORE showing: once plt.show() has run the figure is no longer the
    # current pyplot figure, so a later plt.savefig() writes an empty page.
    # FacetGrid.savefig saves this exact figure with bbox_inches="tight",
    # which is also what makes pad_inches take effect.
    grid.savefig(ANALYSIS_DIR + title + '.pdf', pad_inches=0.0)
    plt.show()


import ipaddress

def is_protect(ip):
    """Tell whether ``ip`` is a non-global (reserved/private/loopback/...) address.

    Args:
        ip: Address to check, in any form accepted by ``ipaddress.ip_address``
            (typically a dotted-quad or IPv6 string).

    Returns:
        True when ``ip`` parses as an IP address that is not globally
        routable; False when it is global or cannot be parsed at all.
    """
    try:
        address = ipaddress.ip_address(ip)
    except ValueError:
        # ip_address() signals every unparsable input with ValueError; catching
        # only that keeps KeyboardInterrupt and real bugs from being hidden.
        return False
    return not address.is_global

# print(is_protect('0.0.0.0'))
# print(is_protect('127.0.0.1'))
# print(is_protect('110.75.139.5'))

def load_line_file(source):
    """Read a text file into a list of its non-blank lines.

    Args:
        source: Path of the file to read.

    Returns:
        The lines of the file in order, each stripped of surrounding
        whitespace; lines that are empty after stripping are omitted.
    """
    with open(source, 'r') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [entry for entry in stripped_lines if entry]

def to_line_file(arr, dest):
    """Write the strings in ``arr`` to ``dest``, one per line.

    Items are separated by a newline; no trailing newline is written after
    the last item. An existing file at ``dest`` is overwritten.

    Args:
        arr: Iterable of strings to write.
        dest: Path of the output file.
    """
    with open(dest, 'w') as out:
        text = '\n'.join(arr)
        out.write(text)


In [None]:

# Directory that receives every artifact produced by this notebook.
ANALYSIS_DIR = BASE_DIR + 'analyze/a/'
# Create it (and any missing parents) without shelling out: this works on every
# platform, copes with spaces in the path, and raises if creation fails instead
# of silently continuing.
os.makedirs(ANALYSIS_DIR, exist_ok=True)

### get correct records

In [None]:
# Per-IP annotations. Columns: final_ip_address, asn, city, ptr, cert_hash
annotated_path = BASE_DIR + 'result/annotated_record.csv'
annotated_addresses = pd.read_csv(annotated_path)
# Answers returned by the resolvers. Columns: domain, resolver, rtype, final_ip_address, timestamp
resolver_path = BASE_DIR + 'middle/resolver_return_record.csv'
resolve_domain_address = pd.read_csv(resolver_path)

# Quick size check: rows/columns of both tables ...
for frame in (annotated_addresses, resolve_domain_address):
    print(frame.shape)
# ... and the number of distinct domains and resolvers seen.
for column in ('domain', 'resolver'):
    print(resolve_domain_address[column].unique().shape)


# Build (or reload from cache) the reference set of records considered correct.
# NOTE: the cached CSV is reused whenever it exists; delete it to force a rebuild
# after the input files or the threshold change.
if not os.path.exists(ANALYSIS_DIR + 'correct_addresses.csv'):
    # Attach the per-IP annotations to every resolver answer.
    full_info_addresses = pd.merge(resolve_domain_address, annotated_addresses, on='final_ip_address', how='left')

    # Number of resolver answers per domain: the denominator of every share below.
    domain_count = full_info_addresses.groupby(['domain']).agg({'domain': 'count'}).rename(columns={'domain': 'domain_count'})
    full2_info_addresses = full_info_addresses.join(domain_count, on='domain')

    # For each annotation, count how many answers of the domain carry the same
    # value and attach that count as '<key>_count'. Rows whose value is missing
    # get NaN, which never passes the threshold comparison below.
    annotation_keys = ['final_ip_address', 'asn', 'city', 'ptr', 'cert_hash']
    for key in annotation_keys:
        value_count = full_info_addresses.groupby(['domain', key]).agg({key: 'count'}).rename(columns={key: key + '_count'}).reset_index()
        # Explicit join keys instead of relying on "all shared column names".
        full2_info_addresses = pd.merge(full2_info_addresses, value_count, on=['domain', key], how='left')

    # Keep an answer when at least one of its annotations is shared by more than
    # the configured fraction of the domain's answers.
    keep_mask = pd.Series(False, index=full2_info_addresses.index)
    for key in annotation_keys:
        share = full2_info_addresses[key + '_count'] / full2_info_addresses['domain_count']
        keep_mask |= share > VAILD_RECORD_ANNOTATION_FLOOR_PERCENT
    full3_info_addresses = full2_info_addresses[keep_mask]

    # .assign() returns a fresh frame, so no column is written onto a filtered
    # slice (which is what triggers SettingWithCopyWarning).
    final_standard_addresses = (
        full3_info_addresses[['domain', 'rtype', 'final_ip_address']]
        .drop_duplicates(keep='first')
        .assign(distribution_check=True)
    )
    final_standard_addresses.to_csv(ANALYSIS_DIR + 'correct_addresses.csv', index=False)
else:
    final_standard_addresses = pd.read_csv(ANALYSIS_DIR + 'correct_addresses.csv')

In [None]:
# Display the reference (domain, rtype, final_ip_address) records built or loaded above.
final_standard_addresses

### load records collected from hosting providers

In [None]:
# Parse the records collected from hosting providers (one JSON object per line).
# Each record keeps an RA_flag so that answers from servers advertising
# recursion can be removed later.
# Resulting columns: domain, nameserver, ip_address, RA_flag
hosting_arr = []
skipped_hosting_lines = 0

with open(BASE_DIR + 'result/oversea_top_list.txt_alookup-thd-300.json', 'r') as f:
    for line in tqdm(f):
        if not line.strip():
            continue

        # The name server is taken from the raw text; a line without one cannot
        # be attributed to a server, so skip it instead of crashing on None.
        ns_match = re.search(r'"name_server":"(.*?):53"', line)
        if ns_match is None:
            skipped_hosting_lines += 1
            continue
        ns = ns_match.group(1)

        RA_flag = '"recursion_available":true' in line

        try:
            obj = json.loads(line)
            domain = obj['name']
            addresses = obj['data']['ipv4_addresses']
        except json.JSONDecodeError:
            # Malformed (e.g. truncated) line: count it so it gets reported below.
            skipped_hosting_lines += 1
            continue
        except KeyError:
            # No name or no IPv4 answer in this result: nothing to record.
            continue

        for item in addresses:
            hosting_arr.append([domain, ns, item, RA_flag])

if skipped_hosting_lines:
    print('skipped %d line(s) without a name server or with invalid JSON' % skipped_hosting_lines)

hosting_record = pd.DataFrame(hosting_arr, columns=['domain', 'nameserver', 'ip_address', 'RA_flag'])
print(hosting_record.shape)

# mark correct records
hosting_record = hosting_record.rename(columns={'ip_address': 'final_ip_address'})
# One reference row per (domain, IP): if the same pair is listed under several
# rtypes, a plain left merge would duplicate the hosting rows.
reference_pairs = final_standard_addresses.drop_duplicates(subset=['domain', 'final_ip_address'])
process_standarded_record = pd.merge(hosting_record, reference_pairs, on=['domain', 'final_ip_address'], how='left')
# Hosting records are built from IPv4 answers only, so the type is always 'A'.
process_standarded_record['rtype'] = 'A'
# Rows with no match have NaN here; turn the column into a real boolean.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection, which is deprecated and has no effect under pandas Copy-on-Write.
process_standarded_record['distribution_check'] = process_standarded_record['distribution_check'].eq(True)

In [None]:
# Flag protective records.
# 1) Any answer that is not a globally routable address counts as protective.
process_standarded_record['protected_check'] = process_standarded_record['final_ip_address'].apply(is_protect)

# 2) Collect the name servers from the protective-resolution detection output:
#    every server that returned NOERROR with non-empty data for the probe.
protective_ns = set()
skipped_protect_lines = 0
with open(GLOBAL_DEFENSE_DETECT_RES_DIR + 'alookup_check_protect.json', 'r') as f:
    for line in tqdm(f):
        if not line.strip() or ',"data":{},' in line:
            continue
        try:
            obj = json.loads(line)
            status = obj['status']
        except json.JSONDecodeError:
            # Malformed (e.g. truncated) line: count it so it gets reported below.
            skipped_protect_lines += 1
            continue
        except KeyError:
            continue

        if status != 'NOERROR':
            continue

        # A line without a name server cannot be attributed to any server;
        # skip it instead of crashing on a failed regex match.
        ns_match = re.search(r'"name_server":"(.*?):53"', line)
        if ns_match is None:
            skipped_protect_lines += 1
            continue
        protective_ns.add(ns_match.group(1))

if skipped_protect_lines:
    print('skipped %d line(s) without a name server or with invalid JSON' % skipped_protect_lines)

# Every record served by one of those name servers is protective as well.
process_standarded_record.loc[process_standarded_record['nameserver'].isin(protective_ns), 'protected_check'] = True

In [None]:
# A record is abnormal when it fails all three checks: the server does not
# advertise recursion, the answer is not in the reference set, and it is not
# a protective record.
no_recursion = process_standarded_record['RA_flag'].eq(False)
outside_reference = process_standarded_record['distribution_check'].eq(False)
unprotected = process_standarded_record['protected_check'].eq(False)
abnormal_records = process_standarded_record[no_recursion & outside_reference & unprotected]

In [None]:
# Write next to the other analysis artifacts; build the path from ANALYSIS_DIR
# so it keeps following BASE_DIR instead of duplicating it as a literal.
abnormal_records.to_csv(ANALYSIS_DIR + 'suspicious_records.csv', index=False)

In [None]:
# Display the records that failed all three checks (the same rows written to suspicious_records.csv).
abnormal_records