In [None]:
# Base directory for the input files read by the cells below.
DATA_PATH = './output/'
# Directory that receives the CSV files written by this notebook.
RESULT_PATH = './output/analyze/txt/'
# NOTE(review): absolute, user-specific path that is not referenced by any of the
# visible cells -- confirm whether it is still needed, and make it configurable if so.
VIRUS_TOTAL_RESULT_PATH = '/Users/zhangshanfeng/data/hosting_platform_no_validation/22_11_17_txt_record/main_txt_ur_data/vt/'

In [None]:
import pandas as pd
import json
import os
from tqdm import tqdm
import re

def cmd(command):
    """Run ``command`` through the shell and return what it wrote to stdout.

    Args:
        command: Shell command line, handed verbatim to ``os.popen``.

    Returns:
        The command's complete standard output as one string.
    """
    # The context manager closes the pipe deterministically instead of
    # leaving that to garbage collection.
    with os.popen(command) as pipe:
        return pipe.read()

# Create the result directory (including missing parents) without a shell:
# no command string is assembled from the path, and a failure raises instead
# of being silently dropped with the command's stderr.
os.makedirs(RESULT_PATH, exist_ok=True)

In [None]:
# load TXT record
ur_arr = []
standard_arr = []

# Captures the name server address that precedes the ":53" port suffix in the raw line.
NAME_SERVER_RE = re.compile(r'"name_server":"(.*?):53"')


def _parse_txt_line(line):
    """Convert one line of the TXT result file into table rows.

    Args:
        line: One raw line (a JSON document) from a result file.

    Returns:
        A list of ``[nameserver, domain, ttl, RA_flag, rdata]`` rows, one per
        entry in ``data.answers``. The list is empty when the line has no
        ``name_server`` field; it holds only the rows built so far when a
        required key (``name``, ``data``, ``answers``, ``ttl``, ``answer``)
        is missing.
    """
    ns_match = NAME_SERVER_RE.search(line)
    if ns_match is None:
        # Calling .group(1) on a failed search raised AttributeError and
        # aborted the whole load; a line without a name server (e.g. a blank
        # line) cannot be attributed to any server, so it is skipped.
        return []
    ns = ns_match.group(1)

    # The flag is detected on the raw text, exactly as the key is serialized.
    ra_flag = '"recursion_available":true' in line

    rows = []
    try:
        obj = json.loads(line)
        domain = obj['name']
        for item in obj['data']['answers']:
            rows.append([ns, domain, item['ttl'], ra_flag, item['answer']])
    except KeyError:
        # Records lacking the expected keys contribute no further rows.
        pass
    return rows


# Each entry pairs a destination list with the result file that fills it.
load_list = [
    [ur_arr, 'oversea_top_list.txt_TXT-thd-300.json'],
    [standard_arr, 'get_standard_record-resolve_to_address.txt_TXT-thd-300.json'],
]
for arr, file_name in load_list:
    with open(DATA_PATH + 'result/' + file_name, 'r') as f:
        for line in tqdm(f, desc=file_name):
            arr.extend(_parse_txt_line(line))

In [None]:
# Tabulate the rows parsed from the 'oversea_top_list' TXT result file.
ur_txt_columns = ['nameserver', 'domain', 'ttl', 'RA_flag', 'rdata']
ur_txt = pd.DataFrame(data=ur_arr, columns=ur_txt_columns)
# Optional CSV round-trip, currently disabled:
# ur_txt.to_csv(RESULT_PATH + 'ur_txt.csv')
# ur_txt = pd.read_csv(RESULT_PATH + 'ur_txt.csv').drop(columns='Unnamed: 0')
ur_txt

In [None]:
# Tabulate the rows parsed from the 'get_standard_record' TXT result file.
standard_txt_columns = ['nameserver', 'domain', 'ttl', 'RA_flag', 'rdata']
standard_txt = pd.DataFrame(data=standard_arr, columns=standard_txt_columns)
# Optional CSV round-trip, currently disabled:
# # standard_txt.to_csv(RESULT_PATH + 'standard_txt.csv')
# standard_txt = pd.read_csv(RESULT_PATH + 'standard_txt.csv')
standard_txt

In [None]:
# mark public service standard record
# One row per distinct rdata value found in the standard records, flagged True.
# assign() builds a fresh frame, so no column is set on a derived object.
dry_standard_txt = standard_txt[['rdata']].drop_duplicates().assign(standard=True)
all_ur_txt = pd.merge(ur_txt, dry_standard_txt, on=['rdata'], how='left')
# Rows without a matching standard record come back from the left join with
# NaN in 'standard'. The flag is derived from that column alone: a frame-wide
# fillna(False) would also replace missing values in every other column.
all_ur_txt['standard'] = all_ur_txt['standard'].notna()
# all_ur_txt = all_ur_txt[all_ur_txt.standard == False].drop(columns='standard')
all_ur_txt

In [None]:
# Here, we omitted the code that compares the records against PDNS data.

In [None]:
# Keep the records that were returned without the recursion-available flag
# and that do not equal any public-service standard record, then de-duplicate.
without_recursion = all_ur_txt['RA_flag'] == False
not_standard = all_ur_txt['standard'] == False
abnormal_ur_txt = all_ur_txt.loc[without_recursion & not_standard].drop_duplicates()
abnormal_ur_txt

In [None]:
# filter protected ns
# Collect the resolvers that returned a NOERROR response with non-empty data
# in the protection check.
protective_ns = set()
# Captures the resolver address that precedes the ":53" port suffix in the raw line.
resolver_re = re.compile(r'"resolver":"(.*?):53"')
with open(DATA_PATH + './detect_protecting_records/txt_check_protect.json', 'r') as f:
    for line in tqdm(f):
        # Responses whose data object is empty are ignored.
        if ',"data":{},' in line:
            continue
        try:
            status = json.loads(line)['status']
        except KeyError:
            # No status field: nothing to evaluate for this line.
            continue
        if status != 'NOERROR':
            continue
        resolver_match = resolver_re.search(line)
        if resolver_match is None:
            # Calling .group(1) on a failed search raised AttributeError and
            # stopped the loop; a line without a resolver is skipped instead.
            continue
        protective_ns.add(resolver_match.group(1))

In [None]:
# Flag every record whose name server is in the protective set.
all_ur_txt['protected'] = all_ur_txt['nameserver'].isin(protective_ns)

In [None]:
# Drop the abnormal records that were served by a protective name server.
served_by_protective_ns = abnormal_ur_txt['nameserver'].isin(protective_ns)
abnormal_ur_txt = abnormal_ur_txt[~served_by_protective_ns]
abnormal_ur_txt

In [None]:
# sort known pattern TXT records
# Every spreadsheet row becomes one pattern entry; empty cells are read as ''.
pattern_table = pd.read_excel('./txt_pattern.xlsx').fillna('')
patterns = [row for row in pattern_table.values]
def match_pattern(s):
    """Classify a TXT rdata string against the entries in ``patterns``.

    Entries are tried in order. An entry applies when its expression
    (element 2) matches the start of ``s`` and its exclusion expression
    (element 3), if non-empty, does not. Before matching, the first 18 and the
    last 2 characters of each stored expression are stripped.
    NOTE(review): the wrapper removed by ``[18:-2]`` is not visible here --
    confirm it against the contents of txt_pattern.xlsx.

    Args:
        s: The rdata string to classify.

    Returns:
        ``(category, label, expression, exclusion_expression)`` taken from the
        first applicable entry (the expressions are returned as stored, not
        stripped), or ``('unknown', 'unknown', '-', '-')`` if none applies.
    """
    for entry in patterns:
        include_expr = entry[2]
        if not re.match(include_expr[18:-2], s):
            continue
        exclude_expr = entry[3]
        # An entry is vetoed when its exclusion expression also matches.
        if exclude_expr != '' and re.match(exclude_expr[18:-2], s):
            continue
        return entry[0], entry[1], include_expr, exclude_expr
    return 'unknown', 'unknown', '-', '-'
# match_pattern('facebook-domain-verification=39xu4jzl7roi7x0n93ldkxjiaarx50')
# Classify every rdata value and store the four results as columns.
# The results are collected in a frame instead of being unpacked with
# zip(*...): unpacking raises ValueError when abnormal_ur_txt has no rows.
pattern_columns = ['Category', 'Label', 'Regular Expression', 'NOT Regular Expression']
matched_patterns = pd.DataFrame(
    list(abnormal_ur_txt['rdata'].apply(match_pattern)),
    columns=pattern_columns,
)
# abnormal_ur_txt is a filtered selection of all_ur_txt; take an explicit copy
# so the new columns are unambiguously added to an independent frame.
abnormal_ur_txt = abnormal_ur_txt.copy()
for column in pattern_columns:
    # .values assigns positionally, so the filtered index needs no alignment.
    abnormal_ur_txt[column] = matched_patterns[column].values

In [None]:
# Classify every rdata value of the full record table and store the four
# results as columns. The results are collected in a frame instead of being
# unpacked with zip(*...): unpacking raises ValueError when the table is empty.
all_pattern_columns = ['Category', 'Label', 'Regular Expression', 'NOT Regular Expression']
all_matched_patterns = pd.DataFrame(
    list(all_ur_txt['rdata'].apply(match_pattern)),
    columns=all_pattern_columns,
)
for column in all_pattern_columns:
    # .values assigns positionally, independent of the frame's index.
    all_ur_txt[column] = all_matched_patterns[column].values

In [None]:
# Display the abnormal records in their current state.
abnormal_ur_txt

In [None]:
# Display the full record table in its current state.
all_ur_txt

In [None]:
# Count the abnormal records per (Category, Label) pair, most frequent first,
# and save the table.
records_per_label = abnormal_ur_txt.groupby(['Category', 'Label']).agg({'rdata': 'count'})
label_statistic = records_per_label.sort_values('rdata', ascending=False)
label_statistic.to_csv(RESULT_PATH + 'txt_category.csv')
label_statistic

In [None]:
# Write one set of CSV files per label.
# to_csv does not create missing directories, so the output folders are
# created up front; without this a fresh run fails on the first write.
for sub_dir in ('label_details/', 'label_unique/', 'domain_ns_unique/'):
    os.makedirs(RESULT_PATH + sub_dir, exist_ok=True)

for label in abnormal_ur_txt.Label.unique():
    file_stem = label.replace(' ', '_')
    # Select the label's rows once instead of re-filtering for every export.
    label_rows = abnormal_ur_txt[abnormal_ur_txt.Label == label]

    # All columns of the records carrying this label.
    label_rows.to_csv(RESULT_PATH + 'label_details/' + file_stem + '.csv', index=False)

    # Number of occurrences of each distinct rdata value, most frequent first.
    (
        label_rows[['rdata']]
        .groupby('rdata')
        .agg({'rdata': 'count'})
        .rename(columns={'rdata': 'count'})
        .sort_values(by='count', ascending=False)
        .to_csv(RESULT_PATH + 'label_unique/' + file_stem + '.csv')
    )

    # NOTE(review): no visible cell creates a 'target' column on abnormal_ur_txt;
    # unless the omitted PDNS comparison adds it, the two selections below raise
    # KeyError. Confirm whether 'domain' was intended.
    label_rows[['nameserver', 'target', 'rdata']].drop_duplicates(keep='first').to_csv(
        RESULT_PATH + 'domain_ns_unique/' + file_stem + '.csv', index=False)
    label_rows[['target', 'nameserver']].drop_duplicates(keep='first').to_csv(
        RESULT_PATH + 'domain_ns_unique/' + file_stem + '_no_details.csv', index=False)

In [None]:
import ipaddress
def is_protect(ip):
    """Tell whether ``ip`` is a valid IP address that is not globally reachable.

    Args:
        ip: Candidate address, as accepted by ``ipaddress.ip_address``.

    Returns:
        True when ``ip`` parses and ``is_global`` is false for it; False for
        global addresses and for values that are not valid IP addresses.
    """
    try:
        return not ipaddress.ip_address(ip).is_global
    except ValueError:
        # ip_address raises ValueError for anything it cannot parse. Catching
        # only that keeps unrelated errors (and KeyboardInterrupt) visible.
        return False

In [None]:
# Measure related A records
# Export the (domain, nameserver) pairs without index or header, then run the
# measurement script.
related_targets = abnormal_ur_txt[['domain', 'nameserver']]
related_targets.to_csv('./middle/get_TXT_ur_related_targets.txt', index=False, header=False)
cmd('./get_TXT_ur_related_a.sh')

In [None]:
# Collect (resolver, domain, IPv4 address) rows from the related A-record
# results, keeping only NOERROR responses and addresses that is_protect()
# does not flag.
tarr = []
# Captures the resolver address that precedes the ":53" port suffix in the raw line.
resolver_pattern = re.compile(r'"resolver":"(.*?):53"')
with open(DATA_PATH + './middle/txt_related_a.json') as f:
    for line in f:
        try:
            obj = json.loads(line)

            if obj['status'] != 'NOERROR':
                continue

            domain = obj['name']
            resolver_match = resolver_pattern.search(line)
            if resolver_match is None:
                # Calling .group(1) on a failed search raised AttributeError and
                # stopped the load; a line without a resolver is skipped instead.
                continue
            resolver = resolver_match.group(1)

            for sitem in obj['data']['ipv4_addresses']:
                if not is_protect(sitem):
                    tarr.append([resolver, domain, sitem])
        except KeyError:
            # Records lacking one of the expected keys contribute no further rows.
            pass
related_a = pd.DataFrame(tarr, columns=['nameserver', 'target', 'ip_address'])
related_a

In [None]:
# Distinct IP addresses found in the A records related to the TXT records.
related_a.ip_address.unique()

In [None]:
# Save the abnormal records (without the index column) and display them.
abnormal_csv_path = RESULT_PATH + 'abnormal_ur_txt.csv'
abnormal_ur_txt.to_csv(abnormal_csv_path, index=False)
abnormal_ur_txt

In [None]:
# extract IP address in TXT records
# Four dot-separated groups of one to three digits. Written as a raw string:
# in a normal literal '\d' and '\.' are invalid escape sequences, which newer
# Python versions warn about.
ip_regex = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
import ipaddress
def valid_ip(address):
    """Decide whether ``address`` is an IP address worth keeping.

    Args:
        address: Candidate address string.

    Returns:
        True when ``address`` parses as an IP address, does not end in '.0',
        and is not flagged by ``is_protect``; False otherwise, including when
        it cannot be parsed.
    """
    try:
        ipaddress.ip_address(address)
        return not (address[-2:] == '.0' or is_protect(address))
    except (ValueError, TypeError):
        # ValueError: not a parsable address. TypeError: a value that cannot be
        # sliced. Anything else is unexpected and is no longer swallowed.
        return False

# Rows collected by get_txt_address, one per usable address found in a record.
tarr = []
def get_txt_address(s):
    """Append a row to ``tarr`` for each usable address inside ``s['rdata']``.

    Every substring of ``s['rdata']`` matching ``ip_regex`` that passes
    ``valid_ip`` yields one row made of the record's fields followed by the
    matched address.
    NOTE(review): ``s['target']`` is read here, but no visible cell creates a
    'target' column -- confirm where it comes from.

    Args:
        s: One row of the record table (a mapping of column name to value).
    """
    for candidate in re.findall(ip_regex, s['rdata']):
        if not valid_ip(candidate):
            continue
        tarr.append([
            s['nameserver'], s['target'], s['domain'], s['RA_flag'], s['standard'],
            s['rdata'], s['Category'], s['Label'], candidate,
        ])

# Run the extraction over every abnormal record (rows accumulate in tarr),
# then tabulate the rows and drop exact duplicates.
abnormal_ur_txt.apply(get_txt_address, axis=1)
txt_ip_columns = ['nameserver', 'target', 'domain', 'RA_flag', 'standard', 'rdata', 'Category', 'Label', 'ip_address']
txt_ip = pd.DataFrame(tarr, columns=txt_ip_columns).drop_duplicates()

In [None]:
# Save the extracted addresses (without the index column) and display them.
txt_ip_csv_path = RESULT_PATH + 'txt_ip.csv'
txt_ip.to_csv(txt_ip_csv_path, index=False)
txt_ip