# Imports

In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from tldextract import extract
import wordninja
from tqdm import tqdm
import re
from collections import Counter
import math
import dnstwist 

In [2]:
unlabeled_data = pd.read_csv('final_unlabeled.csv')
malicious_data = pd.read_csv('final_malicious.csv')
whitelist_data = pd.read_csv('final_whitelist_with_keyword.csv')

In [3]:
unlabeled_list = unlabeled_data['Domain'].values.tolist()
malicious_list = malicious_data['Domain'].values.tolist()
whitelist_list = whitelist_data['Domain'].values.tolist()

# Length

In [4]:
def check_num_word(list):
    lens = []
    l = 0
    for i in list:
        l += len(wordninja.split(i))
        lens.append(len(wordninja.split(i)))
    print("The average number of words is: {:.2f}".format(l/len(list)))
    return lens

def check_num_char(list):
    lens = []
    l = 0
    for i in list:
        le = 0
        for j in extract(i):
            le += len(j)
        l += le
        lens.append(le)
    print("The average number of characters is: {:.2f}".format(l/len(list)))
    return lens

In [5]:
malicious_num_words = check_num_word(malicious_list)
whitelist_num_words = check_num_word(whitelist_list)
unlabeled_num_words = check_num_word(unlabeled_list)

The average number of words is: 5.22
The average number of words is: 7.70
The average number of words is: 8.47


In [6]:
malicious_num_chars = check_num_char(malicious_list)
whitelist_num_chars = check_num_char(whitelist_list)
unlabeled_num_chars = check_num_char(unlabeled_list)

The average number of characters is: 20.21
The average number of characters is: 26.30
The average number of characters is: 30.66


# Hyphen

In [7]:
def check_hyphen(list):
    hyphen = []
    for i in list:
        if '-' in i:
            hyphen.append(1)
        else:
            hyphen.append(0)
    print("The number of domains containing hyphens is "
          "{:}, making {:.2f}% of the data".format(hyphen.count(1), (hyphen.count(1)*100/len(list))))
    return hyphen

In [8]:
malicious_hyphen = check_hyphen(malicious_list)
whitelist_hyphen = check_hyphen(whitelist_list)
unlabeled_hyphen = check_hyphen(unlabeled_list)

The number of domains containing hyphens is 16959, making 27.51% of the data
The number of domains containing hyphens is 1135, making 19.01% of the data
The number of domains containing hyphens is 154509, making 42.29% of the data


# Entropy

In [74]:
def entropy(s):
    p, lns = Counter(s), float(len(s))
    return -sum(count/lns * math.log(count/lns,2) for count in p.values())

def check_entropy(l):
    entropies = []
    e = 0
    for i in l:
        entropies.append(entropy(i))
        e += entropy(i)
    print("The average entropy is {:.2f}".format(e/(len(l))))
    return entropies


In [75]:
def check_tld(array):
    new_list = []
    count = 0
    for i in array:
        tsd, td, tsu = extract(i) # subdomain, domain, suffix
        if tsd:
            if td:
                dom = tsd + '.' + td + '.' + tsu
            else:
                dom = tsd + '.' + tsu
        elif td:
            dom = td + '.' + tsu
        else:
            dom = tsu
        new_list.append(dom)
    return(new_list)

In [76]:
def check_td(array):
    new_list = []
    for i in array:
        tsd, td, tsu = extract(i) #subdomain, domain, suffix
        if td:
            dom = td
        else:
            dom = tsu
        new_list.append(dom)
    return(new_list)

In [77]:
def check_td1(array):
    new_list = []
    for i in array:
        tsd, td, tsu = extract(i) #subdomain, domain, suffix
        if td:
            dom = td + '.' + tsu
        else:
            dom = tsu
        new_list.append(dom)
    return(new_list)

In [78]:
def check_tld1(array):
    new_list = []
    count = 0
    for i in array:
        tsd, td, tsu = extract(i) # subdomain, domain, suffix
        if tsd:
            if td:
                dom = tsd + '.' + td
            else:
                dom = tsd
        elif td:
            dom = td
        else:
            dom = tsu
        new_list.append(dom)
    return(new_list)

** entropy with subdomain and suffix **

In [79]:
malicious_entropy = check_entropy(malicious_list)
whitelist_entropy = check_entropy(whitelist_list)
unlabeled_entropy = check_entropy(unlabeled_list)

The average entropy is 3.66
The average entropy is 3.75
The average entropy is 3.79


** entropy without suffix **

In [80]:
malicious_entropy1 = check_entropy(check_tld1(malicious_list))
whitelist_entropy1 = check_entropy(check_tld1(whitelist_list))
unlabeled_entropy1 = check_entropy(check_tld1(unlabeled_list))

The average entropy is 3.45
The average entropy is 3.58
The average entropy is 3.70


** entropy without subdomain and suffix **

In [81]:
malicious_entropy_nosd = check_entropy(check_td(malicious_list))
whitelist_entropy_nosd = check_entropy(check_td(whitelist_list))
unlabeled_entropy_nosd = check_entropy(check_td(unlabeled_list))

The average entropy is 3.45
The average entropy is 2.70
The average entropy is 2.83


# Tranco Rank

In [16]:
top1m = pd.read_csv('top-1m.csv', header=None)

In [17]:
# dropping the subdomain to check the rank of the domain
def check_td_1(array):
    new_list = []
    for i in array:
        tsd, td, tsu = extract(i) #subdomain, domain, suffix
        if td:
            dom = td + '.' + tsu
        else:
            dom = tsu
        new_list.append(dom)
    return(new_list)

In [18]:
def check_tranco_rank(list):
    ranks = []
    for i in list:
        if i in top1m[1].values:
            ranks.append(1)
        else:
            ranks.append(0)
    print("{:.2f}% of the domains are on the Tranco list.".format((ranks.count(1)*100/len(list))))
    return ranks

In [19]:
malicious_rank = check_tranco_rank(check_td1(malicious_list))
whitelist_rank = check_tranco_rank(check_td1(whitelist_list))
unlabeled_rank = check_tranco_rank(check_td1(unlabeled_list))

0.46% of the domains are on the Tranco list.
40.76% of the domains are on the Tranco list.
41.31% of the domains are on the Tranco list.


# Longest word Ratio

In [20]:
def check_ratio(list):
    ratios = []
    s = 0
    for i in list:
        original_len = len(i)
        longest_element = max([len(x) for x in wordninja.split(i)])
        ratio = longest_element/original_len
        s += ratio
        ratios.append(ratio)
    print("The average longest word "
          "ratio is {:.2f}".format(s/(len(list))))
    return ratios

In [21]:
malicious_ratio = check_ratio(malicious_list)
whitelist_ratio = check_ratio(whitelist_list)
unlabeled_ratio = check_ratio(unlabeled_list)

The average longest word ratio is 0.35
The average longest word ratio is 0.25
The average longest word ratio is 0.27


# Typosquatting

In [71]:
def generate_doms(keys):
    fuzz_doms = []
    fake_doms = []
    # generate twisted domains from the provided keywords
    
    for i in keys:
        i += '.com' # add a random suffix to pass through the fuzz generator
        fuzz = dnstwist.DomainFuzz(i)
        fuzz.generate()
        fuzz_doms += fuzz.domains
        
    dic = {}
    
    for i in fuzz_doms:
        key = i['fuzzer']
        value = i['domain-name']
        if key in dic:
            if not isinstance(dic[key], list): 
                dic[key] = [dic[key]]
            dic[key].append(value)
        else:
            dic[key] = value
    
    # drop the keys that are not relevent enough
    # original keys: original*, addition, bitsquatting, homoglyph, hyphenation, insertion, 
    # repetition, replacement, transposition, various, vowel-swap
    [dic.pop(x, None) for x in ['original*', 'subdomain', 'addition']]
            
    for key in dic.keys():
        if isinstance(dic[key], list):
            for l in dic[key]:
                _, dom, _ = extract(l)
                fake_doms.append(dom)
        else:
            _, dom, _ = extract(key)
            fake_doms.append(dom)
            
                    
        # fuzz_doms += fuzz.domains[1:] # drop the first result in list as it is the original keyword
    for n in keys:
        for k in fake_doms:
            if k in n:
                fake_doms.remove(k)
    
    #fake_doms.remove('covida') # portuguese of covid
    
    return fake_doms

def check_typo(twists, array):
    typo_doms = []
    for i in array:
        temp_mark = 0
        for j in twists:
            if (len(j) > 3) and (j in i):
                temp_mark += 1 
        #        print(i, j)
        if temp_mark > 0:
            typo_doms.append(1)
        else:
            typo_doms.append(0)
    print("Detected {:.2f}% of domains with "
          "typosquatting.".format((typo_doms.count(1)*100/len(typo_doms))))
    return typo_doms

In [68]:
keywords = ['corona', 'covid' , 'wuhan', 'ncov-19', 'coronavirus',
            'virus', 'covid-19', 'covid19', 'wuhanvirus',
            'novelvirus']

fake_doms =  generate_doms(keywords)

In [72]:
whitelist_typo = check_typo(fake_doms, whitelist_list)
malicious_typo = check_typo(fake_doms, malicious_list)
unlabeled_typo = check_typo(fake_doms, unlabeled_list)

Detected 1.27% of domains with typosquatting.
Detected 4.71% of domains with typosquatting.
Detected 1.99% of domains with typosquatting.


# Freenom TLD

In [23]:
def check_freenom(list):
    freenoms = []
    for i in list:
        if i[-2:] == 'ml':
            freenoms.append(1)
        elif i[-2:] == 'cf':
            freenoms.append(1)
        elif i[-2:] == 'gq':
            freenoms.append(1)
        elif i[-2:] == 'tk':
            freenoms.append(1)
        elif i[-2:] == 'ga':
            freenoms.append(1)
        else:
            freenoms.append(0)
    print("{:.2f}% of domains have a "
          "freenom TLD.".format((freenoms.count(1)*100/len(freenoms))))
    return freenoms

In [24]:
malicious_freenom = check_freenom(malicious_list)
whitelist_freenom = check_freenom(whitelist_list)
unlabeled_freenom = check_freenom(unlabeled_list)

2.28% of domains have a freenom TLD.
0.00% of domains have a freenom TLD.
0.31% of domains have a freenom TLD.


# Numbers other than 19

In [25]:
def check_num(list):
    nums = []
    for i in list:
        num = re.findall(r'\d', i)
        if "19" in i:
            if len(num) > 2:
                nums.append(1)
            else:
                nums.append(0)
        elif len(num):
            nums.append(1)
        else:
            nums.append(0)
    print("{:.2f}% of domains contain "
          "numbers different than 19".format((nums.count(1)*100/len(nums))))
    return nums

In [26]:
malicious_num = check_num(malicious_list)
whitelist_num = check_num(whitelist_list)
unlabeled_num = check_num(unlabeled_list)

7.51% of domains contain numbers different than 19
7.18% of domains contain numbers different than 19
28.54% of domains contain numbers different than 19


# Number of subdomains levels

In [96]:
def check_subdom(list):
    subdoms = []
    s = 0
    for i in list:
        subdom, _ , _ = extract(i)
        if subdom:
            subdoms.append(subdom.count('.') + 1)
            s += subdom.count('.') + 1
        else:
            subdoms.append(0)
    print("The average subdomain level is {:.2f}".format(s/(len(list))))
    return subdoms

In [97]:
malicious_levels = check_subdom(malicious_list)
whitelist_levels = check_subdom(whitelist_list)
unlabeled_levels = check_subdom(unlabeled_list)

The average subdomain level is 0.01
The average subdomain level is 1.14
The average subdomain level is 1.66


# Combining

In [106]:
whitelist_label = np.zeros((len(whitelist_list),), dtype=int)
malicious_label = np.ones((len(malicious_list),), dtype=int)

In [107]:
whitelist_dict = {"Domain":whitelist_list,"Num_words": whitelist_num_words,
                  "Num_chars": whitelist_num_chars, "Hyphen": whitelist_hyphen,
                  "Entropy_sdsu": whitelist_entropy, "Entropy_nosdsu": whitelist_entropy_nosd,
                  "Entropy_nosu": whitelist_entropy1, "Tranco_Rank": whitelist_rank,
                   "Longest_word_ratio": whitelist_ratio, "Typos": whitelist_typo,
                   "Freenom_TLD": whitelist_freenom, "Other_numbers": whitelist_num,
                   "Subdomain levels": whitelist_levels, "Label": whitelist_label}
whitelist_df = pd.DataFrame(whitelist_dict)

In [108]:
malicious_dict = {"Domain":malicious_list,"Num_words": malicious_num_words,
                  "Num_chars": malicious_num_chars, "Hyphen": malicious_hyphen,
                  "Entropy_sdsu": malicious_entropy, "Entropy_nosdsu": malicious_entropy_nosd,
                  "Entropy_nosu": malicious_entropy1, "Tranco_Rank": malicious_rank,
                   "Longest_word_ratio": malicious_ratio, "Typos": malicious_typo,
                   "Freenom_TLD": malicious_freenom, "Other_numbers": malicious_num,
                   "Subdomain levels": malicious_levels, "Label": malicious_label}
malicious_df = pd.DataFrame(malicious_dict)

In [101]:
unlabeled_dict = {"Domain":unlabeled_list,"Num_words": unlabeled_num_words,
                  "Num_chars": unlabeled_num_chars, "Hyphen": unlabeled_hyphen,
                  "Entropy_sdsu": unlabeled_entropy, "Entropy_nosdsu": unlabeled_entropy_nosd,
                  "Entropy_nosu": unlabeled_entropy1, "Tranco_Rank": unlabeled_rank,
                   "Longest_word_ratio": unlabeled_ratio, "Typos": unlabeled_typo,
                   "Freenom_TLD": unlabeled_freenom, "Other_numbers": unlabeled_num,
                   "Subdomain levels": unlabeled_levels}
unlabeled_df = pd.DataFrame(unlabeled_dict)

In [109]:
final_dataset = pd.concat([whitelist_df, malicious_df]).reset_index(drop=True)

In [116]:
final_dataset = final_dataset.sample(frac=1).reset_index(drop=True)

In [117]:
whitelist_df.to_csv('final_whitelist.csv', index=False,header=True)
malicious_df.to_csv('final_malicious.csv', index=False,header=True)
unlabeled_df.to_csv('final_unlabeled.csv', index=False,header=True)
final_dataset.to_csv('final_dataset.csv', index=False,header=True)