In [None]:
# IPython automagic: change into the project directory (relative path); later cells read their inputs from data/ beneath it
cd Desktop/data_hacking/dga_detection/

In [None]:
# IPython automagic: list the working directory contents
ls

In [None]:
import sklearn.feature_extraction
import pandas as pd
import matplotlib.pylab as plt

In [None]:
# Default figure size and grid lines for every plot in this notebook
plt.rcParams.update({
    'figure.figsize': (14.0, 5.0),
    'axes.grid': True,
})

In [None]:
# Load the legitimate domains; the CSV has no header row, so name the two columns explicitly
alexa_dataframe = pd.read_csv(
    'data/alexa_100k.csv',
    header=None,
    names=['rank', 'url'],
    encoding='utf-8',
)
# Preview the first rows of what was read
alexa_dataframe.head()

In [None]:
#提取域名
import tldextract
import numpy as np

def domain_extract(url):
    """Return the ``domain`` component tldextract reports for ``url``.

    Yields ``np.nan`` when tldextract finds no suffix for the input, so such
    rows can be removed later with ``dropna()``.
    """
    parts = tldextract.extract(url)
    return parts.domain if parts.suffix else np.nan

# Derive the bare domain for every URL, then keep only that column.
alexa_dataframe['domain'] = alexa_dataframe['url'].map(domain_extract)
alexa_dataframe = alexa_dataframe.drop(['rank', 'url'], axis=1)
# Only the last expression of a cell is displayed, so the former mid-cell
# count() call (whose result was discarded) has been removed.
alexa_dataframe.head()

In [None]:
# Inspect the last rows of the extracted domains
alexa_dataframe.tail()

In [None]:
# Number of missing values per column: sum() adds up the True cells of the
# boolean mask, whereas count() would merely report the total number of rows.
alexa_dataframe.isnull().sum()

In [None]:
# Many rows came back as NaN, so discard them; duplicate domains add nothing
# to training, so discard those as well.
alexa_dataframe = alexa_dataframe.dropna().drop_duplicates()

In [None]:
# Non-null row count per column after cleaning
alexa_dataframe.count()

In [None]:
# Label every remaining Alexa domain as legitimate
alexa_dataframe['class'] = 'legit'
# Shuffle the rows. A dedicated, fixed-seed RandomState makes the shuffle (and
# therefore the hold-out split below) reproducible across kernel restarts
# without touching NumPy's global random state.
alexa_dataframe = alexa_dataframe.reindex(
    np.random.RandomState(42).permutation(alexa_dataframe.index))
alexa_totol = alexa_dataframe.shape[0]
print("合法域名总数 %d" % alexa_totol)

# Set the last 10% aside as a hold-out set for a final test and keep the
# first 90% as the working set.
hole_out_alexa = alexa_dataframe[int(alexa_totol*.9):]
alexa_dataframe = alexa_dataframe[:int(alexa_totol*.9)]
print("使用的合法域名总数 %d" % alexa_dataframe.shape[0])

In [None]:
# Preview the shuffled, labelled working set
alexa_dataframe.head()

In [None]:
# Load the DGA domains: one raw entry per line, no header row
dga_dataframe = pd.read_csv(
    'data/dga_domains.txt',
    header=None,
    names=['raw_domain'],
    encoding='utf-8',
)
dga_dataframe.head()

In [None]:
# Trial run: apply the tldextract-based extractor to the raw DGA entries and
# compare how many rows survive against the number of raw rows.
test = pd.DataFrame([domain_extract(raw) for raw in dga_dataframe['raw_domain']])
print(dga_dataframe.count())
test.count()
# Author's observation: tldextract performs very poorly on the DGA entries,
# whereas the domain label can apparently be obtained by splitting on '.'.

In [None]:
# Take the first dot-separated label of every raw entry, lower-cased.
# The vectorised .str accessor operates on the single source column and
# propagates missing values as NaN instead of raising on them, so the
# dropna() below really does remove empty rows.
dga_dataframe['domain'] = dga_dataframe['raw_domain'].str.split('.').str[0].str.lower()
del dga_dataframe['raw_domain']
# Remove missing and duplicated domains
dga_dataframe = dga_dataframe.dropna()
dga_dataframe = dga_dataframe.drop_duplicates()
dga_totol = dga_dataframe.shape[0]
print("dga 域名总数为 %d" % dga_totol)

# Label every row as DGA
dga_dataframe['class'] = 'dga'
# Set the last 10% aside as a hold-out set and keep the first 90%.
# NOTE(review): these rows are not shuffled before the split; if the input
# file is ordered (e.g. by malware family) the hold-out may not be
# representative - confirm whether that is intended.
hold_out_dga = dga_dataframe[int(dga_totol*0.9):]
dga_dataframe = dga_dataframe[:int(dga_totol*0.9)]
print("使用的域名总数为 %d" % dga_dataframe.shape[0])


In [None]:
# Preview the labelled DGA working set
dga_dataframe.head()

In [None]:
# Stack the legitimate and DGA working sets into one frame with a fresh 0..n-1 index
all_domains = pd.concat(
    [alexa_dataframe, dga_dataframe],
    ignore_index=True,
)
print(all_domains.head())
all_domains.count()

In [None]:
# Legitimate and DGA domains visibly differ in length, so use it as a feature
all_domains['length'] = all_domains['domain'].map(len)

In [None]:
import math
from collections import Counter
def entropy(x):
    """Return the Shannon entropy, in bits, of the items in ``x``.

    ``x`` is any sized iterable of hashable items (here: the characters of a
    domain string). An empty input yields 0.
    """
    size = float(len(x))
    # Relative frequency of each distinct item
    probabilities = [occurrences / size for occurrences in Counter(x).values()]
    return -sum(p * math.log(p, 2) for p in probabilities)
# Likewise, use each domain's character entropy as a feature
all_domains['entropy'] = all_domains['domain'].map(entropy)
all_domains.head()

In [None]:
# Inspect the last rows (with the length and entropy columns added)
all_domains.tail()

In [None]:
##################开始绘图

In [None]:
# Box plots showing how each feature is distributed within each class
import matplotlib.pylab as plt

all_domains.boxplot(column='length', by='class')
plt.ylabel('length')

all_domains.boxplot(column='entropy', by='class')
plt.ylabel('entropy')

plt.show()

In [None]:
# Scatter of domain length against entropy, one colour per class
cond = all_domains['class'] == 'dga'
dga = all_domains[cond]
alexa = all_domains[~cond]
plt.scatter(alexa['length'],alexa['entropy'],s=140,c='#aaaaff',label='Alexa',alpha=.2)
plt.scatter(dga['length'],dga['entropy'],s=40,c='r',label='dga',alpha=.3)
# The series labels are only rendered when a legend is requested
plt.legend()
plt.xlabel('domain length')
plt.ylabel('domain entropy')
plt.show()

In [None]:
# Feature matrix X. Column selection plus .values replaces
# DataFrame.as_matrix, which has been removed from current pandas releases.
X = all_domains[['length','entropy']].values
# Target vector y: the class label of every row
y= np.array(all_domains['class'].tolist())

In [None]:
import sklearn.ensemble
# Random forest with 20 trees; this one estimator object is reused by the cross-validation and fit cells below
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=20)

In [None]:
# cross_val_score lives in sklearn.model_selection in current scikit-learn;
# fall back to the legacy sklearn.cross_validation module on old installs.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
# 5-fold cross-validation, evaluated on 4 parallel workers
scores = cross_val_score(clf,X,y,cv=5,n_jobs=4)
print(scores)
# Author's observation: roughly 97.5% accuracy, which looks pleasing at first glance

In [None]:
# train_test_split lives in sklearn.model_selection in current scikit-learn;
# fall back to the legacy sklearn.cross_validation module on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# 80/20 split so the predictions can be examined with a confusion matrix
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Fix the row/column order of the matrix: legit first, dga second
labels = ['legit','dga']
# Pass labels by keyword: newer scikit-learn releases reject it positionally
cm = confusion_matrix(y_test,y_pred,labels=labels)
print("legit     dga")
print(cm)

In [None]:
## Author's observation: almost every domain ends up classified as legit, so DGA detection is very poor
## Try adding more features to see whether that improves the result
## Build features through text analysis, starting with a character-level count vectorizer
# NOTE(review): no ngram_range is passed here, whereas dict_vc below uses ngram_range=(3,5);
# with the library default this presumably counts single characters only - confirm that is intended.
alexa_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer = 'char')
alexa_vc

In [None]:
################### NGrams
# Learn the vocabulary from the legitimate domains and count occurrences per domain
counts_matrix = alexa_vc.fit_transform(alexa_dataframe['domain'])
# Total occurrences of each vocabulary entry over all domains, flattened to 1-D, on a log10 scale
alexa_counts = np.log10(np.asarray(counts_matrix.sum(axis=0)).ravel())
ngrams_list = alexa_vc.get_feature_names()

In [None]:
# Build the dictionary word list.
# The built-in str replaces np.str, which was only an alias for it and has
# been removed from current NumPy releases.
word_dataframe = pd.read_csv('data/words.txt',names=['word'],header=None,dtype={'word':str},encoding='utf-8')
################# Heads-up ##############
# The alphabetic-only filter below is currently disabled:
#word_dataframe = word_dataframe[word_dataframe['word'].map(lambda x: str(x).isalpha())]
# Normalise every entry to a lower-case string
word_dataframe = word_dataframe.applymap(lambda x: str(x).lower())
word_dataframe.head()

In [None]:
# Character 3- to 5-gram vectorizer fitted on the dictionary words
dict_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-5, max_df=1.0)
counts_matrix = dict_vc.fit_transform(word_dataframe['word'])
# Total occurrences of each n-gram over all words, on a log10 scale
dict_counts = np.log10(counts_matrix.sum(axis=0).getA1())
ngrams_list = dict_vc.get_feature_names()
# Show only a sample: echoing the complete vocabulary floods the cell output
ngrams_list[:20]

In [None]:
def ngram_count(domain):
    """Print the Alexa and dictionary match scores of a single domain.

    Each score is the product of the log10 count vector with the domain's
    transformed count vector from the corresponding vectorizer.
    """
    sample = [domain]
    alexa_score = alexa_counts * alexa_vc.transform(sample).T
    dict_score = dict_counts * dict_vc.transform(sample).T
    message = '%s Alexa match %d Dict match: %d' % (domain, alexa_score, dict_score)
    print(message)
# Try the scorer on a few hand-picked examples
for sample_domain in (
    'google',
    'facebook',
    '1cb8a5f36f',
    'pterodactylfarts',
    'ptes9dro-dwacty2lfa5rrts',
    'beyonce',
    'bey666on4ce',
):
    ngram_count(sample_domain)

In [None]:
# Score every domain against the Alexa vocabulary: log10 counts multiplied by the transposed per-domain count matrix
all_domains['alexa_grams'] = alexa_counts * alexa_vc.transform(all_domains['domain']).T
# Same scoring against the dictionary n-gram vocabulary
all_domains['word_grams'] = dict_counts * dict_vc.transform(all_domains['domain']).T    
all_domains.head()

In [None]:
# Inspect the last rows (with the two n-gram score columns added)
all_domains.tail()

In [None]:
# Difference between the two n-gram scores, to surface domains where they disagree
all_domains['diff'] = all_domains['alexa_grams'].sub(all_domains['word_grams'])
# Ten rows with the smallest difference
all_domains.sort_values(by='diff').head(10)

In [None]:
# Thirty rows with the largest difference between the two scores
all_domains.sort_values(['diff'], ascending=False).head(30)
# Author's observation: the difference separates out the DGA domains nicely

In [None]:
###############绘图时间


In [None]:
# Entropy against Alexa n-gram score, one colour per class
cond = all_domains['class'].eq('dga')
dga, legit = all_domains[cond], all_domains[~cond]
plt.scatter(x=legit['entropy'], y=legit['alexa_grams'],
            s=120, c='#aaaaff', alpha=.2, label='Alexa')
plt.scatter(x=dga['entropy'], y=dga['alexa_grams'],
            s=40, c='r', alpha=.3, label='DGA')
plt.legend()
plt.xlabel('Domain Entropy')
plt.ylabel('Alexa Gram Matches')
plt.show()

In [None]:
# Length against Alexa n-gram score, one colour per class
cond = all_domains['class'].eq('dga')
dga, legit = all_domains[cond], all_domains[~cond]
plt.scatter(x=legit['length'], y=legit['alexa_grams'],
            s=120, c='#aaaaff', alpha=.1, label='Alexa')
plt.scatter(x=dga['length'], y=dga['alexa_grams'],
            s=40, c='r', alpha=.3, label='DGA')
plt.legend()
plt.xlabel('Domain Length')
plt.ylabel('Alexa NGram Matches')
plt.show()

In [None]:
# Dictionary n-gram score against Alexa n-gram score, one colour per class
cond = all_domains['class'] == 'dga'
dga = all_domains[cond]
legit = all_domains[~cond]
plt.scatter(legit['word_grams'], legit['alexa_grams'], s=120, c='#aaaaff', label='Alexa', alpha=.1)
plt.scatter(dga['word_grams'], dga['alexa_grams'], s=40, c='r', label='DGA', alpha=.3)
plt.legend()
# The x axis shows the word_grams column, so label it as such (it was
# mislabelled as the domain length).
plt.xlabel('Dictionary NGram Matches')
plt.ylabel('Alexa NGram Matches')
plt.show()

In [None]:
# Domains whose dictionary n-gram score is exactly zero.
# This must be the last expression of the cell to be displayed; the leftover
# debug print that used to follow it suppressed the table.
all_domains[(all_domains['word_grams']==0)].head()

In [None]:
# Retrain with the four engineered features and check whether the confusion
# matrix improves. Column selection plus .values replaces the removed
# DataFrame.as_matrix.
X = all_domains[['length','entropy','alexa_grams','word_grams']].values
# The target must be the class label. Using the 'domain' column here made
# the classifier try to predict the domain string itself, so the confusion
# matrix over ['legit', 'dga'] was meaningless.
y = np.array(all_domains['class'].tolist())

# train_test_split lives in sklearn.model_selection in current scikit-learn;
# fall back to the legacy sklearn.cross_validation module on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# 80/20 split, refit the shared classifier and predict on the test part
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Fix the row/column order of the matrix: legit first, dga second
labels = ['legit','dga']
# Pass labels by keyword: newer scikit-learn releases reject it positionally
cm = confusion_matrix(y_test,y_pred,labels=labels)
cm