<a href="https://colab.research.google.com/github/zoeyyyzou/VisualStudio/blob/master/cctx_pcap_analyser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive. The data loaders in this notebook open
# "drive/MyDrive/..." as relative paths, so they only resolve against this
# mount while the working directory is /content.
from google.colab import drive
drive.mount('/content/drive')

# URL Engine

> 参考文章：[Machine Learning, NLP: Text Classification using scikit-learn, python and NLTK.](https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a)
> 
> 参考代码地址：[javedsha/text-classification](https://github.com/javedsha/text-classification/blob/master/Text%2BClassification%2Busing%2Bpython%2C%2Bscikit%2Band%2Bnltk.ipynb)

## 1. 采用的数据集
1. 安全URL数据集取自 [ISCX-URL2016 datasets](https://www.unb.ca/cic/datasets/url-2016.html)
    - 总共有 35378 条 URL
    - 前 30000 条 URL 作为训练集
    - 后 5378 条 URL 作为测试集

2. 危险URL数据集取自 CCTX's URI observables
    - 总共有 39687 条 URL
    - 前 30000 条 URL 作为训练集
    - 后 9687 条 URL 作为测试集

In [None]:
from sklearn.utils import Bunch
import csv

def getSafeUris(path="drive/MyDrive/20project/safe_uri.csv"):
    """
    Load the list of safe (benign) URLs.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to the project's safe-URL file under the
        mounted Google Drive folder.

    Returns
    -------
    list of str
        The first column of every non-empty row, in file order.
    """
    # newline="" is what the csv module asks for, so that quoted fields
    # containing line breaks are parsed correctly.
    with open(path, newline="") as file:
        # csv.reader yields [] for blank lines; skip them instead of failing
        # with an IndexError on item[0].
        return [item[0] for item in csv.reader(file) if item]

def getDangerousUris(path="drive/MyDrive/20project/dangerous_uri.csv"):
    """
    Load the list of dangerous URLs provided by CCTX.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to the project's dangerous-URL file under
        the mounted Google Drive folder.

    Returns
    -------
    list of str
        The first column of every non-empty row, in file order.
    """
    # newline="" is what the csv module asks for, so that quoted fields
    # containing line breaks are parsed correctly.
    with open(path, newline="") as file:
        # csv.reader yields [] for blank lines; skip them instead of failing
        # with an IndexError on item[0].
        return [item[0] for item in csv.reader(file) if item]

def buildBunch(train_size=30000):
    """
    Build the URL training and test datasets.

    The first ``train_size`` safe URLs and the first ``train_size`` dangerous
    URLs form the training set; everything after them forms the test set.
    Label 0 corresponds to "Safe" and label 1 to "Dangerous".

    Parameters
    ----------
    train_size : int, optional
        Number of URLs taken from the head of each list for training.

    Returns
    -------
    tuple of (Bunch, Bunch)
        ``(trainBunch, testBunch)``; each has ``DESCR``, ``target_names``,
        ``data`` and ``target``.
    """
    safeUris = getSafeUris()
    print("Safe uri counts: ", len(safeUris))
    dangerousUris = getDangerousUris()
    print("Dangerous uri counts: ", len(dangerousUris))

    safeTrain, safeTest = safeUris[:train_size], safeUris[train_size:]
    dangerousTrain, dangerousTest = dangerousUris[:train_size], dangerousUris[train_size:]

    # Training set. Labels are sized from the actual slices, so data and target
    # stay aligned even when a list holds fewer than train_size entries.
    trainBunch = Bunch()
    trainBunch.DESCR = "This is a url train datasets for CCTX to build a AI engine to judge a url is safe"
    trainBunch.target_names = ["Safe", "Dangerous"]
    trainBunch.data = safeTrain + dangerousTrain
    trainBunch.target = [0] * len(safeTrain) + [1] * len(dangerousTrain)

    # Test set: whatever remains after the training slices.
    testBunch = Bunch()
    testBunch.DESCR = "This is a url test datasets for CCTX to build a AI engine to judge a url is safe"
    testBunch.target_names = ["Safe", "Dangerous"]
    testBunch.data = safeTest + dangerousTest
    testBunch.target = [0] * len(safeTest) + [1] * len(dangerousTest)
    return trainBunch, testBunch

# Build the training and test sets first
trainBunch, testBunch = buildBunch()

FileNotFoundError: ignored

In [None]:
# Extract n-gram count features from the URLs of the training set
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(ngram_range=(1,3))
analyser = count_vect.build_analyzer()
# Show how one sample URL is split into word 1-grams, 2-grams and 3-grams
print(analyser("http://www.artisanstradingcompany.com/sa/?ON901N2p=RdgT1aLjDnhdMeiuJ/6rIDjQOP2LVUvx47adxjmuI/rjhFPW6aYj6aqCXCPH7j5ZW+2qnsVwzKk=&zVeH=PbytEF&sql=1"))
X_train_counts = count_vect.fit_transform(trainBunch.data)
# Displayed as the cell result: (number of URLs, number of distinct n-grams)
X_train_counts.shape

['http', 'www', 'artisanstradingcompany', 'com', 'sa', 'on901n2p', 'rdgt1aljdnhdmeiuj', '6ridjqop2lvuvx47adxjmui', 'rjhfpw6ayj6aqcxcph7j5zw', '2qnsvwzkk', 'zveh', 'pbytef', 'sql', 'http www', 'www artisanstradingcompany', 'artisanstradingcompany com', 'com sa', 'sa on901n2p', 'on901n2p rdgt1aljdnhdmeiuj', 'rdgt1aljdnhdmeiuj 6ridjqop2lvuvx47adxjmui', '6ridjqop2lvuvx47adxjmui rjhfpw6ayj6aqcxcph7j5zw', 'rjhfpw6ayj6aqcxcph7j5zw 2qnsvwzkk', '2qnsvwzkk zveh', 'zveh pbytef', 'pbytef sql', 'http www artisanstradingcompany', 'www artisanstradingcompany com', 'artisanstradingcompany com sa', 'com sa on901n2p', 'sa on901n2p rdgt1aljdnhdmeiuj', 'on901n2p rdgt1aljdnhdmeiuj 6ridjqop2lvuvx47adxjmui', 'rdgt1aljdnhdmeiuj 6ridjqop2lvuvx47adxjmui rjhfpw6ayj6aqcxcph7j5zw', '6ridjqop2lvuvx47adxjmui rjhfpw6ayj6aqcxcph7j5zw 2qnsvwzkk', 'rjhfpw6ayj6aqcxcph7j5zw 2qnsvwzkk zveh', '2qnsvwzkk zveh pbytef', 'zveh pbytef sql']


(60000, 668044)

In [None]:
# TF-IDF: re-weight the raw n-gram counts computed in the previous cell
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Displayed as the cell result; the shape is unchanged by the re-weighting
X_train_tfidf.shape

(60000, 668044)

In [None]:
# Train a multinomial naive Bayes model on the TF-IDF features
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, trainBunch.target)

In [None]:
# Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows:
# The names 'vect', 'tfidf' and 'clf' are arbitrary but will be used later.
# We will be using the 'text_clf' going forward.
# Train the model with the naive Bayes algorithm
from sklearn.pipeline import Pipeline

# Use a Pipeline to run the three steps above (count, TF-IDF, classify) as one.
# Note: CountVectorizer() here uses its default settings, not the
# ngram_range=(1,3) of the stand-alone vectorizer built earlier.
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

text_clf = text_clf.fit(trainBunch.data, trainBunch.target)

In [None]:
# Performance of NB Classifier
import numpy as np
# Accuracy = fraction of predictions equal to the true label.
# First message: accuracy on the test set.
predicted = text_clf.predict(testBunch.data)
print(f"朴素贝叶斯模型，测试集准确度: {np.mean(predicted == testBunch.target)}")

# Second message: accuracy on the training set (same pipeline, training data).
predicted = text_clf.predict(trainBunch.data)
print(f"朴素贝叶斯模型，训练集准确度: {np.mean(predicted == trainBunch.target)}")

朴素贝叶斯模型，测试集准确度: 0.9737139064055759
朴素贝叶斯模型，训练集准确度: 0.994


In [None]:
# Training Support Vector Machines - SVM and calculating its performance
# Train the model with a support vector machine (SGDClassifier with hinge loss)
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=5, random_state=42))])

text_clf_svm = text_clf_svm.fit(trainBunch.data, trainBunch.target)
# First message: accuracy on the test set.
predicted_svm = text_clf_svm.predict(testBunch.data)
print(f"支持向量机模型，测试集准确度: {np.mean(predicted_svm == testBunch.target)}")

# Second message: accuracy on the training set.
predicted_svm = text_clf_svm.predict(trainBunch.data)
print(f"支持向量机模型，训练集准确度: {np.mean(predicted_svm == trainBunch.target)}")



支持向量机模型，测试集准确度: 0.9814802522402921
支持向量机模型，训练集准确度: 0.9756666666666667


In [None]:
# Grid Search
# Here, we are creating a list of parameters for which we would like to do performance tuning.
# All the parameters name start with the pipeline step name (remember the arbitrary name we gave).
# E.g. vect__ngram_range; here we try unigrams only versus 1- to 4-grams and keep the better one.

# Use grid search to look for the best parameters for the naive Bayes pipeline
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 4)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3)}


# Next, we create an instance of the grid search by passing the classifier, parameters
# and n_jobs=-1 which tells to use multiple cores from user machine.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(trainBunch.data, trainBunch.target)

# Print the best mean score and the params. A bare expression is only shown
# when it is the last line of a cell, so the score used to be dropped silently.
print(gs_clf.best_score_)
print(gs_clf.best_params_)

{'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 4)}

In [None]:
# Retrain naive Bayes with the best parameters found by the grid search above
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 4))), 
                     ('tfidf', TfidfTransformer(use_idf=False)), 
                     ('clf', MultinomialNB(alpha=0.01))])

text_clf = text_clf.fit(trainBunch.data, trainBunch.target)
# First message: accuracy on the test set.
predicted = text_clf.predict(testBunch.data)
print(f"朴素贝叶斯模型，测试集准确度: {np.mean(predicted == testBunch.target)}")

# Second message: accuracy on the training set.
predicted = text_clf.predict(trainBunch.data)
print(f"朴素贝叶斯模型，训练集准确度: {np.mean(predicted == trainBunch.target)}")

朴素贝叶斯模型，测试集准确度: 0.9912379688018587
朴素贝叶斯模型，训练集准确度: 1.0


In [None]:
# Classify one ad-hoc URL; label 0 is "Safe" and label 1 is "Dangerous"
text_clf.predict(["https://blog.csdn.net/kancy110/article/details/73715739"])

array([0])

# Domain Engine
> 参考文章：[Machine Learning, NLP: Text Classification using scikit-learn, python and NLTK.](https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a)
> 
> 参考代码地址：[javedsha/text-classification](https://github.com/javedsha/text-classification/blob/master/Text%2BClassification%2Busing%2Bpython%2C%2Bscikit%2Band%2Bnltk.ipynb)

## 1. 采用的数据集
1. 安全 Domain 数据集取自 [alexa static top 1-m](http://s3.amazonaws.com/alexa-static/top-1m.csv.zip)
    - 总共有大约 61 万个 Domain，为了样本数量一致，我们取出其中的 74700 个域名
    - 前 60000 个 Domain 作为训练集
    - 后 14700 个 Domain 作为测试集

2. 危险 Domain 数据集取自 CCTX's domain observables
    - 总共有 74700 个 Domain
    - 前 60000 个 Domain 作为训练集
    - 后 14700 个 Domain 作为测试集

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

texts=["dog cat fish","dog cat cat","fish bird", 'bird'] # each list element is one document, given as a string
cv = CountVectorizer() # bag-of-words vectorizer
cv_fit=cv.fit_transform(texts)
# The call above is equivalent to these two lines:
#cv.fit(texts)
#cv_fit=cv.transform(texts)

# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on releases that lack it.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()
print(feature_names)    # ['bird', 'cat', 'dog', 'fish'] - the vocabulary as a list, in column order

print(cv.vocabulary_)   # {'dog':2,'cat':1,'fish':3,'bird':0} - key: term, value: column index (not a frequency)

print(cv_fit)           # sparse form: one "(document, column)  count" line per non-zero entry

print(cv_fit.toarray()) # .toarray() converts the sparse matrix into a dense array

print(cv_fit.toarray().sum(axis=0))  # total count of each term over all documents

['bird', 'cat', 'dog', 'fish']
{'dog': 2, 'cat': 1, 'fish': 3, 'bird': 0}
  (0, 2)	1
  (0, 1)	1
  (0, 3)	1
  (1, 2)	1
  (1, 1)	2
  (2, 3)	1
  (2, 0)	1
  (3, 0)	1
[[0 1 1 1]
 [0 2 1 0]
 [1 0 0 1]
 [1 0 0 0]]
[2 3 2 2]


3.327819531114783
0.375


In [None]:
from sklearn.utils import Bunch
import random
import csv
from sklearn import model_selection

def getSafeUris(path="drive/MyDrive/20project/safe_domain.csv"):
    """
    Load the list of safe (benign) domains.

    Despite the name, this version returns domains: it redefines the URL
    loader of the same name declared earlier in the notebook.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to the project's safe-domain file under the
        mounted Google Drive folder.

    Returns
    -------
    list of str
        The first column of every non-empty row, in file order.
    """
    # newline="" is what the csv module asks for, so that quoted fields
    # containing line breaks are parsed correctly.
    with open(path, newline="") as file:
        # csv.reader yields [] for blank lines; skip them instead of failing
        # with an IndexError on item[0].
        return [item[0] for item in csv.reader(file) if item]

def getDangerousUris(path="drive/MyDrive/20project/dangerous_domain.csv"):
    """
    Load the list of dangerous domains provided by CCTX.

    Despite the name, this version returns domains: it redefines the URL
    loader of the same name declared earlier in the notebook.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to the project's dangerous-domain file under
        the mounted Google Drive folder.

    Returns
    -------
    list of str
        The first column of every non-empty row, in file order.
    """
    # newline="" is what the csv module asks for, so that quoted fields
    # containing line breaks are parsed correctly.
    with open(path, newline="") as file:
        # csv.reader yields [] for blank lines; skip them instead of failing
        # with an IndexError on item[0].
        return [item[0] for item in csv.reader(file) if item]

def buildBunch():
    """
    Load the safe and dangerous domain lists and split them into train/test sets.

    All domains are pooled (label 0 = safe, label 1 = dangerous) and split once
    with ``train_test_split(test_size=0.2, random_state=0)``.

    Returns
    -------
    tuple of (Bunch, Bunch)
        ``(trainBunch, testBunch)``; each has ``DESCR``, ``target_names``,
        ``data`` and ``target``.
    """
    safeUris = getSafeUris()
    print("Safe domain counts: ", len(safeUris))
    dangerousUris = getDangerousUris()
    print("Dangerous domain counts: ", len(dangerousUris))

    # Pool both classes; the labels follow the same order as the samples.
    samples = safeUris + dangerousUris
    labels = [0] * len(safeUris) + [1] * len(dangerousUris)
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        samples, labels, test_size=0.2, random_state=0)

    # Training portion of the split
    trainBunch = Bunch(
        DESCR="This is a domain train datasets for CCTX to build a AI engine to judge a url is safe",
        target_names=["Safe", "Dangerous"],
        data=x_train,
        target=y_train,
    )

    # Held-out portion of the split
    testBunch = Bunch(
        DESCR="This is a domain test datasets for CCTX to build a AI engine to judge a url is safe",
        target_names=["Safe", "Dangerous"],
        data=x_test,
        target=y_test,
    )
    return trainBunch, testBunch

# Build the training and test sets first
trainBunch, testBunch = buildBunch()
# Show the size and a short preview only: printing every label floods the
# cell output and bloats the saved notebook.
print(len(trainBunch.target), trainBunch.target[:20])

Safe domain counts:  74555
Dangerous domain counts:  74555
[0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

def cal_entropy(data):
    """
    Compute the Shannon entropy (in bits) of the characters of a domain.

    Returns 0 when ``data`` is empty or otherwise falsy.
    """
    import math
    if not data:
        return 0

    total = len(data)
    bits = 0
    for symbol in set(data):
        # Relative frequency of this character within the domain
        prob = float(data.count(symbol)) / total
        if prob > 0:
            bits -= prob * math.log(prob, 2)
    return bits
# print(cal_entropy("drive.google.com"))
def cal_vowel(data):
    '''
    Compute the fraction of characters that are vowels (a, e, i, o, u).

    The match is case-insensitive. Returns 0 when ``data`` is empty or falsy.
    '''
    if not data:
        return 0
    vowels = frozenset("aeiou")
    hits = sum(1 for letter in data.lower() if letter in vowels)
    return float(hits / len(data))
# print(cal_vowel("drive.google.com"))
def cal_number(data):
    """
    Compute the fraction of characters that are ASCII digits (0-9).

    Returns 0 when ``data`` is empty or falsy.
    """
    if not data:
        return 0
    digits = frozenset("0123456789")
    hits = sum(1 for symbol in data if symbol in digits)
    return float(hits / len(data))
def cal_domain_len(data):
    """
    Compute the length of a domain.

    Returns 0 when ``data`` is empty or falsy (including ``None``), which is
    how the other feature helpers in this cell treat missing input.
    """
    if not data:
        return 0
    return len(data)
def cal_union_char_num(data):
    """
    Count the distinct characters that occur in a domain.

    Returns 0 when ``data`` is empty or falsy.
    """
    return len(set(data)) if data else 0

def cal_union_char_rate(data):
    """
    Compute the ratio of distinct characters to the total length of a domain.

    Returns 0 when ``data`` is empty or falsy.
    """
    if data:
        distinct = cal_union_char_num(data)
        return float(distinct / len(data))
    return 0


def getFeatures(datas):
    """
    Build the hand-crafted feature matrix for a collection of domains.

    Parameters
    ----------
    datas : iterable of str
        The domains to describe.

    Returns
    -------
    numpy.ndarray of float, shape (n_samples, 6)
        One row per domain. Columns, in order: entropy, vowel ratio, digit
        ratio, length, distinct-character count, distinct-character ratio.
        An empty input gives an array of shape (0, 6).
    """
    # One extractor per output column, in column order.
    extractors = (cal_entropy, cal_vowel, cal_number, cal_domain_len,
                  cal_union_char_num, cal_union_char_rate)
    rows = [[extract(domain) for extract in extractors] for domain in datas]
    # dtype=float keeps the result type independent of the data; reshape keeps
    # the six columns even when there are no rows.
    return np.array(rows, dtype=float).reshape(-1, len(extractors))

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
class DomainFeatureExtractor(TransformerMixin, BaseEstimator):
    """
    Turn domain strings into one sparse feature matrix.

    The matrix is the horizontal concatenation of
    (1) TF-IDF n-gram features from a TfidfVectorizer and
    (2) the hand-crafted features from getFeatures(), scaled with MinMaxScaler.

    Parameters
    ----------
    ngram_range : tuple
        Passed to TfidfVectorizer as ``ngram_range``.
    analyzer : str
        Passed to TfidfVectorizer as ``analyzer``.
    **params
        Any further keyword arguments for TfidfVectorizer.
        NOTE(review): these are not named constructor parameters, so they are
        presumably not reported by get_params() and not carried over by
        sklearn's clone() - confirm before relying on them in a grid search.
    """

    def __init__(self, ngram_range: tuple, analyzer: str, **params):
        self.analyzer = analyzer
        self.ngram_range = ngram_range
        self.params = params
        self._build_estimators()

    def _build_estimators(self):
        """Create fresh inner estimators from the current parameter values."""
        self.tfidfVectorizer = TfidfVectorizer(ngram_range=self.ngram_range,
                                               analyzer=self.analyzer,
                                               **self.params)
        self.minMaxScaler = MinMaxScaler()

    @staticmethod
    def _combine(tfidfVect, customVect):
        """Stack the TF-IDF block and the dense custom block side by side (CSR)."""
        return sparse.hstack((tfidfVect, sparse.csr_matrix(customVect)), format="csr")

    def transform(self, X):
        """
        Parameters
        ----------
        X : list of domains

        Returns
        -------
        Xt : sparse matrix of shape (n_samples, n_features)
            Transformed data: TF-IDF columns followed by the scaled custom
            feature columns.
        """
        customVect = self.minMaxScaler.transform(getFeatures(X))
        tfidfVect = self.tfidfVectorizer.transform(X)
        return self._combine(tfidfVect, customVect)

    def fit_transform(self, raw_documents, y=None):
        """Fit both inner estimators on ``raw_documents`` and return the transformed matrix."""
        # Rebuild from the current attributes so values changed through
        # set_params() after construction (e.g. by GridSearchCV) take effect.
        self._build_estimators()
        tfidfVect = self.tfidfVectorizer.fit_transform(raw_documents)
        customVect = self.minMaxScaler.fit_transform(getFeatures(raw_documents))
        return self._combine(tfidfVect, customVect)

    def fit(self, X, y=None):
        """Fit both inner estimators on ``X`` and return ``self``."""
        # Fit only; do not build the stacked matrix just to throw it away.
        self._build_estimators()
        self.tfidfVectorizer.fit(X)
        self.minMaxScaler.fit(getFeatures(X))
        return self

# FIXME: unfinished draft. As written, these two statements are a SyntaxError
# that stops the whole cell from parsing, so they are disabled until completed.
# trainFeatures =
# with open("train")

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Domain classifier: TF-IDF character 2- to 4-grams plus the hand-crafted
# features from DomainFeatureExtractor, fed into a random forest.
# (MultinomialNB, a hinge-loss SGDClassifier and AdaBoostClassifier were the
# other final steps tried here.)
text_clf = Pipeline([('vect', DomainFeatureExtractor(ngram_range=(2, 4), analyzer='char')),
                     # Fixed seed so repeated runs build the same forest
                     ('clfrfc', RandomForestClassifier(random_state=42))
                     ])

# text_clf = text_clf.fit(trainBunch.data, trainBunch.target)
# predicted = text_clf.predict(testBunch.data)
# print(f"随机森林模型，测试集准确度: {np.mean(predicted == testBunch.target)}")

# predicted = text_clf.predict(trainBunch.data)
# print(f"随机森林模型，训练集准确度: {np.mean(predicted == trainBunch.target)}")
# std_cps = MinMaxScaler().fit_transform(getFeatures(trainBunch.data))
# print(std_cps)

# from sklearn.feature_extraction.text import CountVectorizer
# count_vect = CountVectorizer(ngram_range=(2,2), analyzer='char')
# X_train_counts = count_vect.fit_transform(trainBunch.data)

# # TF-IDF
# from sklearn.feature_extraction.text import TfidfTransformer
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# X_train_features = np.hstack((X_train_tfidf.toarray(),std_cps))

# # 使用朴素贝叶斯算法训练模型
# from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB().fit(X_train_features, trainBunch.target)

# predicted = clf.predict(testBunch.data)
# print(f"朴素贝叶斯模型，测试集准确度: {np.mean(predicted == testBunch.target)}")

# predicted = clf.predict(trainBunch.data)
# print(f"朴素贝叶斯模型，训练集准确度: {np.mean(predicted == trainBunch.target)}")

In [None]:
# Tune the random forest model with grid search
from sklearn.model_selection import GridSearchCV

# Only the number of trees varies; the vectorizer settings are single-valued.
parameters = {'vect__ngram_range': [(2, 4)],
              'vect__analyzer': ['char'],
              'clfrfc__n_estimators': [300, 400, 500, 600]
              }

# Next, we create an instance of the grid search by passing the classifier, parameters
# and n_jobs=-1 which tells to use multiple cores from user machine.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, verbose=1)
gs_clf = gs_clf.fit(trainBunch.data, trainBunch.target)

# To see the best mean score and the params, run the following code
print(gs_clf.best_score_)
print(gs_clf.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


In [None]:

# Extract word n-gram count features from the domains of the training set
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(ngram_range=(1, 4), analyzer='word')
analyser = count_vect.build_analyzer()
# Show how one sample domain is split into word 1- to 4-grams
print(analyser("drive.google.com"))
X_train_counts = count_vect.fit_transform(trainBunch.data)
print(X_train_counts.shape)
#print(X_train_counts.toarray())

['drive', 'google', 'com', 'drive google', 'google com', 'drive google com']
(119288, 330478)


In [None]:
# TF-IDF: re-weight the raw n-gram counts computed in the previous cell
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Displayed as the cell result; the shape is unchanged by the re-weighting
X_train_tfidf.shape

(119288, 330478)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
import numpy as np
# Train naive Bayes on the domain data using the best parameters found earlier
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 4))), 
                     ('tfidf', TfidfTransformer(use_idf=False)), 
                     ('clf', MultinomialNB(alpha=0.01))])

text_clf = text_clf.fit(trainBunch.data, trainBunch.target)
# First message: accuracy on the test set.
predicted = text_clf.predict(testBunch.data)
print(f"朴素贝叶斯模型，测试集准确度: {np.mean(predicted == testBunch.target)}")

# Second message: accuracy on the training set.
predicted = text_clf.predict(trainBunch.data)
print(f"朴素贝叶斯模型，训练集准确度: {np.mean(predicted == trainBunch.target)}")

朴素贝叶斯模型，测试集准确度: 0.7995439608342834
朴素贝叶斯模型，训练集准确度: 0.9989940312520957


In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression model on the same word n-gram features.
# max_iter is raised above the default because the default stopped at its
# iteration limit before converging on this data.
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 4))),
                     ('tfidf', TfidfTransformer(use_idf=False)),
                     ('clf', LogisticRegression(max_iter=1000))])

text_clf = text_clf.fit(trainBunch.data, trainBunch.target)
# The messages now name the model actually trained here (logistic regression);
# they used to say "naive Bayes".
predicted = text_clf.predict(testBunch.data)
print(f"逻辑回归模型，测试集准确度: {np.mean(predicted == testBunch.target)}")

predicted = text_clf.predict(trainBunch.data)
print(f"逻辑回归模型，训练集准确度: {np.mean(predicted == trainBunch.target)}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


朴素贝叶斯模型，测试集准确度: 0.7662315930388219
朴素贝叶斯模型，训练集准确度: 0.8600066934404283


In [None]:
# Stemming Code

import nltk
# Fetch only the stop-word corpus, without prompting. A bare nltk.download()
# opens the interactive downloader and stalls "Run All" waiting for input.
nltk.download("stopwords", quiet=True)

from nltk.stem.snowball import SnowballStemmer
# ignore_stopwords=True leaves English stop words unstemmed; this is what
# needs the stop-word corpus downloaded above.
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token with the module-level ``stemmer``."""

    def build_analyzer(self):
        """Return a callable that runs the parent analyzer and stems each token."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer
    
# Vectorizer that drops English stop words and stems the remaining tokens
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

# Stemmed counts -> TF-IDF -> multinomial naive Bayes (fit_prior=False)
text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
                             ('mnb', MultinomialNB(fit_prior=False))])

text_mnb_stemmed = text_mnb_stemmed.fit(trainBunch.data, trainBunch.target)

predicted_mnb_stemmed = text_mnb_stemmed.predict(testBunch.data)

# Displayed as the cell result: accuracy on the test set
np.mean(predicted_mnb_stemmed == testBunch.target)

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> x

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> u

Nothing to update.

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> l
Packages:
  [ ] abc................. Australian Broadcasting Commission 2006
  [ ] alpino.............. Alpino Dutch Treebank
  [ ]

0.8278571428571428

In [None]:
# Grid Search
# Here, we are creating a list of parameters for which we would like to do performance tuning.
# All the parameters name start with the pipeline step name (remember the arbitrary name we gave).
# E.g. vect__ngram_range; here we try unigrams only versus 1- to 4-grams and keep the better one.

# Use grid search to look for the best parameters for the naive Bayes pipeline
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 4)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3)}

# Search over an explicit naive Bayes pipeline. `text_clf` was last rebound to
# a LogisticRegression pipeline, which has no `alpha` parameter, so reusing it
# makes 'clf__alpha' invalid when the notebook is run from top to bottom.
nb_clf = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', MultinomialNB())])

# Next, we create an instance of the grid search by passing the classifier, parameters
# and n_jobs=-1 which tells to use multiple cores from user machine.
gs_clf = GridSearchCV(nb_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(trainBunch.data, trainBunch.target)

# Print the best mean score and the params. A bare expression is only shown
# when it is the last line of a cell, so the score used to be dropped silently.
print(gs_clf.best_score_)
print(gs_clf.best_params_)

{'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 4)}

# IPv4 Engine

# IPv6 Engine

# Email Address Engine

# Filehash Engine