In [1]:
def load_data(file):
    with open(file, 'r', encoding='utf-8') as f:
        data = f.readlines()
    result = []
    for d in data:
        d = d.strip()
        if len(d) > 0:
            result.append(d)
    return result

In [2]:
normal_requests = load_data('normal.txt')
anomalous_requests = load_data('anomalous.txt')

all_requests = normal_requests + anomalous_requests
y_normal = [0] * len(normal_requests)
y_anomalous = [1] * len(anomalous_requests)
y = y_normal + y_anomalous

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [4]:
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="word", sublinear_tf=True)
X = vectorizer.fit_transform(all_requests)

In [5]:
#vectorizer.vocabulary_
X.shape

(61065, 33550)

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

# 1 k近邻

In [20]:
%%time
#复杂性太高，无法得出结果
# from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.preprocessing import StandardScaler

# 数据归一化
standardScalar = StandardScaler(with_mean=False)
standardScalar.fit(X_train)
X_train = standardScalar.transform(X_train)
X_test = standardScalar.transform(X_test)

# # 网格搜索的参数
# param_grid = [
#     {
#         'weights': ['uniform'],
#         'n_neighbors': [i for i in range(2, 11)] #从1开始容易过拟合
#     },
#     {
#         'weights': ['distance'],
#         'n_neighbors': [i for i in range(2, 11)],
#         'p': [i for i in range(1, 6)]
#     }
# ]

# cv其实也是一个超参数，一般越大越好，但是越大训练时间越长
#grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, n_jobs=-1, cv=5)
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

Wall time: 49.6 ms
Parser   : 280 ms


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [21]:
knn_clf.score(X_test, y_test)

0.9233603537214443

In [22]:
y_predict = knn_clf.predict(X_test)

from sklearn.metrics import precision_score, recall_score, f1_score

print(precision_score(y_test, y_predict))
print(recall_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

0.9194262813752373
0.8872379401587625
0.9030453697949038


# 2 逻辑回归

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        'C': [0.1, 1, 3, 5, 7],
        'penalty': ['l1', 'l2']
    }
]

grid_search = GridSearchCV(LogisticRegression(), param_grid, n_jobs=-1, cv=5)


In [15]:
%%time
grid_search.fit(X_train, y_train)



Wall time: 50.9 s


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'C': [0.1, 1, 3, 5, 7], 'penalty': ['l1', 'l2']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [12]:
grid_search.best_score_

0.9680463440596087

In [13]:
grid_search.best_params_

{'C': 7, 'penalty': 'l2'}

In [14]:
best_knn_clf = grid_search.best_estimator_
best_knn_clf.score(X_test, y_test)

0.9737165315647262

In [16]:
y_predict = best_knn_clf.predict(X_test)

from sklearn.metrics import precision_score, recall_score, f1_score

print(precision_score(y_test, y_predict))
print(recall_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

0.9922813036020584
0.941990637085284
0.9664821969301451


# 3 决策树

In [17]:
from sklearn.tree import DecisionTreeClassifier

param_grid = [
    {
        'max_depth':[i for i in range(1, 10)],
        'min_samples_leaf':[i for i in range(1, 20)],
        'min_samples_split':[i for i in range(10, 30)],
    }
]

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, n_jobs=-1, cv=5)

In [18]:
%%time
grid_search.fit(X_train, y_train)

Wall time: 1h 7min 6s


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                          'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8,

In [19]:
grid_search.best_score_

0.8979775648898715

In [20]:
grid_search.best_params_

{'max_depth': 9, 'min_samples_leaf': 19, 'min_samples_split': 10}

In [21]:
best_tree_clf = grid_search.best_estimator_
best_tree_clf.score(X_test, y_test)

0.90084336362892

In [23]:
y_predict = best_tree_clf.predict(X_test)

from sklearn.metrics import precision_score, recall_score, f1_score

print(precision_score(y_test, y_predict))
print(recall_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

0.951904296875
0.7936087929981681
0.8655788655788657


# 4 SVM

In [25]:
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
# 数据归一化
standardScalar = StandardScaler(with_mean=False)
standardScalar.fit(X_train)
X_train = standardScalar.transform(X_train)
X_test = standardScalar.transform(X_test)

In [27]:
%%time
from sklearn.svm import SVC

svm_clf = SVC()
svm_clf.fit(X_train, y_train)



Wall time: 10min 22s


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [28]:
svm_clf.score(X_train, y_train)

0.9745558011954475

In [29]:
svm_clf.score(X_test, y_test)

0.9619258167526407

In [30]:
y_predict = svm_clf.predict(X_test)

from sklearn.metrics import precision_score, recall_score, f1_score

print(precision_score(y_test, y_predict))
print(recall_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

0.9623700623700624
0.9421941787095461
0.9521752545510646


# 5 随机森林

In [7]:
from sklearn.ensemble  import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=500,
                               random_state=666,
                               oob_score=True,
                               n_jobs=-1)

In [8]:
%%time
rf_clf.fit(X_train, y_train)

Wall time: 2min 50s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=True, random_state=666, verbose=0,
                       warm_start=False)

In [9]:
rf_clf.score(X_test, y_test)

0.9647916154916892

In [10]:
y_predict = rf_clf.predict(X_test)

from sklearn.metrics import precision_score, recall_score, f1_score

print(precision_score(y_test, y_predict))
print(recall_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

0.9618792499484855
0.9501323020557704
0.9559696907638747
