In [1]:
import numpy as np

normal = np.loadtxt("vector_normal")
anomalous = np.loadtxt("vector_anomalous")

all_requests = np.concatenate([normal, anomalous])
X = all_requests

y_normal = np.zeros(shape=(normal.shape[0]), dtype='int')
y_anomalous = np.ones(shape=(anomalous.shape[0]), dtype='int')
y = np.concatenate([y_normal, y_anomalous])

# 1 K近邻算法

划分测试集和训练集

In [2]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

### 网格搜索
网格搜索将使用交叉验证的方式来评估超参数的所有可能的组合

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# 数据归一化
standardScalar = StandardScaler()
standardScalar.fit(X_train)
X_train = standardScalar.transform(X_train)
X_test_std = standardScalar.transform(X_test)

# 网格搜索的参数
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(2, 11)] #从1开始容易过拟合
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(2, 11)],
        'p': [i for i in range(1, 6)]
    }
]

# cv其实也是一个超参数，一般越大越好，但是越大训练时间越长
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, n_jobs=-1, cv=5)

# pipe_grid_knn = Pipeline([
#     ("sta_scaler", StandardScaler()),
#     ("grid_sea", grid_search)
# ])

In [4]:
%%time
grid_search.fit(X_train, y_train)

Wall time: 52.1 s


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid=[{'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'weights': ['uniform']},
                         {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [5]:
grid_search.best_score_

0.8759619101163076

In [6]:
grid_search.best_params_

{'n_neighbors': 10, 'p': 3, 'weights': 'distance'}

In [7]:
best_knn_clf = grid_search.best_estimator_
best_knn_clf.score(X_test_std, y_test)

0.8733661278988053

### 分类结果评价

In [8]:
from sklearn.metrics import confusion_matrix

y_predict = best_knn_clf.predict(X_test_std)
confusion_matrix(y_test, y_predict)

array([[2735,  481],
       [ 420, 3479]], dtype=int64)

In [9]:
from sklearn.metrics import precision_score, recall_score, f1_score

print(precision_score(y_test, y_predict))
print(recall_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

0.8785353535353535
0.8922800718132855
0.8853543707850872


In [10]:
X_test = X_test_std

# 2 逻辑回归

In [10]:
from sklearn.linear_model import LogisticRegression

param_grid = [
    {
        'C': [0.1, 1, 3, 5, 7],
        'penalty': ['l1', 'l2']
    }
]

grid_search = GridSearchCV(LogisticRegression(), param_grid, n_jobs=-1, cv=5)


In [11]:
%%time
grid_search.fit(X_train, y_train)

Wall time: 2min 48s




GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'C': [0.1, 1, 3, 5, 7], 'penalty': ['l1', 'l2']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [13]:
grid_search.best_score_

0.6869882989563935

In [14]:
grid_search.best_params_

{'C': 7, 'penalty': 'l1'}

In [16]:
best_log_clf = grid_search.best_estimator_
best_log_clf.score(X_test, y_test)

0.6929023190442727

In [17]:
y_predict = best_log_clf.predict(X_test_std)

from sklearn.metrics import precision_score, recall_score, f1_score

print(precision_score(y_test, y_predict))
print(recall_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

0.7455587392550144
0.6673506027186458
0.7042901610502098


# 3 决策树

In [18]:
from sklearn.tree import DecisionTreeClassifier

param_grid = [
    {
        'max_depth':[i for i in range(1, 10)],
        'min_samples_leaf':[i for i in range(1, 20)],
        'min_samples_split':[i for i in range(10, 30)],
    }
]

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, n_jobs=-1, cv=5)

In [19]:
%%time
grid_search.fit(X_train, y_train)

Wall time: 3min 46s


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                          'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8,

In [20]:
grid_search.best_score_

0.7973224638954285

In [22]:
grid_search.best_params_

{'max_depth': 9, 'min_samples_leaf': 1, 'min_samples_split': 27}

In [23]:
best_tree_clf = grid_search.best_estimator_
best_tree_clf.score(X_test, y_test)

0.8042164441321152

In [24]:
y_predict = best_tree_clf.predict(X_test_std)

from sklearn.metrics import precision_score, recall_score, f1_score

print(precision_score(y_test, y_predict))
print(recall_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

0.7658039881204921
0.9258784303667608
0.8382677348194589


决策树可以不用对原始数据进行缩放，但是上面的步骤进行了归一化操作，下面采用原始数据进行一次训练

In [25]:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

In [27]:
tree_clf = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1, min_samples_split=27)

In [29]:
tree_clf.fit(X_train_raw, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=9,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=27,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [33]:
tree_clf.score(X_train_raw, y_train)

0.80965599634562

In [35]:
tree_clf.score(X_test_raw, y_test)

0.8042164441321152

In [36]:
y_predict = best_tree_clf.predict(X_test_raw)

from sklearn.metrics import precision_score, recall_score, f1_score

print(precision_score(y_test, y_predict))
print(recall_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

0.5479971890372453
1.0
0.708007989831124


# 4 SVM

In [37]:
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
# 数据归一化
standardScalar = StandardScaler()
standardScalar.fit(X_train)
X_train = standardScalar.transform(X_train)
X_test = standardScalar.transform(X_test)

In [48]:
from sklearn.svm import SVC

param_grid = [
    {
        'kernel': ["poly"],
        'degree': [1, 2, 3],
        'C': [0.1, 1, 3, 5]
    }
]

grid_search = GridSearchCV(SVC(), param_grid, n_jobs=-1, cv=5)

In [49]:
%%time
grid_search.fit(X_train, y_train)

Wall time: 8min 26s


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'C': [0.1, 1, 3, 5], 'degree': [1, 2, 3],
                          'kernel': ['poly']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [50]:
grid_search.best_params_

{'C': 5, 'degree': 3, 'kernel': 'poly'}

In [51]:
grid_search.best_score_

0.7022734460100496

In [53]:
best_svm_clf = grid_search.best_estimator_
best_svm_clf.score(X_test, y_test)

0.7114546732255798

In [54]:
y_predict = best_svm_clf.predict(X_test)

from sklearn.metrics import precision_score, recall_score, f1_score

print(precision_score(y_test, y_predict))
print(recall_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

0.7404898384575299
0.7289048473967684
0.7346516737753651


# 5 随机森林

In [3]:
from sklearn.ensemble  import RandomForestClassifier

In [4]:
rf_clf = RandomForestClassifier(n_estimators=500,
                               random_state=666,
                               oob_score=True,
                               n_jobs=-1)

In [5]:
%%time
rf_clf.fit(X_train, y_train)

Wall time: 3.77 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=True, random_state=666, verbose=0,
                       warm_start=False)

In [8]:
rf_clf.score(X_test, y_test)

0.947013352073085

In [9]:
y_predict = rf_clf.predict(X_test)

from sklearn.metrics import precision_score, recall_score, f1_score

print(precision_score(y_test, y_predict))
print(recall_score(y_test, y_predict))
print(f1_score(y_test, y_predict))

0.9471813103098019
0.956655552705822
0.9518948577261708
