# Этап 02 Выявление аномалий в данных, и обучение модели выявлять аномалий

In [1]:
#!pip install catboost

In [2]:
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from sklearn import cluster
import catboost as cb
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from src.preprocess import TrafficPreprocess

In [4]:
#Загрузим данные
df = pd.read_csv('data/part_10.csv.gz',compression='gzip')

In [5]:
#df = df[:2000]

In [6]:
#Инициализируем класс Препроцессинга и загрузим модели
traffic_preprocess = TrafficPreprocess()
traffic_preprocess.agent_model_load()
traffic_preprocess.load_models_etap_02()

In [7]:
#Выполним первый этап препроцессинга данных (выделения признаков)
list_result_dict = []
for idx,rows in tqdm(df.iterrows()):
    data = dict(rows)
    #print(data)
    result_dict = traffic_preprocess.preprocess_stage_one(data)
    list_result_dict.append(result_dict)
    #break

57845it [00:03, 15893.11it/s]


In [8]:
#Загрузим модель кластеризации USER AGENT
traffic_preprocess.agent_model_load()

In [9]:
#Выполним второй этап препроцессинга данных (выделения признаков), и удалим записи с неверным IP
add_result_dict = []
for result_dict in tqdm(list_result_dict):
    result_dict = traffic_preprocess.preprocess_stage_two(result_dict)
    if result_dict['RIGHT_CLIENT_IP'] == result_dict['CLIENT_IP']:
        add_result_dict.append(result_dict)
    #break
list_result_dict.clear()
del(list_result_dict)

100%|██████████| 57845/57845 [13:50<00:00, 69.66it/s] 


In [10]:
df = pd.DataFrame(add_result_dict)

In [11]:
#Сгенерируем итоговый массив признаков для обучения модели
list_futures = []
for result_dict in tqdm(add_result_dict):
    futures = traffic_preprocess.clusters_create_futures_array(result_dict)
    list_futures.append(futures)
    #break
add_result_dict.clear()    
del(add_result_dict)

100%|██████████| 57694/57694 [00:00<00:00, 100720.31it/s]


In [12]:
dbscan = cluster.DBSCAN(eps=9.0, min_samples=100)

In [13]:
#Выполним кластеризацию
dbscan.fit(list_futures)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
labels = dbscan.labels_

In [15]:
#Задача кластеризации отделить аномальный трафик от неаномального, поэтому разделим данные на два класса
#1 - трафик аномальный, 0 - обычный
cb_labels = labels==-1
cb_labels = cb_labels.astype(int)

In [16]:
#Соотношение классов
sum(cb_labels==0), sum(cb_labels==1)

(41607, 16087)

In [17]:
df['cluster'] = cb_labels

In [18]:
#Подозрительный трафик
df[df['cluster'] == 1].head(3)

Unnamed: 0,CLIENT_IP,CLIENT_USERAGENT,REQUEST_SIZE,RESPONSE_CODE,MATCHED_VARIABLE_SRC,MATCHED_VARIABLE_NAME,MATCHED_VARIABLE_VALUE,EVENT_ID,RIGHT_CLIENT_IP,preproc_CLIENT_IP_country,preproc_CLIENT_USERAGENT,preproc_REQUEST_SIZE,preproc_RESPONSE_CODE_valid,preproc_RESPONSE_CODE,preproc_MATCHED_VARIABLE_SRC_valid,agent_vector,agent_cluster,matched_var_name,matched_var_value,cluster
0,188.138.92.55,,166,404,REQUEST_URI,,//tmp/20160925122692indo.php.vob,AVdhXFgVq1Ppo9zF5Fxu,188.138.92.55,DE,,166,True,404,True,"(0, 91)\t1.0",13,"[0.0227108, -0.02569362, 0.0186324, 0.02271437...","[0.02438902, -0.05339618, -0.007877337, -0.030...",1
2,176.123.240.64,Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20...,395,403,REQUEST_PATH,,"/opinion/page1_3.php\"" and \""\""x\""\""=\""\""x\""",iz7SN2YBrgKk_RFNZW_U,176.123.240.64,KG,mozilla/5.0 (windows nt 6.0; rv:34.0) gecko/20...,395,True,403,True,"(0, 149)\t0.2966918534009629\n (0, 121)\t0....",10,"[0.0227108, -0.02569362, 0.0186324, 0.02271437...","[0.0007681509, -0.001140485, -0.013785048, -0....",1
5,79.183.87.247,Mozilla/5.0 (Linux; Android 6.0; LG-H815 Build...,682,404,REQUEST_HEADERS,REQUEST_HEADERS.User-Agent,Mozilla/5.0 (Linux; Android 6.0; LG-H815 Build...,nFzwHGQB5cBXmMW1y_TD,79.183.87.247,IL,mozilla/5.0 (linux; android 6.0; lg-h815 build...,682,True,404,True,"(0, 122)\t0.16717495982469774\n (0, 87)\t0....",3,"[-0.010601818, -0.047821425, 0.016080828, -0.0...","[0.011902368, -0.019429851, -0.004136016, -0.0...",1


In [19]:
df_data = df[df['cluster'] == 1]
df_data.shape

(16087, 20)

In [20]:
df_data_0 = df[df['cluster'] == 0][:df_data.shape[0]]
df_data_0.shape

(16087, 20)

In [21]:
#Сбалансируем классы в выборке
df_data = pd.concat([df_data,df_data_0])

In [22]:
df_data.shape

(32174, 20)

In [23]:
df_data_labels = list(df_data['cluster'])
df_data.drop('cluster',inplace=True,axis=1)
len(df_data_labels)

32174

In [24]:
df_data.shape

(32174, 19)

In [25]:
list_futures = []
for idx, rows in tqdm(df_data.iterrows()):
    result_dict = dict(rows)
    futures = traffic_preprocess.clusters_create_futures_array(result_dict)
    list_futures.append(futures)

32174it [00:02, 12368.08it/s]


In [26]:
X_train, X_test, y_train, y_test = train_test_split(list_futures, df_data_labels, test_size=0.2, random_state=42)

In [27]:
#Обучим модель классифицировать трафик на обычный и аномальный
model = cb.CatBoostClassifier(iterations=100, depth=3, learning_rate=0.1, loss_function='Logloss',eval_metric='Accuracy')
val_set = (X_test, y_test)
model.fit(X_train, y_train, eval_set=val_set, verbose=True)

0:	learn: 0.7606356	test: 0.7566434	best: 0.7566434 (0)	total: 99.8ms	remaining: 9.88s
1:	learn: 0.8080734	test: 0.7968920	best: 0.7968920 (1)	total: 122ms	remaining: 5.96s
2:	learn: 0.8044213	test: 0.7956488	best: 0.7968920 (1)	total: 139ms	remaining: 4.48s
3:	learn: 0.7994095	test: 0.7912976	best: 0.7968920 (1)	total: 157ms	remaining: 3.77s
4:	learn: 0.8027118	test: 0.7973582	best: 0.7973582 (4)	total: 175ms	remaining: 3.32s
5:	learn: 0.8269552	test: 0.8203574	best: 0.8203574 (5)	total: 195ms	remaining: 3.06s
6:	learn: 0.8339873	test: 0.8287490	best: 0.8287490 (6)	total: 216ms	remaining: 2.87s
7:	learn: 0.8347255	test: 0.8298368	best: 0.8298368 (7)	total: 236ms	remaining: 2.71s
8:	learn: 0.8494503	test: 0.8447552	best: 0.8447552 (8)	total: 263ms	remaining: 2.65s
9:	learn: 0.8527915	test: 0.8486402	best: 0.8486402 (9)	total: 284ms	remaining: 2.56s
10:	learn: 0.8565989	test: 0.8515929	best: 0.8515929 (10)	total: 308ms	remaining: 2.49s
11:	learn: 0.8581141	test: 0.8526807	best: 0.852680

<catboost.core.CatBoostClassifier at 0x7f01bce2bfa0>

In [28]:
#Сохраним модель
model.save_model("models/cb_model_traffic.cb")