# Этап 03 Кластеризация аномального трафика на возможные классы

In [1]:
#!pip install catboost

In [2]:
import pickle
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from sklearn import cluster
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from src.preprocess import TrafficPreprocess

In [4]:
# Load the data (gzip-compressed CSV) into a DataFrame
df = pd.read_csv('data/part_10.csv.gz',compression='gzip')

In [5]:
#df = df[:1000]

In [6]:
# Initialise the preprocessing class and load the models it needs
traffic_preprocess = TrafficPreprocess()
traffic_preprocess.agent_model_load()
# presumably the stage-02 and stage-03 models, judging by the method names — TODO confirm
traffic_preprocess.load_models_etap_02()
traffic_preprocess.load_models_etap_03()

In [7]:
# Run the first preprocessing stage (feature extraction) on every row
list_result_dict = []
# iterrows() is a generator without a length, so the total is passed explicitly;
# otherwise tqdm can only show a bare counter with no percentage or ETA
for idx, rows in tqdm(df.iterrows(), total=len(df)):
    data = dict(rows)
    result_dict = traffic_preprocess.preprocess_stage_one(data)
    list_result_dict.append(result_dict)

57845it [00:05, 10301.96it/s]


In [8]:
# Load the USER AGENT clustering model
# NOTE(review): agent_model_load() was already called right after
# TrafficPreprocess() was created, so this second call looks redundant — confirm
traffic_preprocess.agent_model_load()

In [9]:
# Run the second preprocessing stage (feature extraction) and drop records with an invalid IP
add_result_dict = []
for result_dict in tqdm(list_result_dict):
    result_dict = traffic_preprocess.preprocess_stage_two(result_dict)
    # keep the record only when RIGHT_CLIENT_IP equals the original CLIENT_IP
    if result_dict['RIGHT_CLIENT_IP'] == result_dict['CLIENT_IP']:
        add_result_dict.append(result_dict)
    #break
# Release the stage-one results; once this has run, this cell cannot be
# re-run without first re-running the stage-one cell
list_result_dict.clear()
del(list_result_dict)

100%|██████████| 57845/57845 [13:10<00:00, 73.17it/s] 


In [10]:
# Build a DataFrame from the records that passed the IP check
# NOTE(review): the same DataFrame is built from add_result_dict again further
# down, before trafic_class is attached, so this assignment looks redundant
df = pd.DataFrame(add_result_dict)

In [11]:
# Build the final feature arrays used for clustering
# ("futures" in the names below presumably means "features")
list_futures = []
for result_dict in tqdm(add_result_dict):
    futures = traffic_preprocess.clusters_create_futures_array(result_dict)
    list_futures.append(futures)
    #break
# add_result_dict is read again below to build the DataFrame, so it is not released here:
# add_result_dict.clear()    
# del(add_result_dict)

100%|██████████| 57694/57694 [00:00<00:00, 106182.02it/s]


In [12]:
# Classify the feature arrays one at a time; the first element of each
# prediction is cast to a plain int and collected in input order
list_trafic_class = []
for futures in tqdm(list_futures):
    trafic_class = int(traffic_preprocess.trafic_model_predict(futures)[0])
    list_trafic_class.append(trafic_class)

100%|██████████| 57694/57694 [00:32<00:00, 1793.11it/s]


In [13]:
# Sanity check: type of the last value left in the classification loop variable
type(trafic_class)

int

In [14]:
# Build the working DataFrame from the records that passed the IP check
df = pd.DataFrame(add_result_dict)

In [15]:
# Attach the predicted class; list_trafic_class was produced in the same
# order as add_result_dict, so rows and predictions line up positionally
df['trafic_class'] = list_trafic_class

In [16]:
# Keep only the anomalous traffic (trafic_class == 1) for clustering,
# renumbering the remaining rows from 0
df = df.loc[df['trafic_class'].eq(1)].reset_index(drop=True)
df.shape

(17555, 20)

In [17]:
# Rebuild the feature arrays, this time for the anomalous rows only
list_futures = []
# iterrows() has no length; pass total so tqdm shows percentage and ETA
# instead of a bare iteration counter
for idx, rows in tqdm(df.iterrows(), total=len(df)):
    result_dict = dict(rows)
    futures = traffic_preprocess.clusters_create_futures_array(result_dict)
    list_futures.append(futures)

17555it [00:01, 12485.80it/s]


In [18]:
# Preview the first three rows of the anomalous-traffic frame
df.head(3)

Unnamed: 0,CLIENT_IP,CLIENT_USERAGENT,REQUEST_SIZE,RESPONSE_CODE,MATCHED_VARIABLE_SRC,MATCHED_VARIABLE_NAME,MATCHED_VARIABLE_VALUE,EVENT_ID,RIGHT_CLIENT_IP,preproc_CLIENT_IP_country,preproc_CLIENT_USERAGENT,preproc_REQUEST_SIZE,preproc_RESPONSE_CODE_valid,preproc_RESPONSE_CODE,preproc_MATCHED_VARIABLE_SRC_valid,agent_vector,agent_cluster,matched_var_name,matched_var_value,trafic_class
0,188.138.92.55,,166,404,REQUEST_URI,,//tmp/20160925122692indo.php.vob,AVdhXFgVq1Ppo9zF5Fxu,188.138.92.55,DE,,166,True,404,True,"(0, 91)\t1.0",13,"[0.0227108, -0.02569362, 0.0186324, 0.02271437...","[0.02438902, -0.05339618, -0.007877337, -0.030...",1
1,176.123.240.64,Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20...,395,403,REQUEST_PATH,,"/opinion/page1_3.php\"" and \""\""x\""\""=\""\""x\""",iz7SN2YBrgKk_RFNZW_U,176.123.240.64,KG,mozilla/5.0 (windows nt 6.0; rv:34.0) gecko/20...,395,True,403,True,"(0, 149)\t0.2966918534009629\n (0, 121)\t0....",10,"[0.0227108, -0.02569362, 0.0186324, 0.02271437...","[0.0007681509, -0.001140485, -0.013785048, -0....",1
2,79.183.87.247,Mozilla/5.0 (Linux; Android 6.0; LG-H815 Build...,682,404,REQUEST_HEADERS,REQUEST_HEADERS.User-Agent,Mozilla/5.0 (Linux; Android 6.0; LG-H815 Build...,nFzwHGQB5cBXmMW1y_TD,79.183.87.247,IL,mozilla/5.0 (linux; android 6.0; lg-h815 build...,682,True,404,True,"(0, 122)\t0.16717495982469774\n (0, 87)\t0....",3,"[-0.010601818, -0.047821425, 0.016080828, -0.0...","[0.011902368, -0.019429851, -0.004136016, -0.0...",1


In [19]:
def calculate_sse(data, k, n_init=10):
    '''
    Compute the SSE (sum of squared errors) of a KMeans fit with k clusters.

    Args:
        data: samples in any form accepted by KMeans.fit.
        k: number of clusters to fit.
        n_init: number of KMeans initialisations. Pinned to 10 (the value
            used implicitly before) because relying on the library default
            raises a FutureWarning and makes the SSE depend on the installed
            scikit-learn version.

    Returns:
        The inertia_ attribute of the fitted model.
    '''
    kmeans = cluster.KMeans(n_clusters=k, random_state=0, n_init=n_init)
    kmeans.fit(data)
    return kmeans.inertia_

In [20]:
# Elbow-method search for the number of clusters; commented out because it takes a long time to run
# %%time
# # Range of cluster counts to try
# k_values = range(10, 100,10)

# # Compute the SSE for each cluster count
# sse_values = [calculate_sse(list_futures, k) for k in k_values]

# # Plot the SSE against the number of clusters
# plt.plot(k_values, sse_values)
# plt.xlabel('Number of clusters')
# plt.ylabel('SSE')
# plt.show()

### Метод локтя показал примерно 30 кластеров
![Кластеризация](images/trafic_clusters_2023-09-11_15-03.png)

In [21]:
# Final clustering model with 30 clusters and a fixed seed.
# n_init is pinned to 10 (the value previously used implicitly): relying on the
# library default raises a FutureWarning at fit time and would change the
# resulting clusters once the scikit-learn default changes.
trafic_kmeans = cluster.KMeans(n_clusters=30, random_state=0, n_init=10)

In [22]:
# Fit KMeans on the feature arrays of the anomalous traffic
trafic_kmeans.fit(list_futures)

  super()._check_params_vs_input(X, default_n_init=10)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [23]:
# Persist the fitted model. The context manager guarantees the file is flushed
# and closed even if pickling fails; a bare open() left the handle dangling.
with open('models/trafic_kmeans_model.pkl', 'wb') as model_file:
    pickle.dump(trafic_kmeans, model_file)

In [24]:
# Show the cluster label assigned to each fitted sample
trafic_kmeans.labels_

array([24, 24, 24, ...,  0, 24, 18], dtype=int32)

In [25]:
# Store the cluster id shifted by 2, so stored ids start at 2 instead of 0
# NOTE(review): presumably ids 0 and 1 are reserved for the trafic_class values — confirm
df['trafic_cluster'] = trafic_kmeans.labels_ + 2

In [26]:
# Inspect the members of one cluster: stored id 22, i.e. KMeans label 20
# given the +2 shift applied when the trafic_cluster column was created
df[df['trafic_cluster'] == 22]

Unnamed: 0,CLIENT_IP,CLIENT_USERAGENT,REQUEST_SIZE,RESPONSE_CODE,MATCHED_VARIABLE_SRC,MATCHED_VARIABLE_NAME,MATCHED_VARIABLE_VALUE,EVENT_ID,RIGHT_CLIENT_IP,preproc_CLIENT_IP_country,...,preproc_REQUEST_SIZE,preproc_RESPONSE_CODE_valid,preproc_RESPONSE_CODE,preproc_MATCHED_VARIABLE_SRC_valid,agent_vector,agent_cluster,matched_var_name,matched_var_value,trafic_class,trafic_cluster
38,80.82.63.40,Mozilla/5.0 (Windows NT 6.1; Win64; x64) Apple...,57542,403,REQUEST_METHOD\';\'REQUEST_CONTE,,POST\';\'multipart/form-data; boundary=----Web...,P_h3BGQBjksgoq1e3MzX,80.82.63.40,RU,...,57542,True,403,False,"(0, 149)\t0.336235330920567\n (0, 122)\t0.3...",6,"[0.0227108, -0.02569362, 0.0186324, 0.02271437...","[0.02003487, -0.051908743, 0.008781476, -0.036...",1,22
170,87.117.38.96,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,52146,200,REQUEST_METHOD\';\'REQUEST_CONTE,,POST\';\'multipart/form-data; boundary=----Web...,OMXf_mMBjksgoq1eY85M,87.117.38.96,RU,...,52146,True,200,False,"(0, 149)\t0.336235330920567\n (0, 122)\t0.3...",6,"[0.0227108, -0.02569362, 0.0186324, 0.02271437...","[0.028715735, -0.05755848, 0.03030021, -0.0320...",1,22
574,213.219.81.215,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; ...,60114,200,REQUEST_XML,REQUEST_XML./font/br/br/br/br/br/br/br/br/br/b...,Kind regards,t8f3_mMBjksgoq1eA2u9,213.219.81.215,EE,...,60114,True,200,True,"(0, 153)\t0.3420667488772841\n (0, 121)\t0....",9,"[0.033321343, 0.014193869, -0.039755046, -0.03...","[0.038312573, -0.029420255, -0.026229883, 0.03...",1,22
1646,87.117.26.84,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,55354,200,REQUEST_METHOD\';\'REQUEST_CONTE,,POST\';\'multipart/form-data; boundary=----Web...,Lsfy_mMBjksgoq1ejA8t,87.117.26.84,RU,...,55354,True,200,False,"(0, 149)\t0.336235330920567\n (0, 122)\t0.3...",6,"[0.0227108, -0.02569362, 0.0186324, 0.02271437...","[0.029849088, -0.05569985, 0.032137148, -0.039...",1,22
1781,77.234.92.162,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,46580,400,REQUEST_PATH,REQUEST_PATH,/mail/bud/pkin.nsf/($Drafts)/5893E88D5B0A73264...,0MK7_mMBjksgoq1e-ePL,77.234.92.162,HU,...,46580,True,400,True,"(0, 149)\t0.336235330920567\n (0, 122)\t0.3...",6,"[0.025891421, -0.018017475, 0.0048930054, 0.01...","[-0.016689485, -0.07626411, -0.013808313, -0.0...",1,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15715,188.43.52.163,curl/7.22.0 (i686-pc-linux-gnu) libcurl/7.22.0...,62819,200,REQUEST_ARGS_KEYS,M{5¯ۯXµ#©نFî>R·8çpPϟðêóóÿ,M{5¯ۯXµ#©نFî>R·8çpPϟðêóóÿ,AVdj4JPiq1Ppo9zF6e_6,188.43.52.163,RU,...,62819,True,200,True,"(0, 161)\t0.3525892577111039\n (0, 106)\t0....",18,"[0.041661818, 0.017217504, -0.009976071, -0.01...","[0.041661818, 0.017217504, -0.009976071, -0.01...",1,22
16000,109.252.87.245,Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) ...,50988,200,REQUEST_POST_ARGS,tv35,"<p style=""text-align: center;""><strong><em><sp...",AVdYSfZWq1Ppo9zF0TlK,109.252.87.245,RU,...,50988,True,200,True,"(0, 149)\t0.2966918534009629\n (0, 121)\t0....",10,"[0.019111095, -0.039343804, -0.0069465227, 0.0...","[0.0043054847, -0.01418118, -0.012153769, -0.0...",1,22
16104,195.200.245.88,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,42197,405,REQUEST_METHOD\';\'REQUEST_CONTE,,POST\';\'multipart/form-data; boundary=-------...,hM0__2MBjksgoq1eTDE7,195.200.245.88,UA,...,42197,True,405,False,"(0, 149)\t0.336235330920567\n (0, 122)\t0.3...",6,"[0.0227108, -0.02569362, 0.0186324, 0.02271437...","[0.012512429, -0.047556367, 0.0032810648, -0.0...",1,22
17050,93.189.114.202,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,53269,400,REQUEST_POST_ARGS,REQUEST_POST_ARGS.h_PageText,RE: FW: Reconciliations assets Dear Alexandr...,bMK8_mMBjksgoq1ej-9R,93.189.114.202,HU,...,53269,True,400,True,"(0, 149)\t0.336235330920567\n (0, 122)\t0.3...",6,"[-0.00863258, -0.02879528, -0.0025003578, -0.0...","[-0.020162715, -0.07685178, -0.03981817, -0.01...",1,22
