# Задача
Общая задача проекта:

Разработать модель, которая будет классифицировать трафик на нормальный и злонамеренный. При этом модель должна работать максимально качественно, так как цена ошибки может быть очень высока. Оценить качество модели по различным метрикам классификации: precision, recall, f1_score, accuracy. Разработать REST API сервис, который будет принимать на вход данные трафика и возвращать класс этого трафика.

В рамках ноутбука проведем минимальный анализ данных и попробуем обучить модель. Если метрики окажутся высокими, перейдем к реализации модульного проекта.

In [23]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, f1_score, accuracy_score, precision_score, roc_auc_score
from ydata_profiling import ProfileReport

In [2]:
try:
    initial_data = pd.read_csv('network_traffic_data.csv')
except:
    raise Exception('Check the file')

## Знакомство с данными

In [3]:
initial_data.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,5480074,3,1,12,0,6,0,4.0,3.465,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,443,711977,9,10,703,3950,267,0,78.1,103.3,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,53,153398,2,2,80,224,40,40,40.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,53,57660,1,1,46,128,46,46,46.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,8446,767,3,1,43,6,31,6,14.336,14.44,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [4]:
initial_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539616 entries, 0 to 539615
Data columns (total 79 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0    Destination Port             539616 non-null  int64  
 1    Flow Duration                539616 non-null  int64  
 2    Total Fwd Packets            539616 non-null  int64  
 3    Total Backward Packets       539616 non-null  int64  
 4   Total Length of Fwd Packets   539616 non-null  int64  
 5    Total Length of Bwd Packets  539616 non-null  int64  
 6    Fwd Packet Length Max        539616 non-null  int64  
 7    Fwd Packet Length Min        539616 non-null  int64  
 8    Fwd Packet Length Mean       539616 non-null  float64
 9    Fwd Packet Length Std        539616 non-null  float64
 10  Bwd Packet Length Max         539616 non-null  int64  
 11   Bwd Packet Length Min        539616 non-null  int64  
 12   Bwd Packet Length Mean       539616 non-nul

In [5]:
initial_data.describe()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,539616.0,539616.0,539616.0,539616.0,539616.0,539616.0,539616.0,539616.0,539616.0,539616.0,...,539616.0,539616.0,539616.0,539616.0,539616.0,539616.0,539616.0,539616.0,539616.0,539616.0
mean,5563.061394,21066930.0,6.119972,5.985495,408.2371,7999.779,169.449966,11.855382,44.681011,59.525095,...,2.946516,-2429.314,115498.4,40375.7,167288.2,89830.17,16101550.0,958564.9,16872270.0,15391500.0
std,14921.262751,38121720.0,345.109317,463.490949,7478.351,1014442.0,560.533869,60.961661,150.87388,217.729412,...,267.510434,1052329.0,778753.0,434703.2,1059626.0,709971.0,33154630.0,6461275.0,34038510.0,32956950.0
min,0.0,-12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-536870700.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,80.0,73.0,1.0,1.0,2.0,0.0,2.0,0.0,2.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,80.0,49739.0,2.0,2.0,44.0,105.0,23.0,0.0,11.22,0.0,...,1.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,443.0,10640300.0,6.0,5.0,320.0,8216.25,272.0,6.0,48.56,91.7,...,2.0,32.0,6.0,0.0,6.0,5.0,7125041.0,0.0,7143036.0,6028677.0
max,65533.0,120000000.0,200755.0,270686.0,2866110.0,591000000.0,24820.0,2065.0,5940.0,7050.0,...,192491.0,138.0,102000000.0,63500000.0,102000000.0,102000000.0,120000000.0,76900000.0,120000000.0,120000000.0


Видим, что есть пропуски в столбце Flow Bytes/s, что вероятно означает скорость потока данных. Этот показатель можно будет безболезненно заполнить медианой. Также в заголовках датасета есть лишние пробелы.

In [6]:
# посмотрим, как распределяются классы целевого признака
initial_data['Label'].value_counts()

BENIGN                        240000
DoS Hulk                      115974
PortScan                       79660
DDoS                           64366
DoS GoldenEye                  10293
FTP-Patator                     7938
SSH-Patator                     5897
DoS slowloris                   5796
DoS Slowhttptest                5499
Bot                             1966
Web Attack � Brute Force        1507
Web Attack � XSS                 652
Infiltration                      36
Web Attack � Sql Injection        21
Heartbleed                        11
Name: Label, dtype: int64

Классы не очень-то равны, поэтому при оценивании больше внимания будем обращать на взвешенные оценки.

In [7]:
# посмотрим порты
initial_data[' Destination Port'].value_counts()

80       229165
53       101315
443       53430
21         8638
22         7245
          ...  
62846         1
61995         1
64275         1
41909         1
3051          1
Name:  Destination Port, Length: 24926, dtype: int64

Порты кажутся категориальным признаком, так как их не имеет смысла оценивать в среднем или проводить арифметические операции. Так как будем использовать Catboost, то при обучении выделим, что этот признак категориальный.

In [8]:
# построим минимальный профайлер для всего остального, так как датасет большой
profile = ProfileReport(initial_data, title='Pandas Profiling Report', minimal=True)
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



Есть скошенные, константные и нулевые значения, но пока что ничего однозначно ломающего.

## Предобработка данных

In [9]:
# удалим лишние пробелы из названий признаков
def strip_spaces(df):
    new_columns = {}
    for col in df.columns:
        new_columns[col] = col.strip()
    df = df.rename(columns=new_columns)
    return df

In [10]:
# сделаем копию датасета
data = initial_data.copy()

In [11]:
data = strip_spaces(data)

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539616 entries, 0 to 539615
Data columns (total 79 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Destination Port             539616 non-null  int64  
 1   Flow Duration                539616 non-null  int64  
 2   Total Fwd Packets            539616 non-null  int64  
 3   Total Backward Packets       539616 non-null  int64  
 4   Total Length of Fwd Packets  539616 non-null  int64  
 5   Total Length of Bwd Packets  539616 non-null  int64  
 6   Fwd Packet Length Max        539616 non-null  int64  
 7   Fwd Packet Length Min        539616 non-null  int64  
 8   Fwd Packet Length Mean       539616 non-null  float64
 9   Fwd Packet Length Std        539616 non-null  float64
 10  Bwd Packet Length Max        539616 non-null  int64  
 11  Bwd Packet Length Min        539616 non-null  int64  
 12  Bwd Packet Length Mean       539616 non-null  float64
 13 

In [13]:
# обозначим категориальные и численные значения, а также признаки и таргет
cat_features = ['Destination Port']
num_features = data.drop(['Destination Port', 'Label'], axis=1).columns
features = data.drop('Label', axis=1)
target = data['Label']

In [14]:
# проверим пропуски
data['Flow Bytes/s'].isna().sum()

488

In [15]:
data['Flow Bytes/s'] = data['Flow Bytes/s'].fillna(data['Flow Bytes/s'].median())

In [16]:
data['Flow Bytes/s'].isna().sum()

0

In [17]:
data.isna().sum().sum()

0

In [18]:
# разбиваем на две выборки
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345, shuffle=True)

In [19]:
(features_train.shape, features_valid.shape, target_train.shape, target_valid.shape)

((404712, 78), (134904, 78), (404712,), (134904,))

In [20]:
# обучим модель
model = CatBoostClassifier(loss_function='MultiClass', iterations=100, depth=5, learning_rate=0.5, thread_count=-1)
model.fit(features_train, target_train, cat_features=cat_features, verbose=10)

0:	learn: 0.4753461	total: 877ms	remaining: 1m 26s
10:	learn: 0.1381596	total: 7.22s	remaining: 58.4s
20:	learn: 0.0587434	total: 13.7s	remaining: 51.7s
30:	learn: 0.0284496	total: 19.9s	remaining: 44.3s
40:	learn: 0.0255453	total: 25.9s	remaining: 37.3s
50:	learn: 0.0232466	total: 31.9s	remaining: 30.7s
60:	learn: 0.0214477	total: 38s	remaining: 24.3s
70:	learn: 0.0196806	total: 43.9s	remaining: 17.9s
80:	learn: 0.0178024	total: 50.4s	remaining: 11.8s
90:	learn: 0.0164461	total: 56.7s	remaining: 5.61s
99:	learn: 0.0150811	total: 1m 2s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1c6bceb4910>

In [21]:
# посчитаем предсказания
preds = model.predict(features_valid, thread_count=-1)
predictions = model.predict_proba(features_valid)

In [22]:
# оценим метрики
print('Micro Precision: {:.2f}'.format(precision_score(target_valid, preds, average='micro')))
print('Micro Recall: {:.2f}'.format(recall_score(target_valid, preds, average='micro')))
print('Micro F1-score: {:.2f}\n'.format(f1_score(target_valid, preds, average='micro')))

print('Macro Precision: {:.2f}'.format(precision_score(target_valid, preds, average='macro')))
print('Macro Recall: {:.2f}'.format(recall_score(target_valid, preds, average='macro')))
print('Macro F1-score: {:.2f}'.format(f1_score(target_valid, preds, average='macro')))
print('Macro ROC AUC: {:.2f}\n'.format(roc_auc_score(target_valid, predictions, average='macro', multi_class='ovo')))

print('Weighted Precision: {:.2f}'.format(precision_score(target_valid, preds, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(target_valid, preds, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(target_valid, preds, average='weighted')))
print('Weighted ROC AUC: {:.2f}\n'.format(roc_auc_score(target_valid, predictions, average='weighted', multi_class='ovo')))

Micro Precision: 1.00
Micro Recall: 1.00
Micro F1-score: 1.00

Macro Precision: 0.90
Macro Recall: 0.83
Macro F1-score: 0.83
Macro ROC AUC: 0.99

Weighted Precision: 1.00
Weighted Recall: 1.00
Weighted F1-score: 1.00
Weighted ROC AUC: 1.00



Как видим, метрики сразу получились отличные, только макро отстает, что скорее всего связанно с дисбалансом классов, но нам будет более интересно взвешенное значение. Учитывая текущие показатели, можно перейти к созданию модульного REST-сервиса.