# Загрузка файлов

Загрузка файлов

In [None]:
from google.colab import files

# Request two uploads in sequence and keep each result.
# NOTE(review): presumably these are the two CSV files read later in the
# notebook - confirm.
file1, file2 = files.upload(), files.upload()

Чтение файлов (и импорт библиотек)

In [None]:
import pandas
import numpy

# Load the two CSV parts from the Colab content directory.
training_set = pandas.read_csv('/content/UNSW_NB15_training-set.csv')
testing_set = pandas.read_csv('/content/UNSW_NB15_testing-set.csv')

# Stack both parts into a single frame with a fresh 0..n-1 index.
data = pandas.concat([training_set, testing_set], ignore_index=True)

data.head()

Проверка сбалансированности данных

In [None]:
# Count how many rows carry each value of the 'label' column and print the
# counts so the class balance can be inspected.
values_counts = data['label'].value_counts()
print(values_counts)

# Обработка данных

Удаление колонок с повреждёнными данными

In [None]:
# Blank out the 'proto' values 'any' and 'unas' so they count as missing in
# the check below.
data.loc[data['proto'].isin(['any', 'unas']), 'proto'] = None

column_names = data.columns.values.tolist()

# Drop every column in which fewer than 80% of the rows hold a value.
# notna().mean() is the share of present values and, unlike a division by
# len(data), does not raise on an empty frame.
for column_name in column_names:
    if data[column_name].notna().mean() < 0.8:
        if column_name != "label":
            del data[column_name]
        else:
            # The target column is never removed; only report that it is sparse.
            print("Label has too many missing values but was kept")

# Column count before and after the clean-up.
print(len(column_names))
print(len(data.columns.values.tolist()))

Удаление колонок с ненужными данными

In [None]:
# Remember the column list as it was before the removal, for the report below.
column_names = data.columns.values.tolist()

# Remove the 'attack_cat' and 'id' columns from the frame in place.
data.drop(columns=['attack_cat', 'id'], inplace=True)

# Column count before and after the removal.
print(len(column_names))
print(len(data.columns))

Удаление противоречивых и дублирующихся данных

In [None]:
print("Data length before deletion " + str(len(data)))

# Compare rows on every column except the target, so rows with identical
# features but different labels are also collapsed (the first one is kept).
# The list is built explicitly: list.remove() returns None, and passing
# subset=None would make drop_duplicates compare all columns, label included.
feature_columns = [name for name in data.columns if name != 'label']
data = data.drop_duplicates(subset=feature_columns, ignore_index=True)

print("Data length after deletion " + str(len(data)))

Кодирование категориальных признаков

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Remember the column list as it was before encoding, for the report below.
column_names = data.columns.values.tolist()

# Newer scikit-learn releases call the dense-output switch `sparse_output`
# and no longer accept `sparse`; older releases only know `sparse`.
# Try the new name first and fall back to the old one.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

categorical_columns = ['proto', 'service', 'state']

# Encode each categorical column on its own and append the indicator columns
# to the frame, in the order proto, service, state.
for categorical_column in categorical_columns:
    encoded = pandas.DataFrame(
        encoder.fit_transform(data[[categorical_column]]),
        columns=encoder.get_feature_names_out(),
        # Reuse the frame's index so the join lines up row by row.
        index=data.index,
    )
    data = data.join(encoded)

# The original categorical columns are replaced by their indicator columns.
data = data.drop(columns=categorical_columns)

# Column count before and after encoding.
print(len(column_names))
print(len(data.columns.values.tolist()))

Удаление аномалий

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Feature matrix without the target; both the detector and the 2-D projection
# below work on it, so the label does not influence the plot.
features = data.drop(columns=['label'])

# One flag per row; the code below treats -1 as an anomaly and 1 as normal.
X = IsolationForest().fit_predict(features)

# Project the features onto two principal components for plotting and put the
# anomaly flag next to the coordinates.
pca = PCA(n_components=2)
pcs = pca.fit_transform(features)
pcDf = pandas.DataFrame(data=pcs, columns=['pc 1', 'pc 2'])
output = pandas.concat([pcDf, pandas.DataFrame(data=X, columns=['is_anomaly'])], axis=1)

mask = X == -1

print("Data length before deletion " + str(len(data)))

# Keep only the rows that were not flagged and renumber them from zero.
data = data[~mask].reset_index(drop=True)

print("Data length after deletion " + str(len(data)))

# A single figure: a bare plt.figure() before this one would only leave an
# empty extra figure behind.
plt.figure(figsize=(10,10))
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
plt.xlabel('Principal Component - 1',fontsize=20)
plt.ylabel('Principal Component - 2',fontsize=20)
plt.title("2 Principal Component Analysis of  Dataset",fontsize=20)
targets = [1,-1]
colors = ['g', 'r']
# Normal rows (1) are drawn in green, anomalies (-1) in red.
for target, color in zip(targets,colors):
    indicesToKeep = output['is_anomaly'] == target
    plt.scatter(output.loc[indicesToKeep, 'pc 1']
               , output.loc[indicesToKeep, 'pc 2'], c = color, s = 50)

plt.legend(targets,prop={'size': 15})

Заполнение пропусков

In [None]:
from numpy import mean

# Mean of the present (non-missing) values of every column, keyed by column name.
column_means = pandas.Series(
    {name: mean(data[name].dropna()) for name in data.columns}
)

# Each missing cell takes the mean of its own column.
data = data.fillna(column_means)

data.head()

Разделение на X и Y

In [None]:
# Target vector: the 'label' column.
Y = data['label']
# Feature matrix: every column except 'label'.
X = data.drop(columns='label')

Удаление неинформативных признаков

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on all features and read its per-feature importances.
model = DecisionTreeClassifier()
model.fit(X, Y)

# The tree was fitted on X, so its importances are indexed with X.columns.
# Indexing them with data.columns[1:] pairs the scores with the wrong names,
# because `data` still contains the 'label' column.
importances = pandas.Series(model.feature_importances_, index=X.columns)

# Names of the features whose importance is below the threshold.
importances = importances[importances < 0.00005].index.tolist()

column_names = data.columns.values.tolist()

X = X.drop(columns=importances)

# Column count of `data` (label included) versus the remaining features.
print(len(column_names))
print(len(X.columns.values.tolist()))

Удаление констант

In [None]:
column_names = X.columns.values.tolist()

# A column counts as (near-)constant when its most frequent value fills more
# than 80% of the rows; value_counts() lists the most frequent value first.
near_constant = [
    column_name
    for column_name in column_names
    if X[column_name].value_counts().iloc[0] / len(X) > 0.8
]
X.drop(columns=near_constant, inplace=True)

# Column count before and after the removal.
print(len(column_names))
print(len(X.columns))

Стандартизация

In [None]:
from sklearn import preprocessing

# Fit the scaler on the whole feature matrix.
scaler = preprocessing.StandardScaler().fit(X)

# NOTE: `data` is rebound here to the scaled array returned by transform();
# it no longer refers to the original DataFrame after this line.
data = scaler.transform(X)

# Wrap the scaled values in a frame again, keeping the column names.
X = pandas.DataFrame(data, columns=X.columns)

X.head()

Центрирование

In [None]:
def _center(column):
    """Return the column with its own mean subtracted from every value."""
    return column - column.mean()


# Shift every column so that its mean becomes zero.
X = X.apply(_center)

X.head()

Нормализация

In [None]:
# Scale every row (sample) to unit L2 norm; norm='l2' and axis=1 are the
# defaults of preprocessing.normalize, written out here.
unit_rows = preprocessing.normalize(X, norm='l2', axis=1)
X = pandas.DataFrame(unit_rows, columns=X.columns)

X.head()

Проверка сбалансированности данных №2

In [None]:
# Count how many rows carry each value of the target Y and print the counts.
values_counts = Y.value_counts()
print(values_counts)

Разделение данных на обучающую и тестовую выборки

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the call returns the four parts in the
# order train-X, test-X, train-Y, test-Y.
_split = train_test_split(X, Y, test_size=0.33)
training_X, testing_X, training_Y, testing_Y = _split

Проверка сбалансированности данных №3

In [None]:
# Count how many training rows carry each target value and print the counts.
values_counts = training_Y.value_counts()
print(values_counts)

Визуализация данных

In [None]:
# Reduce the prepared features to two principal components for plotting.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

unique_labels = numpy.unique(Y)
# The label value is used as an index into this list: 0 -> green, 1 -> red.
colors = ['g', 'r']

plt.figure(figsize=(8, 6))

# One scatter series per class, selected with a boolean row mask.
for label in unique_labels:
    members = (Y == label).to_numpy()
    plt.scatter(
        X_pca[members, 0],
        X_pca[members, 1],
        color=colors[label],
        label=label,
    )

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()

# Создание моделей

Общие импорты

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import seaborn as sns

Bagging Classifier

In [None]:
from sklearn.ensemble import BaggingClassifier

# Train a bagging ensemble with default settings and predict the held-out rows.
model = BaggingClassifier()

model.fit(training_X,training_Y)
predicted_Y = model.predict(testing_X)

# confusion matrix
# scikit-learn expects (y_true, y_pred): rows are then the true classes and
# columns the predicted ones, which matches the axis labels set below.
cm = confusion_matrix(testing_Y, predicted_Y)
# fmt='d' prints the counts as plain integers.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# classification report
# Same argument order; with the arguments swapped, precision and recall
# would trade places.
report = classification_report(testing_Y, predicted_Y)
print(report)

Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier


# Train a gradient boosting model with default settings and predict the
# held-out rows.
model = GradientBoostingClassifier()

model.fit(training_X,training_Y)
predicted_Y = model.predict(testing_X)

# confusion matrix
# scikit-learn expects (y_true, y_pred): rows are then the true classes and
# columns the predicted ones, which matches the axis labels set below.
cm = confusion_matrix(testing_Y, predicted_Y)
# fmt='d' prints the counts as plain integers.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# classification report
# Same argument order; with the arguments swapped, precision and recall
# would trade places.
report = classification_report(testing_Y, predicted_Y)
print(report)

Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest with default settings and predict the held-out rows.
model = RandomForestClassifier()

model.fit(training_X,training_Y)
predicted_Y = model.predict(testing_X)

# confusion matrix
# scikit-learn expects (y_true, y_pred): rows are then the true classes and
# columns the predicted ones, which matches the axis labels set below.
cm = confusion_matrix(testing_Y, predicted_Y)
# fmt='d' prints the counts as plain integers.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# classification report
# Same argument order; with the arguments swapped, precision and recall
# would trade places.
report = classification_report(testing_Y, predicted_Y)
print(report)