In [33]:
!pip install fancyimpute
!pip install torch



In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests, zipfile, io
import missingno as msno
import multiprocessing
import scipy
import sklearn
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import fancyimpute
from fancyimpute import IterativeImputer
from sklearn.preprocessing import LabelEncoder

In [35]:
# number of subprocesses to use for data loading
num_workers = multiprocessing.cpu_count()

# how many samples per batch to load
batch_size = 64

# percentage of data set to use as validation
valid_size = 0.15

In [42]:
# To download and extract a zip file via script
nidd_url = "https://download.fairdata.fi:443/download?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3MTA2MjA3NTQsImRhdGFzZXQiOiI5ZDEzZWYyOC0yY2E3LTQ0YjAtOTk1MC0yMjUzNTlhZmFjNjUiLCJmaWxlIjoiL0NvbWJpbmVkLnppcCIsInByb2plY3QiOiIyMDA2OTM4IiwicmFuZG9tX3NhbHQiOiI1M2U2OGI2YSJ9.zkaF86hEoCnaIEVsYeiWyFdPgPYTFhrqbl26AC89pC0"
r = requests.get(nidd_url)
print(r)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()

<Response [503]>


BadZipFile: File is not a zip file

In [None]:
df = pd.read_csv('Combined.csv', low_memory=False)
print(df.shape)
df.head()

In [None]:
num_classes = df['Attack Type'].nunique()
labels = df['Attack Type'].unique()
print(labels)
print(num_classes)

In [None]:
# Define a function to impute NaN values with the most frequent value in the group
def impute_most_frequent(group):
    mode_value = group.mode().iloc[0] if not group.mode().empty else np.nan
    return group.fillna(mode_value)

def handle_missing_values(df):
    df.drop_duplicates(inplace=True)
    df.drop(['sVid', 'dVid'], axis=1, inplace=True)
    df.dropna(subset=['sTos', 'sHops', 'sTtl', 'sDSb'], how='any', inplace=True)
    df[['dTtl', 'dDSb', 'dTos', 'dHops', 'SrcGap', 'DstGap']] = df.groupby('Attack Type')[['dTtl', 'dDSb', 'dTos', 'dHops', 'SrcGap', 'DstGap']].transform(impute_most_frequent)
    df['dDSb'] = df['dDSb'].fillna('cs0')

    df_subset_copy = df[['dTtl', 'dTos', 'dHops', 'SrcGap', 'DstGap', 'SrcTCPBase', 'DstTCPBase', 'SrcWin', 'DstWin']].copy(deep=True)
    mice_imputer = IterativeImputer()
    df_subset_copy.iloc[:, :] = mice_imputer.fit_transform(df_subset_copy)
    df[['dTtl', 'dTos', 'dHops', 'SrcGap', 'DstGap', 'SrcTCPBase', 'DstTCPBase', 'SrcWin', 'DstWin']] = df_subset_copy

In [None]:
def one_hot_encode(df):
    proto_dummies = pd.get_dummies(df.Proto, dtype=int, prefix='Proto')
    proto_dummies.drop(columns=['Proto_ipv6-icmp'], inplace=True)
    sDSb_dummies = pd.get_dummies(df.sDSb, dtype=int, prefix='sDSb')
    sDSb_dummies.drop(columns=['sDSb_39','sDSb_54', 'sDSb_4', 'sDSb_cs4'], inplace=True)
    dDSb_dummies = pd.get_dummies(df.dDSb, dtype=int, prefix='dDSb')
    dDSb_dummies.drop(columns=['dDSb_cs4'], inplace=True)
    Cause_dummies = pd.get_dummies(df.Cause, dtype=int, prefix='Cause')
    Cause_dummies.drop(columns=['Cause_Shutdown'], inplace=True)
    State_dummies = pd.get_dummies(df.State, dtype=int, prefix='State')
    State_dummies.drop(columns=['State_RSP','State_TST','State_NRS'], inplace=True)
    X, y = df.loc[:, df.columns != 'Attack Type'], df['Attack Type']
    X = pd.concat([X, proto_dummies, sDSb_dummies, dDSb_dummies, Cause_dummies, State_dummies], axis='columns')
    df = pd.concat([X, y], axis='columns')
    df.drop(columns=['Proto', 'sDSb', 'dDSb', 'Cause', 'State', 'Label', 'Unnamed: 0', 'Seq', 'RunTime', 'Mean', 'Sum', 'Min', 'Max', 'Attack Tool'], inplace=True)
    return df

In [None]:
def normalize_dataframe(df):
    X, y = df.loc[:, df.columns != 'Attack Type'], df['Attack Type']
    std_scaler = preprocessing.StandardScaler()
    min_max_scaler = preprocessing
    std_normalized = std_scaler.fit_transform(X)
    X = pd.DataFrame(std_normalized, columns = X.columns, index = X.index)
    df = pd.concat([X, y], axis='columns')
    return df

In [None]:
def label_encode(df):
    # define ordinal encoding
    le = LabelEncoder()
    df['Attack Type'] = le.fit_transform(df['Attack Type'].iloc[:].values.ravel())
    df['Attack Type'] = pd.to_numeric(df['Attack Type'], downcast='integer')
    return df

In [None]:
def pre_process(df):
    handle_missing_values(df)
    df = one_hot_encode(df)
    df = normalize_dataframe(df)
    df = label_encode(df)
    return df

In [None]:
df = pre_process(df)
df.head()

In [None]:
# Train Test split
X_train, X_test, y_train, y_test = train_test_split(df.loc[:, df.columns != 'Attack Type'], df['Attack Type'],
                                                    stratify=df['Attack Type'],
                                                    test_size=0.15)

X_train = pd.DataFrame(X_train, columns=df.columns.to_list()[:-1])
X_test = pd.DataFrame(X_test, columns=df.columns.to_list()[:-1])
y_train = pd.DataFrame(y_train, columns=['Attack Type'])
y_test = pd.DataFrame(y_test, columns=['Attack Type'])

print("Training dataset size:", X_train.shape)
print("Testing dataset size:", X_test.shape)
print("Training target size:", y_train.shape)
print("Testing target size:", y_test.shape)

In [None]:
def mutual_info(X, Y):
  mutual_info_arr = mutual_info_classif(X, Y)
  series_info = pd.Series(mutual_info_arr)
  series_info.index = X.columns
  series_top = series_info.sort_values(ascending=False)[:20]
  return series_top

In [None]:
from sklearn.feature_selection import mutual_info_classif


In [None]:
result = mutual_info(X_train, y_train)

In [None]:
new_df = df[result.keys()]

In [None]:
def get_pca_df(scaled_data, no_of_components):
  from sklearn.decomposition import PCA
  pca = PCA(n_components=no_of_components)
  Principal_components=pca.fit_transform(scaled_data)
  column_names = ["PC "+str(i) for i in range(1, no_of_components+1)]
  pca_df = pd.DataFrame(data = Principal_components, columns = column_names)
  return pca_df

In [None]:
pca_data = get_pca_df(new_df, 15)

In [None]:
pca_data

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 5, criterion = "gini", random_state =21)

In [None]:
rfc.fit(X_train, y_train)