<a href="https://colab.research.google.com/github/swampxx/ML-algorithms-on-ids-datasets/blob/master/ids2018_wednesday_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
# Load the raw flow records from a CSV expected in the working directory.
# NOTE(review): the Colab badge above points at a "wednesday" notebook while
# this reads a "thursday" file -- confirm which one is intended.
df = pd.read_csv("ids_2018_thursday.csv")

In [None]:
# Full per-column summary (non-null counts and dtypes) of the raw frame.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Timestamp          1048575 non-null  object 
 3   Flow Duration      1048575 non-null  int64  
 4   Tot Fwd Pkts       1048575 non-null  int64  
 5   Tot Bwd Pkts       1048575 non-null  int64  
 6   TotLen Fwd Pkts    1048575 non-null  int64  
 7   TotLen Bwd Pkts    1048575 non-null  int64  
 8   Fwd Pkt Len Max    1048575 non-null  int64  
 9   Fwd Pkt Len Min    1048575 non-null  int64  
 10  Fwd Pkt Len Mean   1048575 non-null  float64
 11  Fwd Pkt Len Std    1048575 non-null  float64
 12  Bwd Pkt Len Max    1048575 non-null  int64  
 13  Bwd Pkt Len Min    1048575 non-null  int64  
 14  Bwd Pkt Len Mean   1048575 non-null  float64
 15  Bwd Pkt Len Std    1048575 non-n

In [None]:
# Number of rows for each distinct Protocol value.
df['Protocol'].value_counts()

6     684486
17    345524
0      18565
Name: Protocol, dtype: int64

In [None]:
# Coerce both columns to numbers; anything unparsable becomes NaN and is
# removed by dropna() below. Protocol is handled with to_numeric rather than
# by comparing against the strings "6"/"17"/"0": when the column is already
# integer-typed those comparisons match nothing and only emit a warning.
df["Flow Pkts/s"] = pd.to_numeric(df["Flow Pkts/s"], errors='coerce')
df["Protocol"] = pd.to_numeric(df["Protocol"], errors='coerce')

# Built as a new frame (no inplace=True) and with errors='ignore' on the
# Timestamp drop so that re-running this cell does not raise.
df = (
    df.dropna()
      .drop(columns='Timestamp', errors='ignore')
      # coercion may have upcast Protocol to float; restore the integer dtype
      .astype({'Protocol': 'int64'})
)
df.info(verbose=True)

  res_values = method(rvalues)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1043654 entries, 0 to 1048574
Data columns (total 79 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1043654 non-null  int64  
 1   Protocol           1043654 non-null  int64  
 2   Flow Duration      1043654 non-null  int64  
 3   Tot Fwd Pkts       1043654 non-null  int64  
 4   Tot Bwd Pkts       1043654 non-null  int64  
 5   TotLen Fwd Pkts    1043654 non-null  int64  
 6   TotLen Bwd Pkts    1043654 non-null  int64  
 7   Fwd Pkt Len Max    1043654 non-null  int64  
 8   Fwd Pkt Len Min    1043654 non-null  int64  
 9   Fwd Pkt Len Mean   1043654 non-null  float64
 10  Fwd Pkt Len Std    1043654 non-null  float64
 11  Bwd Pkt Len Max    1043654 non-null  int64  
 12  Bwd Pkt Len Min    1043654 non-null  int64  
 13  Bwd Pkt Len Mean   1043654 non-null  float64
 14  Bwd Pkt Len Std    1043654 non-null  float64
 15  Flow Byts/s        1043654 non-n

In [None]:
# Class balance of the cleaned frame: number of rows per Label value.
df.Label.value_counts()

Benign                   991156
DoS attacks-GoldenEye     41508
DoS attacks-Slowloris     10990
Name: Label, dtype: int64

In [None]:
# Stratified 80/20 split: every Label keeps (approximately) the same share in
# both parts. A fixed random_state makes the split -- and everything derived
# from it below -- reproducible across kernel restarts.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['Label'],
    random_state=42,
)

In [None]:
# Rows per Label value in the training split.
train_df.Label.value_counts()

Benign                   792925
DoS attacks-GoldenEye     33206
DoS attacks-Slowloris      8792
Name: Label, dtype: int64

In [None]:
# Non-null count per column of the training split.
train_df.count()

Dst Port         834923
Protocol         834923
Flow Duration    834923
Tot Fwd Pkts     834923
Tot Bwd Pkts     834923
                  ...  
Idle Mean        834923
Idle Std         834923
Idle Max         834923
Idle Min         834923
Label            834923
Length: 79, dtype: int64

In [None]:
# Non-null count per column of the test split.
test_df.count()

Dst Port         208731
Protocol         208731
Flow Duration    208731
Tot Fwd Pkts     208731
Tot Bwd Pkts     208731
                  ...  
Idle Mean        208731
Idle Std         208731
Idle Max         208731
Idle Min         208731
Label            208731
Length: 79, dtype: int64

In [None]:
def process_df(dataFrame):
    """Binarise ``Label``, one-hot encode ``Protocol`` and drop non-finite rows.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame containing a ``Protocol`` column and, optionally, a ``Label``
        column. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which

        * ``Label`` (when present) is 0 for ``'Benign'`` (or an already
          encoded 0) and 1 for every other value, so any attack name is
          treated as the positive class;
        * ``Protocol`` is replaced by ``Protocol_<value>`` indicator columns;
        * rows with a missing label, or with NaN / +inf / -inf in any column,
          are removed.
    """
    df1 = dataFrame.copy()

    # Rows without a label cannot be encoded; remember them so they are
    # dropped together with the non-finite rows at the end.
    missing_label = pd.Series(False, index=df1.index)
    if 'Label' in df1.columns:
        labels = df1['Label']
        missing_label = labels.isna()
        benign = (labels == 'Benign') | (labels == 0)
        df1['Label'] = (~benign).astype(int)

    one_hot = pd.get_dummies(df1['Protocol'].astype('category'), prefix='Protocol')
    df1 = df1.drop('Protocol', axis=1).join(one_hot)

    # axis must be passed by keyword: pandas 2.x rejects a positional axis here
    bad_rows = df1.isin([np.nan, np.inf, -np.inf]).any(axis=1) | missing_label
    res = df1[~bad_rows]

    return res

In [None]:
# Encode each split, then separate the target column from the features.
# New frames are built with drop(columns=...) instead of mutating the
# filtered frame returned by process_df in place.
train_set = process_df(train_df)
train_labels = train_set['Label'].copy()
train_set = train_set.drop(columns='Label')

test_set = process_df(test_df)
test_labels = test_set['Label'].copy()
# One-hot encoding runs independently per split, so a Protocol value missing
# from one split would yield a different column set. Align the test features
# to the training columns (same names, same order; absent indicators are 0)
# so the .values matrices built later are column-compatible.
test_set = (
    test_set.drop(columns='Label')
            .reindex(columns=train_set.columns, fill_value=0)
)

In [None]:
# Cast every feature column of both splits to float64.
train_set, test_set = (
    frame.astype(np.float64) for frame in (train_set, test_set)
)

In [None]:
# Plain NumPy arrays for scikit-learn: features and targets of each split.
X = train_set.values
y = train_labels.values
X_test = test_set.values
y_test = test_labels.values

In [None]:
# Baseline random forest on all features. The seed is fixed so the fitted
# model, and the metric computed from it, are the same on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

In [None]:
# Recall of the baseline forest on the hold-out split (recall_score defaults
# to binary averaging with pos_label=1). The metric functions are imported in
# the notebook's import cell.
pred = rf.predict(X_test)
recall_score(y_test, pred)

## Feature Selection


### With SelectKBest and f_classif

In [None]:
# Keep the 30 features with the highest ANOVA F-score against the target.
featureSelector = SelectKBest(score_func=f_classif, k=30)
X_new = featureSelector.fit_transform(X, y)

# Map the selected column positions back to their names.
selected_idx = featureSelector.get_support(indices=True)
features = train_set.columns[selected_idx]

In [None]:
# Show the names of the columns kept by SelectKBest.
print(features)

In [None]:
# Restrict the test features to the k-best columns; targets are unchanged.
X_test_kbest = test_set[features].values
y_test = test_labels.values

In [None]:
# Write the k-best train/test matrices and their targets as text files.
for path, array in [
    ("train_x_processed_kbest.txt", X_new),
    ("train_y_processed_kbest.txt", y),
    ("test_x_processed_kbest.txt", X_test_kbest),
    ("test_y_processed_kbest.txt", y_test),
]:
    np.savetxt(path, array)

### Feature Selection with Importance

In [None]:
# (rows, columns) of the full training matrix, before importance-based selection.
X.shape

In [None]:
# Importance-based selection: SelectFromModel keeps the features that the
# fitted forest rates as important. The forest is seeded so the selected
# feature set -- and the files exported from it -- are reproducible.
fs = SelectFromModel(RandomForestClassifier(random_state=42))
# learn relationship from training data
fs.fit(X, y)
# transform train input data
X_train_fs = fs.transform(X)
# transform test input data
X_test_fs = fs.transform(X_test)

In [None]:
# (rows, columns) of the training matrix after importance-based selection.
X_train_fs.shape

In [None]:
# Names of the columns retained by the importance-based selector.
selected_mask = fs.get_support()
features = train_set.columns[selected_mask]
print(features)

In [None]:
# Write the importance-selected train/test matrices and their targets as text files.
for path, array in [
    ("train_x_processed_fi.txt", X_train_fs),
    ("train_y_processed_fi.txt", y),
    ("test_x_processed_fi.txt", X_test_fs),
    ("test_y_processed_fi.txt", y_test),
]:
    np.savetxt(path, array)