### Imports

In [None]:
import pandas as pd
import numpy as np


### DATA

In [None]:
# Possible features to use for connection-oriented traces?

# ["duration","protocol_type", "service","flag","src_bytes","dst_bytes","wrong_fragment","count","srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate", "diff_srv_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate","class"]
# words_list = ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate']



In [None]:
# Column names assigned to the header-less data files, in file order.
# The last two entries are the label ('class') and 'difficulty_level'.
features = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
    'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'class', 'difficulty_level',
]
len(features)

In [None]:
# Read both header-less files, labelling the columns with `features`.
train_df = pd.read_csv("KDDTrain+20.txt", names=features)
test_df = pd.read_csv("KDDTest+20.txt", names=features)

# Report (rows, columns) for the training frame, then the test frame.
print(train_df.shape, test_df.shape, sep="\n")


In [None]:
# TODO: compute summary statistics for the loaded data

### DATA Preprocessing

In [None]:
# Drop the last column (difficulty_level); it is not used as a feature.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts, and errors="ignore" keeps the cell safe to re-run after the column
# has already been removed.
train_df = train_df.drop(columns="difficulty_level", errors="ignore")
test_df = test_df.drop(columns="difficulty_level", errors="ignore")
train_df.shape

#shows different problems
def spot(a, b):
    """Return the items that appear in only one of the two collections.

    Items of ``a`` that are missing from ``b`` come first, followed by items
    of ``b`` that are missing from ``a``. Each group keeps its original order,
    and repeated items are reported once per occurrence.
    """
    only_in_a = [item for item in a if item not in b]
    only_in_b = [item for item in b if item not in a]
    return only_in_a + only_in_b
# Categorical columns that can be checked this way: ['protocol_type','service','flag']
s = "service"
train_values = train_df[s].unique()
test_values = test_df[s].unique()
# Values of the chosen column that occur in only one of the two sets.
diff = spot(train_values, test_values)
print(diff)

##### One-hot encoding:

In [None]:
# These features hold categorical values and are one-hot encoded below.
columns = [
    'protocol_type',
    'service',
    'flag',
]


def one_hot(dataframe, columns):
    """Replace each listed categorical column with one-hot indicator columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every listed column has been removed and its
        indicator columns (named ``<column>_<value>``) appended at the end,
        in the order the columns were listed.
    """
    encoded = dataframe
    for col in columns:
        # dtype is pinned so the indicators are numeric 0/1 on every pandas
        # version (pandas 2.x defaults to bool, which breaks later arithmetic).
        dummies = pd.get_dummies(encoded[col], prefix=col, drop_first=False, dtype="uint8")
        # `columns=` instead of a positional axis, which pandas 2.0 rejects.
        encoded = pd.concat([encoded.drop(columns=col), dummies], axis=1)
    return encoded
    

In [None]:
train_df1 = one_hot(train_df, columns)
test_df1 = one_hot(test_df, columns)

# The two sets are encoded separately, so their indicator columns differ
# whenever a category occurs in only one of them. Give the test set exactly
# the training columns, in the same order: categories never seen in training
# are dropped, and categories missing from the test set become all-zero columns.
test_df1 = test_df1.reindex(columns=train_df1.columns, fill_value=0)

In [None]:
#should be (125973, 123)
# NOTE(review): the expected shape above presumably refers to the training
# frame, whose print is disabled on the next line — confirm.
#print("TRAIN DF",train_df1.shape)
# Show (rows, columns) of the one-hot encoded test frame.
print("TEST DF",test_df1.shape)

##### Separate class labels from features

In [None]:
# Detach the label column from both encoded frames. pop() returns the column
# and removes it from the frame in place, so train_df1/test_df1 no longer
# contain 'class' after this cell.
train_class = train_df1.pop('class')
test_class = test_df1.pop('class')

##### Normalization

In [None]:
# Min-max normalization is applied to all features.

def normalize(dataframe, columns, reference=None):
    """Min-max scale the given columns and return the result as a new frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to scale; it is not modified.
    columns : iterable of str
        Columns to scale.
    reference : pandas.DataFrame, optional
        Frame that supplies each column's min and max. When omitted, or for a
        column the reference does not contain, the statistics come from
        ``dataframe`` itself.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataframe`` in which every listed column whose max exceeds
        its min has been replaced by ``(value - min) / (max - min)``. Columns
        with max <= min (for example constant columns) are left unchanged.
    """
    temp = dataframe.copy()
    for col in columns:
        # Fall back to the frame's own statistics when the reference lacks the column.
        use_reference = reference is not None and col in reference.columns
        stats = reference if use_reference else dataframe
        # float() keeps the arithmetic valid for boolean indicator columns too.
        max_val = float(stats[col].max())
        min_val = float(stats[col].min())
        if max_val > min_val:
            temp[col] = (dataframe[col].astype(float) - min_val) / (max_val - min_val)

    return temp

train_df2 = normalize(train_df1, train_df1.columns)
# Scale the test set with the training min/max so a given raw value maps to
# the same normalized value in both sets. Test values outside the training
# range therefore fall outside [0, 1].
test_df2 = normalize(test_df1, test_df1.columns, reference=train_df1)

train_df2

##### Fixing class labels

In [None]:
# Containers for the coarse category assigned to every train / test record.
classlist_train = []
classlist_test = []

# Attack labels grouped by coarse category. Membership is tested with exact,
# case-sensitive string comparison, so every name is written in lowercase
# ("snmpgetattack" was previously capitalised and would never have matched a
# lowercase label).
Dos_class = ("apache2","back","land","neptune","mailbomb","pod","processtable","smurf","teardrop","udpstorm","worm")
Probe_class = ("ipsweep","mscan","nmap","portsweep","saint","satan")
U2R_class = ("buffer_overflow","loadmodule","perl","ps","rootkit","sqlattack","xterm")
R2L_class = ("ftp_write","guess_passwd","httptunnel","imap","multihop","named","phf","sendmail","snmpgetattack","spy","snmpguess","warezclient","warezmaster","xlock","xsnoop")

# Per-category record counters for the training set.
DoSCount_train=0
ProbeCount_train=0
U2RCount_train=0
R2LCount_train=0
NormalCount_train=0

# Per-category record counters for the test set.
DoSCount_test=0
ProbeCount_test=0
U2RCount_test=0
R2LCount_test=0
NormalCount_test=0



In [None]:
# Map every training label to its coarse category.
# The list and the counters are rebuilt from scratch on each run, so
# re-executing this cell cannot append duplicate labels or double the counts.
classlist_train = [
    "DoS" if item in Dos_class
    else "Probe" if item in Probe_class
    else "U2R" if item in U2R_class
    else "R2L" if item in R2L_class
    else "Normal"  # any label not listed as an attack is treated as normal
    for item in train_class
]

DoSCount_train = classlist_train.count("DoS")
ProbeCount_train = classlist_train.count("Probe")
U2RCount_train = classlist_train.count("U2R")
R2LCount_train = classlist_train.count("R2L")
NormalCount_train = classlist_train.count("Normal")

print("Train class count")
print(DoSCount_train)
print(NormalCount_train)
print(ProbeCount_train)
print(R2LCount_train)
print(U2RCount_train)

In [None]:
# Map every test label to its coarse category.
# The list and the counters are rebuilt from scratch on each run, so
# re-executing this cell cannot append duplicate labels or double the counts.
classlist_test = [
    "DoS" if item in Dos_class
    else "Probe" if item in Probe_class
    else "U2R" if item in U2R_class
    else "R2L" if item in R2L_class
    else "Normal"  # any label not listed as an attack is treated as normal
    for item in test_class
]

DoSCount_test = classlist_test.count("DoS")
ProbeCount_test = classlist_test.count("Probe")
U2RCount_test = classlist_test.count("U2R")
R2LCount_test = classlist_test.count("R2L")
NormalCount_test = classlist_test.count("Normal")

print("Test class count")
print(DoSCount_test)
print(NormalCount_test)
print(ProbeCount_test)
print(R2LCount_test)
print(U2RCount_test)

In [None]:
# Add the coarse category labels back onto the normalized train and test
# frames as a 'class' column. Each list must have exactly one entry per row
# of its frame.

train_df2["class"] = classlist_train
test_df2["class"] =  classlist_test


In [None]:
# (rows, columns) of the labelled train frame, then of the test frame.
print(train_df2.shape, test_df2.shape, sep="\n")

In [None]:
# Labels: the coarse category of every record.
Y_train = train_df2["class"]
Y_test = test_df2["class"]

# Features: every column except the label. `columns=` replaces the positional
# axis argument, which pandas 2.0 no longer accepts.
X_train = train_df2.drop(columns="class")
X_test = test_df2.drop(columns="class")
X_train

#### Model

##### CNN

In [None]:
# NOTE: this whole cell is one bare string literal, so none of the code inside
# it runs; the cell only evaluates to the string itself.
# NOTE(review): the text refers to y_train, X_val and y_val, which are not
# defined in the visible part of the notebook — presumably an unfinished
# draft; confirm before re-enabling.
"""import tensorflow as tf
from tensorflow import keras

# Define the CNN architecture
model = keras.Sequential([
    keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(25, 1)),
    keras.layers.MaxPooling1D(pool_size=2),
    keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu'),
    keras.layers.MaxPooling1D(pool_size=2),
    keras.layers.Conv1D(filters=256, kernel_size=3, activation='relu'),
    keras.layers.MaxPooling1D(pool_size=2),
    keras.layers.Flatten(),
    keras.layers.Dense(units=1024, activation='relu'),
    keras.layers.Dense(units=512, activation='relu'),
    keras.layers.Dense(units=5, activation='softmax') # 5 output classes for multiclass classification
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)"""


In [None]:
#imports
from keras.models import Sequential
from keras.layers import Convolution1D, MaxPooling1D, Flatten, Dense, Dropout

# The network sees each record as a sequence with one step per feature column
# and a single channel. The length is taken from the feature frame instead of
# being hard-coded (it was 117), so the input layer always matches the number
# of columns produced by the encoding step.
n_features = X_train.shape[1]

cnn = Sequential()
# Two convolution stages (64 then 128 filters), each followed by 2x pooling.
cnn.add(Convolution1D(64, 3, padding="same", activation="relu", input_shape=(n_features, 1)))
cnn.add(Convolution1D(64, 3, padding="same", activation="relu"))
cnn.add(MaxPooling1D(pool_size=2))
cnn.add(Convolution1D(128, 3, padding="same", activation="relu"))
cnn.add(Convolution1D(128, 3, padding="same", activation="relu"))
cnn.add(MaxPooling1D(pool_size=2))
cnn.add(Flatten())
# Fully connected head: 128-128-64-64 units with 10% dropout after each layer.
cnn.add(Dense(128, activation="relu"))
cnn.add(Dropout(0.1))
cnn.add(Dense(128, activation="relu"))
cnn.add(Dropout(0.1))
cnn.add(Dense(64, activation="relu"))
cnn.add(Dropout(0.1))
cnn.add(Dense(64, activation="relu"))
cnn.add(Dropout(0.1))
# One softmax output per category (DoS, Normal, Probe, R2L, U2R).
cnn.add(Dense(5, activation="softmax"))


# Compile the model; categorical cross-entropy expects one-hot encoded labels.
cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])



In [None]:
#splitting data 80/20
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

train_X, test_X, train_y, test_y = train_test_split(X_train, Y_train, test_size=0.2, random_state=101)

# Fit the scaler on the training split only and reuse it for the held-out
# split. Fitting a second scaler on the held-out data would scale the two
# splits with different min/max values.
scaler = MinMaxScaler()
train_X = scaler.fit_transform(train_X)
# NOTE(review): the scaled hold-out array is stored as `test_x` (lower-case x),
# as in the original, so `test_X` still holds the unscaled frame — confirm
# which name later cells are meant to use.
test_x = scaler.transform(test_X)




In [None]:
# --- Training tensors ---
print(train_df2.shape)
x_columns_train = train_df2.columns.drop('class')
print(x_columns_train.shape)
x_train_array = train_df2[x_columns_train].values
# Add a trailing channel axis: (samples, features) -> (samples, features, 1).
x_train_1=np.reshape(x_train_array, (x_train_array.shape[0], x_train_array.shape[1], 1))

# One-hot encode the labels; dtype is pinned so the result is numeric 0/1 on
# every pandas version.
dummies_train = pd.get_dummies(Y_train, dtype="uint8") # Classification
outcomes = dummies_train.columns
num_classes = len(outcomes)
y_train_1 = dummies_train.values

# --- Test tensors ---
x_columns_test = test_df2.columns.drop('class')

x_test_array = test_df2[x_columns_test].values
x_test_1=np.reshape(x_test_array, (x_test_array.shape[0], x_test_array.shape[1], 1))

# Encode the test labels against the training label columns so both label
# matrices have the same columns in the same order, even if a category is
# missing from the test set. A test category absent from training would be
# dropped, leaving an all-zero row.
dummies_test = (
    pd.get_dummies(Y_test, dtype="uint8")
    .reindex(columns=outcomes, fill_value=0)
    .astype("uint8")
) # Classification
outcomes_test = dummies_test.columns
num_classes = len(outcomes_test)
y_test_1 = dummies_test.values

In [None]:
# Shapes of the training inputs and of the one-hot training labels.
print(x_train_1.shape, y_train_1.shape, sep="\n")

In [None]:
# Train for 50 epochs and keep the per-epoch metrics returned by fit().
# NOTE(review): the test tensors are passed as validation data, so the
# validation metrics reported during training are computed on the test set —
# confirm this is intended.
history = cnn.fit(
    x_train_1,
    y_train_1,
    validation_data=(x_test_1, y_test_1),
    epochs=50,
)