In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


In [2]:
# Read the two input captures from the working directory: one CSV of normal
# traffic and one of attack traffic, each into its own DataFrame.
normal_dataframe, attack_dataframe = (
    pd.read_csv(csv_path)
    for csv_path in ('normal_traffic.csv', 'attack_traffic.csv')
)

In [3]:
# Preview the first five rows of the normal-traffic frame to see its columns.
normal_dataframe.head()

Unnamed: 0,Source_IP,Destination_IP,Source_Port,Destination_Port,Protocol,Packet_Size
0,238.227.123.224,175.180.107.118,50960,39289,UDP,644
1,84.62.199.55,246.53.113.120,18037,64138,TCP,876
2,152.186.100.185,232.127.27.212,55692,11000,UDP,862
3,230.69.218.3,208.142.12.59,2191,52264,UDP,915
4,93.25.49.42,23.160.236.132,46088,28072,UDP,717


In [4]:
# Preview the first five rows of the attack-traffic frame; it has the same
# columns as the normal-traffic frame.
attack_dataframe.head()

Unnamed: 0,Source_IP,Destination_IP,Source_Port,Destination_Port,Protocol,Packet_Size
0,3.110.89.217,32.153.178.49,51539,16283,TCP,990
1,86.12.245.3,183.63.109.160,11862,61276,TCP,497
2,224.151.54.9,220.126.94.186,15578,54781,TCP,1033
3,89.214.35.174,167.51.167.203,47324,15275,UDP,882
4,134.157.185.76,40.200.193.163,15415,19749,TCP,592


In [5]:
# Label every row by the file it came from: 0 = normal traffic, 1 = attack.
# A scalar broadcasts to however many rows each frame has, so the row count is
# not hard-coded and the cell works for CSVs of any length.
normal_dataframe['flag'] = 0
attack_dataframe['flag'] = 1

In [6]:
# Stack the labelled frames row-wise into one dataset. Row labels from each
# source frame are kept as-is (ignore_index=False), so index labels repeat
# between the normal and attack halves.
full_dataframe = pd.concat(
    objs=[normal_dataframe, attack_dataframe],
    axis=0,
    ignore_index=False,
)

In [7]:
# Sanity check: row count should be the sum of both frames, and the column
# count should include the added 'flag' label.
full_dataframe.shape

(1000, 7)

In [8]:
from sklearn.utils import shuffle

In [9]:
# Interleave normal and attack rows (concat left them as two contiguous
# halves). The fixed random_state keeps the row order identical on every run.
full_dataframe = shuffle(full_dataframe, random_state=42)

In [10]:
# Preview the shuffled rows; both flag values should now appear near the top.
full_dataframe.head()

Unnamed: 0,Source_IP,Destination_IP,Source_Port,Destination_Port,Protocol,Packet_Size,flag
21,130.27.205.209,84.58.52.17,7809,25167,TCP,791,1
237,177.91.14.135,129.225.114.198,16194,26621,UDP,711,1
240,244.82.28.193,114.56.109.168,58430,1793,UDP,781,1
160,102.93.218.117,42.20.29.17,38191,29724,TCP,683,1
411,61.151.182.37,103.122.129.22,15931,21399,UDP,731,0


In [11]:
def ip_to_int(x):
    """Convert a dotted-quad IPv4 string to its 32-bit unsigned integer value.

    Args:
        x: IPv4 address as text, e.g. ``'130.27.205.209'``.

    Returns:
        int: The address with the first octet as the most significant byte,
        in the range 0..4294967295.

    Raises:
        ValueError: If ``x`` does not consist of exactly four numeric octets,
            or any octet falls outside 0..255.
    """
    octets = [int(part) for part in x.split('.')]

    # Reject malformed addresses explicitly: a missing octet would otherwise
    # surface as an IndexError, and an extra or oversized octet would silently
    # produce a wrong number.
    if len(octets) != 4 or not all(0 <= octet <= 255 for octet in octets):
        raise ValueError(f'not a valid dotted-quad IPv4 address: {x!r}')

    value = 0
    for octet in octets:
        # Shift the accumulated bytes left and append the next octet.
        value = (value << 8) | octet
    return value

In [12]:
# Replace both textual IP columns with their integer form so the classifiers
# receive numeric features. The columns are overwritten in place, so this cell
# expects them to still hold strings when it runs.
for ip_column in ('Source_IP', 'Destination_IP'):
    full_dataframe[ip_column] = full_dataframe[ip_column].apply(ip_to_int)

In [14]:
# List the distinct protocol names present, to know which values need a code.
full_dataframe['Protocol'].unique().tolist()

['TCP', 'UDP']

In [15]:
# Integer code assigned to each protocol name found in the data.
protocol_encoder = dict(TCP=0, UDP=1)

In [16]:
# Swap each protocol name for its integer code. A strict dict lookup is used,
# so a protocol missing from protocol_encoder raises KeyError instead of
# slipping through as a missing value.
full_dataframe['Protocol'] = full_dataframe['Protocol'].apply(protocol_encoder.__getitem__)

In [17]:
# Confirm the IP and protocol columns now hold numbers rather than text.
full_dataframe.head()

Unnamed: 0,Source_IP,Destination_IP,Source_Port,Destination_Port,Protocol,Packet_Size,flag
21,2182860241,1413100561,7809,25167,0,791,1
237,2975534727,2179035846,16194,26621,1,711,1
240,4099022017,1916300712,58430,1793,1,781,1
160,1717426805,705961233,38191,29724,0,683,1
411,1033352741,1736081686,15931,21399,1,731,0


In [18]:
# Separate the target from the features: Y is the 'flag' label column,
# X is every remaining column.
Y = full_dataframe['flag']
X = full_dataframe.drop(columns=['flag'])

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# identical on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=23,
)

In [21]:
# Check the training split: 80% of the rows, with the label column removed.
X_train.shape

(800, 6)

In [22]:
import joblib
def train_and_test_model(model : object, train_data: tuple, test_data: tuple, save: bool = False, model_name: str = 'default_model'):
    """Fit an estimator, score it on held-out data, and optionally persist it.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place; no copy is made, so the caller's object is the
            one that ends up trained (and saved).
        train_data: ``(features, labels)`` pair used for fitting.
        test_data: ``(features, labels)`` pair used for scoring.
        save: When true, dump the fitted estimator with joblib.
        model_name: File stem for the dump; the file is written as
            ``'<model_name>.joblib'`` relative to the working directory.

    Returns:
        The accuracy of the model's predictions on ``test_data``, as computed
        by ``accuracy_score``.
    """
    fit_features, fit_labels = train_data
    eval_features, eval_labels = test_data

    model.fit(fit_features, fit_labels)
    accuracy = accuracy_score(eval_labels, model.predict(eval_features))

    # Persist only after scoring, so a failure in fit/predict leaves no file.
    if save:
        joblib.dump(model, f'{model_name}.joblib')
    return accuracy

    

In [23]:
# Bundle each split as a (features, labels) pair, which is the form
# train_and_test_model unpacks.
train_data, test_data = (X_train, Y_train), (X_test, Y_test)

In [24]:
# Baseline classifiers with library defaults. A random forest draws bootstrap
# samples and random feature subsets while fitting, so without a seed its
# accuracy changes from run to run; fixing random_state makes the reported
# score repeatable on Restart & Run All.
rf = RandomForestClassifier(random_state=42)
lr = LogisticRegression()
svc = SVC()
knn = KNeighborsClassifier()

In [25]:
# Fit and score the random forest; save=True also writes
# 'Random_Forest.joblib' to the working directory.
train_and_test_model(
    model=rf,
    train_data=train_data,
    test_data=test_data,
    save=True,
    model_name='Random_Forest',
)

0.455

In [26]:
# Fit and score logistic regression; save=True also writes
# 'LogisticRegression.joblib' to the working directory.
train_and_test_model(
    model=lr,
    train_data=train_data,
    test_data=test_data,
    save=True,
    model_name='LogisticRegression',
)

0.515

In [27]:
# Fit and score the support-vector classifier; save=True also writes
# 'SVC.joblib' to the working directory.
train_and_test_model(
    model=svc,
    train_data=train_data,
    test_data=test_data,
    save=True,
    model_name='SVC',
)

0.52

In [28]:
# Fit and score k-nearest neighbours; save=True also writes 'KNN.joblib' to
# the working directory.
train_and_test_model(
    model=knn,
    train_data=train_data,
    test_data=test_data,
    save=True,
    model_name='KNN',
)

0.575