In [51]:
import pandas as pd
from io import StringIO

df = pd.read_csv("CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv", sep="|")

df.shape

(1008748, 23)

In [52]:
df.head()

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed-label
0,1525880000.0,CUmrqr4svHuSXJy5z7,192.168.100.103,51524.0,65.127.233.163,23.0,tcp,-,2.999051,0,...,-,0.0,S,3.0,180.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan
1,1525880000.0,CH98aB3s1kJeq6SFOc,192.168.100.103,56305.0,63.150.16.171,23.0,tcp,-,-,-,...,-,0.0,S,1.0,60.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan
2,1525880000.0,C3GBTkINvXNjVGtN5,192.168.100.103,41101.0,111.40.23.49,23.0,tcp,-,-,-,...,-,0.0,S,1.0,60.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan
3,1525880000.0,CDe43c1PtgynajGI6,192.168.100.103,60905.0,131.174.215.147,23.0,tcp,-,2.998796,0,...,-,0.0,S,3.0,180.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan
4,1525880000.0,CJaDcG3MZzvf1YVYI4,192.168.100.103,44301.0,91.42.47.63,23.0,tcp,-,-,-,...,-,0.0,S,1.0,60.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan


In [53]:
# remove duplicate rows
df = df.drop_duplicates()
df.shape

(1008748, 23)

In [54]:
# keep only ts and label
df = df[['proto', 'history', 'resp_ip_bytes', 'orig_pkts', 'resp_pkts', 'orig_ip_bytes', 'id.resp_p', 'label']]
# 'id.orig_h', 'id.resp_p', 'id.orig_p'
df.shape

(1008748, 8)

In [55]:
df.head()

Unnamed: 0,proto,history,resp_ip_bytes,orig_pkts,resp_pkts,orig_ip_bytes,id.resp_p,label
0,tcp,S,0.0,3.0,0.0,180.0,23.0,Malicious
1,tcp,S,0.0,1.0,0.0,60.0,23.0,Malicious
2,tcp,S,0.0,1.0,0.0,60.0,23.0,Malicious
3,tcp,S,0.0,3.0,0.0,180.0,23.0,Malicious
4,tcp,S,0.0,1.0,0.0,60.0,23.0,Malicious


In [56]:
# remove duplicate rows
df = df.drop_duplicates()
df.shape

(95390, 8)

In [57]:
df.head()

Unnamed: 0,proto,history,resp_ip_bytes,orig_pkts,resp_pkts,orig_ip_bytes,id.resp_p,label
0,tcp,S,0.0,3.0,0.0,180.0,23.0,Malicious
1,tcp,S,0.0,1.0,0.0,60.0,23.0,Malicious
6,tcp,S,0.0,3.0,0.0,180.0,49560.0,Benign
7,tcp,S,0.0,1.0,0.0,60.0,21288.0,Benign
9,tcp,S,0.0,1.0,0.0,60.0,8080.0,Malicious


In [58]:
df.dtypes

proto             object
history           object
resp_ip_bytes    float64
orig_pkts        float64
resp_pkts        float64
orig_ip_bytes    float64
id.resp_p        float64
label             object
dtype: object

In [59]:
df.nunique()

proto                3
history            126
resp_ip_bytes     1141
orig_pkts           54
resp_pkts           69
orig_ip_bytes     1249
id.resp_p        65426
label                2
dtype: int64

In [60]:
# perform label encoding
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['proto'] = le.fit_transform(df['proto'])
df['history'] = le.fit_transform(df['history'])
df['label'] = le.fit_transform(df['label'])
df.head()

Unnamed: 0,proto,history,resp_ip_bytes,orig_pkts,resp_pkts,orig_ip_bytes,id.resp_p,label
0,1,22,0.0,3.0,0.0,180.0,23.0,1
1,1,22,0.0,1.0,0.0,60.0,23.0,1
6,1,22,0.0,3.0,0.0,180.0,49560.0,0
7,1,22,0.0,1.0,0.0,60.0,21288.0,0
9,1,22,0.0,1.0,0.0,60.0,8080.0,1


In [61]:
df.dtypes

proto              int32
history            int32
resp_ip_bytes    float64
orig_pkts        float64
resp_pkts        float64
orig_ip_bytes    float64
id.resp_p        float64
label              int32
dtype: object

In [90]:
# train model user 'label' as target with use of SVM
from sklearn.model_selection import train_test_split

X = df.drop('label', axis=1)
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.svm import SVC

model = SVC()
model.fit(X_train, y_train)

# evaluate model
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[18454     0]
 [   93   531]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     18454
           1       1.00      0.85      0.92       624

    accuracy                           1.00     19078
   macro avg       1.00      0.93      0.96     19078
weighted avg       1.00      1.00      0.99     19078


In [89]:
# train model user 'label' as target with use of SVM
from sklearn.model_selection import train_test_split

X = df.drop('label', axis=1)
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [91]:
# now train a model using knn
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=4)
model.fit(X_train, y_train)

# evaluate model
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[18452     2]
 [    8   616]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18454
           1       1.00      0.99      0.99       624

    accuracy                           1.00     19078
   macro avg       1.00      0.99      1.00     19078
weighted avg       1.00      1.00      1.00     19078
