In [26]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

try:
    # Standalone joblib is the supported import; scikit-learn deprecated its
    # vendored copy in 0.21 and removed it in 0.23.
    import joblib
except ImportError:
    # Old environments that only ship the copy bundled with scikit-learn.
    from sklearn.externals import joblib
# Show every column when a DataFrame is rendered (None removes the limit).
pd.options.display.max_columns = None

In [27]:
# Read the flow table, drop the 'Unnamed: 0' column (the name pandas assigns
# to a header-less CSV column) and keep only rows whose device label is
# something other than 'Not IOT'.
flows_csv = '../sample_csv_data/ohport_multi_device_downsampling_flow.csv'
df = pd.read_csv(flows_csv).drop(columns=['Unnamed: 0'])
df = df.loc[df['device'] != 'Not IOT']

In [28]:
# Number of rows per device label; the bare last expression is what the
# notebook displays.
flows_per_device = df.groupby(by='device').size()
flows_per_device

device
Amazon Echo                         1000
Belkin Wemo switch                  1000
Belkin wemo motion sensor           1000
HP Printer                          1000
Insteon Camera                      1000
Light Bulbs LiFX Smart Bulb         1000
Netatmo Welcome                     1000
Netatmo weather station             1000
PIX-STAR Photo-frame                1000
Samsung SmartCam                    1000
TP-Link Day Night Cloud camera      1000
Triby Speaker                       1000
Withings Aura smart sleep sensor    1000
Withings Smart Baby Monitor         1000
dtype: int64

In [29]:
# One-hot encode the network and transport protocol columns: each distinct
# value becomes its own indicator column, appended to df by index alignment.
for protocol_column in ('networkProtocol', 'transportProtocol'):
    one_hot = pd.get_dummies(df[protocol_column])
    df = df.join(one_hot)

# device name to number
# Codes are assigned in order of first appearance in df (Series.unique keeps
# that order), so device_list[code] gives back the name for a code.
device_list = df.device.unique()
for d in device_list:
    print(d)
device_dict = {name: code for code, name in enumerate(device_list)}
# Map only the 'device' column. A frame-wide replace() with a nested dict
# leans on implicit object->int downcasting, which newer pandas deprecates;
# Series.map yields the integer codes directly.
df = df.assign(device=df['device'].map(device_dict))

Belkin wemo motion sensor
Belkin Wemo switch
Samsung SmartCam
Amazon Echo
Insteon Camera
Light Bulbs LiFX Smart Bulb
Withings Smart Baby Monitor
Netatmo Welcome
Withings Aura smart sleep sensor
Netatmo weather station
TP-Link Day Night Cloud camera
PIX-STAR Photo-frame
HP Printer
Triby Speaker


In [30]:
# Column names pulled from df to form the model input. The spellings
# (e.g. 'registedDstPort', 'appplicationPayloadByte') are used verbatim as
# column keys, so they must not be "corrected" here. List order is the
# column order of the feature matrix.
features = [
    # presumably per-service indicator columns -- confirm against the CSV
    'dns', 'dhcp-server', 'http', 'ntp', 'https',
    'smtp-ssl', 'imap-ssl', 'ssdp', 'icslap', 'stm-pproc',
    'stun', 'ws-discovery', 'upnp-evnt', 'xmpp', 'android',
    # port-related columns
    'dstPort25050', 'dstPort49152', 'dstPort49153', 'dstPort49154',
    'srcPort49152', 'srcPort49153',
    'ephemeralDstPort', 'registedDstPort',
    # indicator columns produced by one-hot encoding the protocol fields
    'IPv4', 'IPv6', 'TCP', 'UDP',
    # remaining per-flow columns
    'duration', 'packetCount', 'totalPayloadByte', 'appplicationPayloadByte',
]

# Feature matrix: one column per entry of `features`, in list order.
X = df[features].values
# Labels as a 1-D integer vector. Selecting with df[['device']] would give an
# (n, 1) column vector, which scikit-learn estimators accept only with a
# DataConversionWarning.
Y = df['device'].values.astype('int')
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100)

In [31]:
# Seed the forest so that a Restart & Run All reproduces the same model and
# scores; without random_state every run grows different trees.
clf = RandomForestClassifier(n_estimators=100, random_state=100)
# ravel() hands fit() a 1-D label vector even if y_train arrives as an
# (n, 1) column, avoiding scikit-learn's DataConversionWarning.
clf.fit(X_train, np.ravel(y_train))
y_pred = clf.predict(X_test)

  


In [32]:
# Per-class F1, computed once and reused for both the printout and `scores`.
# Passing labels explicitly pins scores[k] to device code k (the position in
# device_list) even if some class is missing from the test split; by default
# f1_score only reports the labels it actually encounters.
class_labels = list(range(len(device_list)))
scores = f1_score(y_test, y_pred, labels=class_labels, average=None)
print("accurency is", accuracy_score(y_test,y_pred), "\nf1 score is", scores, "\n")

accurency is 0.9557142857142857 
f1 score is [0.99238965 0.98827471 0.97039474 0.97063903 0.94425087 0.94545455
 0.92117465 0.9699115  0.9322314  0.9704918  0.96247961 0.8940754
 0.97142857 0.94078947] 



In [33]:
# Print each per-class F1 score next to its device name. The two sequences
# are paired by position, so scores must be ordered by device code.
for score, device_name in zip(scores, device_list):
    print(score, device_name)

0.9923896499238964 Belkin wemo motion sensor
0.9882747068676717 Belkin Wemo switch
0.9703947368421053 Samsung SmartCam
0.9706390328151987 Amazon Echo
0.9442508710801394 Insteon Camera
0.9454545454545454 Light Bulbs LiFX Smart Bulb
0.9211746522411127 Withings Smart Baby Monitor
0.9699115044247787 Netatmo Welcome
0.9322314049586777 Withings Aura smart sleep sensor
0.9704918032786884 Netatmo weather station
0.9624796084828711 TP-Link Day Night Cloud camera
0.8940754039497307 PIX-STAR Photo-frame
0.9714285714285715 HP Printer
0.9407894736842104 Triby Speaker


In [34]:
# Print each feature's importance as a percentage next to its column name.
# feature_importances_ follows the column order of the training matrix,
# which was built from `features`, so the two are paired by position.
for importance, feature_name in zip(clf.feature_importances_, features):
    print(importance*100, feature_name)

1.6049200763694773 dns
0.5558706550150009 dhcp-server
3.9561746864745055 http
1.87495712864985 ntp
4.127454315194693 https
0.7284837436300361 smtp-ssl
0.0 imap-ssl
0.08472504007472002 ssdp
0.02632939976327437 icslap
0.10579889391150338 stm-pproc
0.8598378145226274 stun
0.0 ws-discovery
0.024594305329450932 upnp-evnt
0.6686216283771361 xmpp
0.36579362266138366 android
3.9911578272706425 dstPort25050
1.685290168614463 dstPort49152
1.2260587082859906 dstPort49153
0.14446535775781544 dstPort49154
1.6731969694489126 srcPort49152
2.4366264297705036 srcPort49153
1.9071782912069706 ephemeralDstPort
2.4829720978606638 registedDstPort
0.3398553991729085 IPv4
0.28745532087552045 IPv6
1.5327626906991365 TCP
1.6330095311143096 UDP
22.23647442570796 duration
10.136399265914424 packetCount
28.98787625344822 totalPayloadByte
4.315659952877902 appplicationPayloadByte


In [35]:
# Persist the fitted classifier to the working directory. dump() returns the
# list of files it wrote, which the notebook displays as the cell output.
joblib.dump(value=clf, filename='multiclass_classify_model.pkl')

['multiclass_classify_model.pkl']