## Can you predict the type of thousands of IoT and non-IoT devices?


https://www.kaggle.com/c/cybersecprague2019-challenge/data

### Prepare environment & get dataset

* we need mac_vendor_lookup to convert mac addresses into vendor names
* we need nltk stopwords for ntltk processing later in the pipeline

In [1]:
import nltk
!pip install mac_vendor_lookup
nltk.download('stopwords')
#!wget https://www.dropbox.com/s/redacted/dataset.zip?raw=1 -O dataset.zip
#!unzip dataset.zip



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\xct\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load & prepare dataset

In [2]:
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize

'''
Classes:
HOME_AUTOMATION    12303
PC                  6665
MEDIA_BOX           6060
TV                  5715
IP_PHONE            5193
PRINTER             4342
AUDIO               3141
GENERIC_IOT         2959
VOICE_ASSISTANT     2912
NAS                 2813
GAME_CONSOLE        2384
SURVEILLANCE        2037
MOBILE              1382
'''

with open('train.json', 'r') as f:
     df = pd.read_json(f.read(), lines=True)

Simple Service Discovery Protocol (SSDP). We convert the json object arrays into space separated strings.

In [3]:
data = df

def handle_ssdp(x):
    d = {}
    if type(x) is list:
        df = json_normalize(x)   
        for index, row in df.iterrows():
            nt = row['nt']
            loc = row['location']
            srv = row['server']            
            d['ssdp'] = nt + " " + loc + " " + srv + " "
    return d

data['ssdp_words'] = pd.DataFrame([handle_ssdp(x) for x in df['ssdp']])
data['ssdp_words'] = data['ssdp_words'].replace('', np.nan)
data.count()

device_class     57906
device_id        57906
dhcp             13505
ip               57906
mac              57906
mdns_services    30880
services         53859
ssdp             25350
upnp             22881
ssdp_words       25350
dtype: int64

Universal Plug and Play (UPnP). We convert the json object arrays into space separated strings.

In [4]:
def handle_upnp(x):
    d = {}
    if type(x) is list:
        df = json_normalize(x)   
        d['upnp'] = ''
        for index, row in df.iterrows():
            # model_name, model_description, manufacturer, multiple entries of these
            name = ''
            desc = ''
            manu = ''
            try:
                name = row['model_name']
            except KeyError:
                pass
            try:
                desc = row['model_description']
            except KeyError:
                pass
            try:
                manu = row['manufacturer']
            except KeyError:
                pass            
            d['upnp'] += str(name)+" "
            d['upnp'] += str(desc)+" "
            d['upnp'] += str(manu)+" "        
    return d

data['upnp_words'] = pd.DataFrame([handle_upnp(x) for x in df['upnp']])
data['upnp_words'] = data['upnp_words'].replace('', np.nan)
data.count()

device_class     57906
device_id        57906
dhcp             13505
ip               57906
mac              57906
mdns_services    30880
services         53859
ssdp             25350
upnp             22881
ssdp_words       25350
upnp_words       22881
dtype: int64

Multicast-DNS (mDNS) . We convert the arrays into space separated strings.

In [5]:
def handle_mdns(x):
    d = {}   
    if type(x) is list:        
        d['mdns'] = ''
        for name in x:
            d['mdns'] += name.replace("_"," ").replace("."," ")
    return d

data['mdns_words'] = pd.DataFrame([handle_mdns(x) for x in df['mdns_services']])
data['mdns_words'] = data['mdns_words'].replace('', np.nan)
data.count()

device_class     57906
device_id        57906
dhcp             13505
ip               57906
mac              57906
mdns_services    30880
services         53859
ssdp             25350
upnp             22881
ssdp_words       25350
upnp_words       22881
mdns_words       30880
dtype: int64

MAC Addresses. These encode vendor information in the first 3 bytes, so we use the MacLookup library to get the vendor names from the mac addresses.

In [6]:
from mac_vendor_lookup import AsyncMacLookup # for google colab remove the async and await bits

mac = AsyncMacLookup()
await mac.load_vendors()

async def handle_mac(x):
    d = {}
    try:
        d['mac'] = await mac.lookup(x)    
    except KeyError:
        pass
    return d

data['mac_words'] = pd.DataFrame([await handle_mac(x) for x in df['mac']])
data['mac_words'] = data['mac_words'].replace('', np.nan)
data.count()

device_class     57906
device_id        57906
dhcp             13505
ip               57906
mac              57906
mdns_services    30880
services         53859
ssdp             25350
upnp             22881
ssdp_words       25350
upnp_words       22881
mdns_words       30880
mac_words        57724
dtype: int64

Ports (tcp & udp). We encode every port-protocol combination into a unique string to vectorize them later. It is also possible to just use "protocol_port", but I wanted to avoid using numbers here (they will be filtered out later).

In [7]:
# generate map of random chars, hash would be good too, but we don't want numbers 
import random
import string

umap = {}
for i in range(65535):
  rnd = ''.join(random.choice(string.ascii_lowercase) for x in range(32))
  umap[i] = rnd

def handle_ports(x):
    d = {}
    if type(x) is list:
        df = json_normalize(x)   
        d['port_words'] = ''
        for index, row in df.iterrows():
            port = row['port']
            proto = row['protocol']            
            item = proto+umap[port]           
            d['port_words'] += item + " "
    return d

data['port_words'] = pd.DataFrame([handle_ports(x) for x in df['services']])
data['port_words'] = data['port_words'].replace('', np.nan)
data.count()

device_class     57906
device_id        57906
dhcp             13505
ip               57906
mac              57906
mdns_services    30880
services         53859
ssdp             25350
upnp             22881
ssdp_words       25350
upnp_words       22881
mdns_words       30880
mac_words        57724
port_words       49793
dtype: int64

We have a lot of strings at this point and combine them into a single column (which will be vectorized later).

In [8]:
def combine_words(df):
    # combine words into single words column
    for index, row in df.iterrows():
        row['words'] = ""
        keys = ['ssdp_words','upnp_words','mdns_words','mac_words','port_words']
        for key in keys:
            entry = row[key]
            if entry is not np.nan:
              row['words'] += entry
                
data_comb = data.copy()
data_comb['words'] = ""
combine_words(data_comb)

# drop all other now no longer required columns
data_comb.drop(columns=['dhcp','ip','mac','mdns_services','services','ssdp','upnp','ssdp_words','upnp_words','mdns_words','mac_words','port_words'], inplace=True)
data_comb.head()

Unnamed: 0,device_class,device_id,words
0,IP_PHONE,5347ada9-925c-400e-8a7c-9aedd3c142f6,"YEALINK(XIAMEN) NETWORK TECHNOLOGY CO.,LTD.tcp..."
1,MEDIA_BOX,2717684b-3937-4644-a33a-33f4226c43ec,upnp:rootdevice http://192.168.1.109:8081/XD/2...
2,AUDIO,f1fc42f4-c794-4cc5-ac13-a5097d722d92,urn:schemas-upnp-org:service:RenderingControl:...
3,GAME_CONSOLE,74ab1b5b-3cb6-4aee-8362-f3b2f016574c,spotify-connect tcp local AzureWave Technolo...
4,GENERIC_IOT,1fe43d89-329d-40fe-b948-d1cbe0fe6c96,Bematech International Corp.


Natural Language processing is used on the single word column. This eliminates some words that mean similar things, but have slightly different spelling, numbers, special characters etc.

In [9]:
#https://towardsdatascience.com/multi-class-text-classification-with-sklearn-and-nltk-in-python-a-software-engineering-use-case-779d4a28ba5
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

stemmer = PorterStemmer()
words = stopwords.words("english")

data_comb['words_clean'] = data_comb['words'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())
data_comb.head()

Unnamed: 0,device_class,device_id,words,words_clean
0,IP_PHONE,5347ada9-925c-400e-8a7c-9aedd3c142f6,"YEALINK(XIAMEN) NETWORK TECHNOLOGY CO.,LTD.tcp...",yealink xiamen network technolog co ltd tcpbfu...
1,MEDIA_BOX,2717684b-3937-4644-a33a-33f4226c43ec,upnp:rootdevice http://192.168.1.109:8081/XD/2...,upnp rootdevic http xd c dd b b c e ec allegro...
2,AUDIO,f1fc42f4-c794-4cc5-ac13-a5097d722d92,urn:schemas-upnp-org:service:RenderingControl:...,urn schema upnp org servic renderingcontrol ht...
3,GAME_CONSOLE,74ab1b5b-3cb6-4aee-8362-f3b2f016574c,spotify-connect tcp local AzureWave Technolo...,spotifi connect tcp local azurewav technolog i...
4,GENERIC_IOT,1fe43d89-329d-40fe-b948-d1cbe0fe6c96,Bematech International Corp.,bematech intern corp


## Model

The TfidfVectorizer is used vectorize the word column. Depending on the parameters this can be anything between 4k-30k features. Earlier I used SelectKBest to reduce the amount of features but it always made the result slightly worse, so I just took all features and ommitted this step.

The data is fed into a simple neural network with 2 dense and a dropout layer. I noticed that the architecture does not matter much, as changing the amount of neurons or layers hardly influenced the result. 

In [10]:
from sklearn.model_selection import train_test_split
#from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.models import Sequential
from keras import layers

def create_model():
    model = Sequential()
    model.add(Dense(100, input_dim=31950, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(13, activation='softmax'))
    # Compile model
    optimizer = Adam(lr=0.001)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])  
    return model

# the ngram range increases the amount of features a lot (4k, 15k, 31k,..), it seems to help a little to have it on 1,2 or 2,3
vectorizer = TfidfVectorizer(min_df= 2, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 3))

X = data_comb['words_clean']
Y = data_comb['device_class']

X.head()

# have to set a fixed random state so we can enter the input_dim in the model
# test_size=0.1, random_state=0 gave the best result so far
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=0)

clf = KerasClassifier(build_fn=create_model, epochs=30, batch_size=128, verbose=1)
pipeline = Pipeline([('vect', vectorizer),
                     #('chi',  SelectKBest(chi2, 'k='all')),
                     ('clf',  clf)])

model = pipeline.fit(X_train, y_train)

print(len(X))
print(classification_report(y_test, model.predict(X_test)))
print(confusion_matrix(y_test, model.predict(X_test)))
print(accuracy_score(y_test, model.predict(X_test)))

Using TensorFlow backend.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
57906
                 precision    recall  f1-score   support

          AUDIO       1.00      1.00      1.00       332
   GAME_CONSOLE       0.99      0.99      0.99       229
    GENERIC_IOT       1.00      1.00      1.00       295
HOME_AUTOMATION       1.00      1.00      1.00      1226
       IP_PHONE       1.00      1.00      1.00       519
      MEDIA_BOX       0.99      0.93      0.96       649
         MOBILE       0.86      0.97      0.91       130
            NAS       1.00      1.00      1.00       269
             PC       0.99      0.97      0.98       657
        PRINTER       1.00      1.00      1.00       434
   SURVEILLANCE

In [11]:
clf.model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 100)               3195100   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 13)                1313      
Total params: 3,196,413
Trainable params: 3,196,413
Non-trainable params: 0
_________________________________________________________________


### Create kaggle submission

In [12]:
# Load Data
df = None
with open('test.json', 'r') as f:
     df = pd.read_json(f.read(), lines=True)
data = df
# Prepare Data
data['ssdp_words'] = pd.DataFrame([handle_ssdp(x) for x in df['ssdp']])
data['ssdp_words'] = data['ssdp_words'].replace('', np.nan)
data['upnp_words'] = pd.DataFrame([handle_upnp(x) for x in df['upnp']])
data['upnp_words'] = data['upnp_words'].replace('', np.nan)
data['mdns_words'] = pd.DataFrame([handle_mdns(x) for x in df['mdns_services']])
data['mdns_words'] = data['mdns_words'].replace('', np.nan)
data['mac_words'] = pd.DataFrame([await handle_mac(x) for x in df['mac']])
data['mac_words'] = data['mac_words'].replace('', np.nan)
data['port_words'] = pd.DataFrame([handle_ports(x) for x in df['services']])
data['port_words'] = data['port_words'].replace('', np.nan)
data.count()

data_comb = data.copy()
data_comb['words'] = ""
combine_words(data_comb)

# drop all other now no longer required columns
data_comb.drop(columns=['dhcp','ip','mac','mdns_services','services','ssdp','upnp','ssdp_words','upnp_words','mdns_words','mac_words','port_words'], inplace=True)
data_comb.count()

device_id    77777
words        77777
dtype: int64

In [13]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

stemmer = PorterStemmer()
words = stopwords.words("english")

data_comb['words_clean'] = data_comb['words'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())
data_comb.count()

device_id      77777
words          77777
words_clean    77777
dtype: int64

In [14]:
# Predict
X = data_comb['words_clean']
y_pred = model.predict(X)
result = data_comb.copy()
result['Predicted'] = y_pred
print(len(result))
print(result.isnull().sum())

77777
device_id      0
words          0
words_clean    0
Predicted      0
dtype: int64


In [15]:
final = result.filter(['device_id', 'Predicted'])
final.rename(columns={"device_id": "Id"},inplace=True)
print(final.isnull().sum())
print(len(final))
final.head()
# save
final.to_csv('submission.csv', index=None, header=True)

Id           0
Predicted    0
dtype: int64
77777
