In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score

In [2]:
#Loading iot synthetic data
iotsim = pd.read_csv('mainSimulationAccessTraces2.csv', sep=',')


In [3]:
iotsim.head()

Unnamed: 0,sourceID,sourceAddress,sourceType,sourceLocation,destinationServiceAddress,destinationServiceType,destinationLocation,accessedNodeAddress,accessedNodeType,operation,value,timestamp,sevenWayLabel,twoWayLabel
0,lightcontrol2,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,BedroomParents,/agent2/lightcontrol2,/lightControler,registerService,2456540000.0,1520000000000.0,0,0
1,lightcontrol3,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,Dinningroom,/agent3/lightcontrol3,/lightControler,registerService,2456540000.0,1520000000000.0,0,0
2,lightcontrol1,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,BedroomChildren,/agent1/lightcontrol1,/lightControler,registerService,2456540000.0,1520000000000.0,0,0
3,lightcontrol4,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,Kitchen,/agent4/lightcontrol4,/lightControler,registerService,2456540000.0,1520000000000.0,0,0
4,movement4,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,Kitchen,/agent4/movement4,/movementSensor,registerService,2456540000.0,1520000000000.0,0,0


In [4]:
iotsim.count()

sourceID                     357941
sourceAddress                357941
sourceType                   357941
sourceLocation               357941
destinationServiceAddress    357941
destinationServiceType       357941
destinationLocation          357941
accessedNodeAddress          357941
accessedNodeType             357941
operation                    357941
value                        356039
timestamp                    357941
sevenWayLabel                357941
twoWayLabel                  357941
dtype: int64

In [5]:
iotsim['value'] = iotsim['value'].fillna(iotsim['value'].mean())

In [6]:
iotsim.count()

sourceID                     357941
sourceAddress                357941
sourceType                   357941
sourceLocation               357941
destinationServiceAddress    357941
destinationServiceType       357941
destinationLocation          357941
accessedNodeAddress          357941
accessedNodeType             357941
operation                    357941
value                        357941
timestamp                    357941
sevenWayLabel                357941
twoWayLabel                  357941
dtype: int64

In [7]:
#Encoding nominal features using OneHotEncoder
categorical_columns = ['sourceID','sourceAddress','sourceType','sourceLocation','destinationServiceAddress','destinationServiceType','destinationLocation','accessedNodeAddress','accessedNodeType','operation']
data_categorical = iotsim[categorical_columns]
encoder = OneHotEncoder(sparse=False)
data_encoded = encoder.fit_transform(data_categorical)




In [8]:
print(f"The encoded dataset contains {data_encoded.shape[1]} features")

The encoded dataset contains 503 features


In [9]:
#Add meaningful column names and creating a data frame to be used in classification
columns_encoded = encoder.get_feature_names_out(data_categorical.columns)
data_encoded_pd_frame = pd.DataFrame(data_encoded, columns=columns_encoded)

In [10]:
data_encoded_pd_frame.head()

Unnamed: 0,sourceID_battery1,sourceID_battery2,sourceID_battery3,sourceID_battery4,sourceID_battery5,sourceID_battery6,sourceID_doorlock1,sourceID_doorlock2,sourceID_doorlock3,sourceID_doorlock4,...,accessedNodeType_/movementSensor,accessedNodeType_/noType,accessedNodeType_/sensorService,accessedNodeType_/smartPhone,accessedNodeType_/thermostat,accessedNodeType_/washingService,operation_lockSubtree,operation_read,operation_registerService,operation_write
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
#Merging the value and normality columns to the data frame
data_encoded_pd_frame['value'] = iotsim['value']
data_encoded_pd_frame['twoWayLabel'] = iotsim['twoWayLabel']

In [12]:
data_encoded_pd_frame.head()

Unnamed: 0,sourceID_battery1,sourceID_battery2,sourceID_battery3,sourceID_battery4,sourceID_battery5,sourceID_battery6,sourceID_doorlock1,sourceID_doorlock2,sourceID_doorlock3,sourceID_doorlock4,...,accessedNodeType_/sensorService,accessedNodeType_/smartPhone,accessedNodeType_/thermostat,accessedNodeType_/washingService,operation_lockSubtree,operation_read,operation_registerService,operation_write,value,twoWayLabel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2456540000.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2456540000.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2456540000.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2456540000.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2456540000.0,0


In [13]:
data_encoded_pd_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357941 entries, 0 to 357940
Columns: 505 entries, sourceID_battery1 to twoWayLabel
dtypes: float64(504), int64(1)
memory usage: 1.3 GB


In [14]:
#Stats about the dataset
data_encoded_pd_frame['twoWayLabel'].unique()

array([0, 1])

In [15]:
X = data_encoded_pd_frame.drop(['twoWayLabel'], axis=1)
y = data_encoded_pd_frame['twoWayLabel']

In [16]:
X.count()

sourceID_battery1            357941
sourceID_battery2            357941
sourceID_battery3            357941
sourceID_battery4            357941
sourceID_battery5            357941
                              ...  
operation_lockSubtree        357941
operation_read               357941
operation_registerService    357941
operation_write              357941
value                        357941
Length: 504, dtype: int64

In [17]:
#Train and Test splitting of the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [18]:
#Apply standard scaling to get optimized results
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

# Random forest classifier with 100 estimators

In [19]:
rfc = RandomForestClassifier(n_estimators=100) #size of the model
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)
accuracy = accuracy_score(y_test, pred_rfc)

In [20]:
#Printing classification metrics
print(classification_report(y_test, pred_rfc, digits=3))
print(confusion_matrix(y_test, pred_rfc))
print(f"Accuracy score: {accuracy:.3f}")

              precision    recall  f1-score   support

           0      0.994     0.999     0.997     69558
           1      0.977     0.787     0.872      2031

    accuracy                          0.993     71589
   macro avg      0.986     0.893     0.934     71589
weighted avg      0.993     0.993     0.993     71589

[[69521    37]
 [  433  1598]]
Accuracy score: 0.993


# Support Vector Machines

In [21]:
clf = svm.SVC()
clf.fit(X_train, y_train)
pred_clf = clf.predict(X_test)
accuracy = accuracy_score(y_test, pred_clf)

In [22]:
#Printing classification metrics
print(classification_report(y_test, pred_clf, digits=3))
print(confusion_matrix(y_test, pred_clf))
print(f"Accuracy score: {accuracy:.3f}")

              precision    recall  f1-score   support

           0      0.993     0.999     0.996     69558
           1      0.975     0.774     0.863      2031

    accuracy                          0.993     71589
   macro avg      0.984     0.887     0.930     71589
weighted avg      0.993     0.993     0.993     71589

[[69517    41]
 [  459  1572]]
Accuracy score: 0.993


# MLP Classifier

In [23]:
mlpc = MLPClassifier(hidden_layer_sizes=(11,11,11), max_iter=200)
mlpc.fit(X_train, y_train)
pred_mlpc = mlpc.predict(X_test)
accuracy = accuracy_score(y_test, pred_mlpc)

In [24]:
#Printing classification metrics
print(classification_report(y_test, pred_mlpc, digits=3))
print(confusion_matrix(y_test, pred_mlpc))
print(f"Accuracy score: {accuracy:.3f}")

              precision    recall  f1-score   support

           0      0.994     0.999     0.997     69558
           1      0.975     0.789     0.872      2031

    accuracy                          0.993     71589
   macro avg      0.984     0.894     0.934     71589
weighted avg      0.993     0.993     0.993     71589

[[69517    41]
 [  429  1602]]
Accuracy score: 0.993
