In [35]:
from pickle import dump, load
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import  classification_report

In [36]:
df = pd.read_csv("../data/processed/final_df.csv")
columns = list(df.columns)
features = columns[4:]
df[features] = df[features].div(df.total_events, axis = 0)
# Separating out the features
X = df.loc[:, features].values
y = df.loc[:, ['is_malicious']].values
print(X)
print(y)

[[0.         0.         0.         ... 1.         1.         0.        ]
 [0.         0.         0.         ... 1.         1.         0.        ]
 [0.         0.         0.         ... 0.75       1.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.72701476 0.        ]
 [0.         0.         0.         ... 1.00056786 0.7274276  0.        ]
 [0.         0.         0.         ... 1.         0.72701476 0.        ]]
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [37]:
smote = SMOTE(random_state=0, sampling_strategy="minority")
X_os, y_os = smote.fit_resample(X, y)
X_train_os, X_test_os, y_train_os, y_test_os = train_test_split(X_os, y_os, test_size = 0.2, random_state=2)

In [38]:
model = Pipeline([('scaler', StandardScaler()), ('forest', RandomForestClassifier())])
model.fit(X_train_os, y_train_os)

Pipeline(steps=[('scaler', StandardScaler()),
                ('forest', RandomForestClassifier())])

In [39]:
y_pred_os = model.predict(X_test_os)
print(classification_report(y_test_os, y_pred_os))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     12930
           1       0.99      0.99      0.99     12767

    accuracy                           0.99     25697
   macro avg       0.99      0.99      0.99     25697
weighted avg       0.99      0.99      0.99     25697



In [40]:
# with open('../src/models/final_model.sav', 'wb') as f:
#     dump(model, f)

In [41]:
with open('../src/models/final_model.sav', 'rb') as f:
    loaded_model = load(f)

In [8]:
y_pred_os = loaded_model.predict(X_test_os)
print(classification_report(y_test_os, y_pred_os))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     12930
           1       0.99      1.00      0.99     12767

    accuracy                           0.99     25697
   macro avg       0.99      0.99      0.99     25697
weighted avg       0.99      0.99      0.99     25697



In [9]:
df[df.is_malicious == 1]

Unnamed: 0,time,host,total_events,is_malicious,event_1,event_3,event_8,event_10,event_11,event_12,...,event_4672,event_4698,event_4768,event_5140,event_5145,event_5156,event_5158,total_authN_events,hash_attack,phishing
92,576,Comp581624,311,1,0.000021,0.000000e+00,0.0,0.0,0.000000,0.000000e+00,...,0.000165,0.0,0.0,0.0,0.0,0.0,0.0,0.002812,0.003195,0.000021
93,578,Comp581624,313,1,0.000020,2.041462e-05,0.0,0.0,0.000000,0.000000e+00,...,0.000163,0.0,0.0,0.0,0.0,0.0,0.0,0.002776,0.003154,0.000020
95,583,Comp581624,319,1,0.000029,1.965389e-05,0.0,0.0,0.000000,0.000000e+00,...,0.000157,0.0,0.0,0.0,0.0,0.0,0.0,0.002722,0.003086,0.000029
102,676,Comp581624,334,1,0.000036,1.792822e-05,0.0,0.0,0.000000,0.000000e+00,...,0.000152,0.0,0.0,0.0,0.0,0.0,0.0,0.002591,0.002940,0.000036
104,680,Comp581624,339,1,0.000035,2.610489e-05,0.0,0.0,0.000000,0.000000e+00,...,0.000148,0.0,0.0,0.0,0.0,0.0,0.0,0.002550,0.002889,0.000035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59642,73392,Comp337732,1920,1,0.000003,0.000000e+00,0.0,0.0,0.000002,0.000000e+00,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000516,0.000366,0.000004
60179,74592,Comp337732,1906,1,0.000003,0.000000e+00,0.0,0.0,0.000002,0.000000e+00,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000520,0.000371,0.000005
60184,74596,Comp337732,1906,1,0.000003,0.000000e+00,0.0,0.0,0.000002,0.000000e+00,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000521,0.000371,0.000005
62082,79877,Comp337732,1777,1,0.000000,0.000000e+00,0.0,0.0,0.000000,3.166833e-07,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000562,0.000409,0.000000


In [10]:
df = pd.read_csv("../data/processed/final_df.csv")
columns = list(df.columns)
features = columns[4:]
df[features] = df[features].div(df.total_events, axis = 0)
data = df.iloc[104,4:].values
data

array([3.480651926105759e-05, 2.6104889445793198e-05, 0.0, 0.0, 0.0, 0.0,
       0.0025495775358724687, 0.0, 0.00019143585593581677, 0.0, 0.0, 0.0,
       0.00014792770685949477, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0025495775358724687, 0.0028889410986677804,
       3.480651926105759e-05], dtype=object)

In [11]:
loaded_model.predict(data.reshape(1,-1))[0]

1

# Test Data

In [42]:
df = pd.read_csv("../data/processed/test_df.csv")
columns = list(df.columns)

In [43]:
columns

['time',
 'host',
 'total_events',
 'is_malicious',
 'event_1',
 'event_3',
 'event_8',
 'event_10',
 'event_11',
 'event_12',
 'event_4624',
 'event_4625',
 'event_4648',
 'event_4658',
 'event_4661',
 'event_4663',
 'event_4672',
 'event_4698',
 'event_4768',
 'event_5140',
 'event_5145',
 'event_5156',
 'event_5158',
 'total_authN_events',
 'hash_attack',
 'phishing']

In [44]:
df[df.is_malicious == 1]

Unnamed: 0,time,host,total_events,is_malicious,event_1,event_3,event_8,event_10,event_11,event_12,...,event_4672,event_4698,event_4768,event_5140,event_5145,event_5156,event_5158,total_authN_events,hash_attack,phishing
92,576,Comp581624,311,1,2,0,0,0,0,0,...,16,0,0,0,0,0,0,272,309,2
93,578,Comp581624,313,1,2,2,0,0,0,0,...,16,0,0,0,0,0,0,272,309,2
95,583,Comp581624,319,1,3,2,0,0,0,0,...,16,0,0,0,0,0,0,277,314,3
102,676,Comp581624,334,1,4,2,0,0,0,0,...,17,0,0,0,0,0,0,289,328,4
104,680,Comp581624,339,1,4,3,0,0,0,0,...,17,0,0,0,0,0,0,293,332,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59680,73392,Comp337732,1920,1,10,0,0,0,6,0,...,0,0,0,0,0,0,0,1904,1351,16
60217,74592,Comp337732,1906,1,11,0,0,0,6,0,...,0,0,0,0,0,0,0,1890,1348,17
60222,74596,Comp337732,1906,1,12,0,0,0,6,0,...,0,0,0,0,0,0,0,1892,1349,18
62120,79877,Comp337732,1777,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1776,1293,0


In [45]:
features = columns[4:]
df[features] = df[features].div(df.total_events, axis = 0)

In [46]:
data = df[df.is_malicious == 1]
data = data.iloc[:,4:].values
loaded_model.predict(data)

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [None]:
arr = loaded_model.predict(data)

In [47]:
arr.count(1)

182

In [48]:
arr.count(0)

606

In [49]:
df = pd.read_csv("../data/processed/test_df.csv")

In [51]:
df.to_csv("../data/processed/test_df.csv",encoding='utf-8')

In [52]:
for _, row in df.iterrows():
    print( row.values.tolist())

[3, 'Comp581624', 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0]
[5, 'Comp581624', 6, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 0]
[6, 'Comp581624', 8, 0, 0, 0, 0, 0, 0, 0, 6, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 8, 0]
[11, 'Comp581624', 10, 0, 0, 0, 0, 0, 0, 0, 8, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0]
[13, 'Comp581624', 13, 0, 0, 0, 0, 0, 0, 0, 11, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 13, 0]
[15, 'Comp581624', 44, 0, 0, 0, 0, 0, 0, 0, 24, 0, 11, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 24, 44, 0]
[16, 'Comp581624', 53, 0, 0, 0, 0, 0, 0, 0, 27, 0, 14, 0, 0, 0, 12, 0, 0, 0, 0, 0, 0, 27, 53, 0]
[18, 'Comp581624', 56, 0, 0, 0, 0, 0, 0, 0, 28, 0, 15, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 28, 56, 0]
[22, 'Comp581624', 57, 0, 0, 0, 0, 0, 0, 0, 28, 0, 16, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 28, 57, 0]
[30, 'Comp581624', 59, 0, 0, 0, 0, 0, 0, 0, 30, 0, 16, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 30, 59, 0]
[31, 'Comp581624', 61, 0, 0, 0, 0, 0, 0, 0, 32, 0, 16, 0, 