In [1]:
from pickle import dump, load
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import  classification_report

In [69]:
df = pd.read_csv("../data/processed/final_df.csv")
columns = list(df.columns)
features = columns[4:]
df[features] = df[features].div(df.total_events, axis = 0)
# Separating out the features
X = df.loc[:, features].values
y = df.loc[:, ['is_malicious']].values
print(X)
print(y)

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 1.00000000e+00
  1.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 1.66666667e-01
  1.66666667e-01 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 9.37500000e-02
  1.25000000e-01 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 5.67536890e-04
  4.12607694e-04 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 5.68181635e-04
  4.13076433e-04 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 5.67536890e-04
  4.12607694e-04 0.00000000e+00]]
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [72]:
smote = SMOTE(random_state=0, sampling_strategy="minority")
X_os, y_os = smote.fit_resample(X, y)
X_train_os, X_test_os, y_train_os, y_test_os = train_test_split(X_os, y_os, test_size = 0.2, random_state=2)

In [73]:
model = Pipeline([('scaler', StandardScaler()), ('forest', RandomForestClassifier())])
model.fit(X_train_os, y_train_os)

Pipeline(steps=[('scaler', StandardScaler()),
                ('forest', RandomForestClassifier())])

In [76]:
y_pred_os = model.predict(X_test_os)
print(classification_report(y_test_os, y_pred_os))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     12930
           1       0.99      1.00      0.99     12767

    accuracy                           0.99     25697
   macro avg       0.99      0.99      0.99     25697
weighted avg       0.99      0.99      0.99     25697



In [78]:
with open('../src/models/final_model.sav', 'wb') as f:
    dump(model, f)

In [2]:
with open('../src/models/final_model.sav', 'rb') as f:
    loaded_model = load(f)

In [82]:
y_pred_os = loaded_model.predict(X_test_os)
print(classification_report(y_test_os, y_pred_os))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     12930
           1       0.99      1.00      0.99     12767

    accuracy                           0.99     25697
   macro avg       0.99      0.99      0.99     25697
weighted avg       0.99      0.99      0.99     25697



In [98]:
df[df.is_malicious == 1]

Unnamed: 0,time,host,total_events,is_malicious,event_1,event_3,event_8,event_10,event_11,event_12,...,event_4672,event_4698,event_4768,event_5140,event_5145,event_5156,event_5158,total_authN_events,hash_attack,phishing
92,576,Comp581624,311,1,0.000021,0.000000e+00,0.0,0.0,0.000000,0.000000e+00,...,0.000165,0.0,0.0,0.0,0.0,0.0,0.0,0.002812,0.003195,0.000021
93,578,Comp581624,313,1,0.000020,2.041462e-05,0.0,0.0,0.000000,0.000000e+00,...,0.000163,0.0,0.0,0.0,0.0,0.0,0.0,0.002776,0.003154,0.000020
95,583,Comp581624,319,1,0.000029,1.965389e-05,0.0,0.0,0.000000,0.000000e+00,...,0.000157,0.0,0.0,0.0,0.0,0.0,0.0,0.002722,0.003086,0.000029
102,676,Comp581624,334,1,0.000036,1.792822e-05,0.0,0.0,0.000000,0.000000e+00,...,0.000152,0.0,0.0,0.0,0.0,0.0,0.0,0.002591,0.002940,0.000036
104,680,Comp581624,339,1,0.000035,2.610489e-05,0.0,0.0,0.000000,0.000000e+00,...,0.000148,0.0,0.0,0.0,0.0,0.0,0.0,0.002550,0.002889,0.000035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59642,73392,Comp337732,1920,1,0.000003,0.000000e+00,0.0,0.0,0.000002,0.000000e+00,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000516,0.000366,0.000004
60179,74592,Comp337732,1906,1,0.000003,0.000000e+00,0.0,0.0,0.000002,0.000000e+00,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000520,0.000371,0.000005
60184,74596,Comp337732,1906,1,0.000003,0.000000e+00,0.0,0.0,0.000002,0.000000e+00,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000521,0.000371,0.000005
62082,79877,Comp337732,1777,1,0.000000,0.000000e+00,0.0,0.0,0.000000,3.166833e-07,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000562,0.000409,0.000000


In [3]:
df = pd.read_csv("../data/processed/final_df.csv")
columns = list(df.columns)
features = columns[4:]
df[features] = df[features].div(df.total_events, axis = 0)
data = df.iloc[104,4:].values
data

array([3.480651926105759e-05, 2.6104889445793198e-05, 0.0, 0.0, 0.0, 0.0,
       0.0025495775358724687, 0.0, 0.00019143585593581677, 0.0, 0.0, 0.0,
       0.00014792770685949477, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0025495775358724687, 0.0028889410986677804,
       3.480651926105759e-05], dtype=object)

In [4]:
loaded_model.predict(data.reshape(1,-1))[0]

1