In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score


In [34]:
def convert_to_int(x):
    try:
        return int(x) if isinstance(x,str) else x
    except ValueError:
        return x

def convert_to_int_from_float(x):
    try:
        return int(x) if isinstance(x,float) else x
    except ValueError:
        return x

In [8]:
df_http = joblib.load('df_http.csv')

In [9]:
# I would like to drop the rows with NaN values
df_http = df_http.dropna(subset=['http.content_length_header'])

In [5]:
value_counts = df_http['http.content_length_header'].value_counts()
value_counts

http.content_length_header
896       9528
271       6025
0         4781
273       4188
274       4100
          ... 
562          1
6978         1
956          1
604          1
1230.0       1
Name: count, Length: 2559, dtype: int64

In [11]:
chosen_feature = 'http.content_length_header'

In [13]:
# I would like to see the rows that this value is not float
df_http['http.content_length_header'].value_counts()

http.content_length_header
896       9528
271       6025
0         4781
273       4188
274       4100
          ... 
562          1
6978         1
956          1
604          1
1230.0       1
Name: count, Length: 2559, dtype: int64

In [23]:
df_http['http.content_length_header'].info()

<class 'pandas.core.series.Series'>
Index: 333432 entries, 1 to 999999
Series name: http.content_length_header
Non-Null Count   Dtype 
--------------   ----- 
333432 non-null  object
dtypes: object(1)
memory usage: 5.1+ MB


In [30]:
# Convert 'alert' to a binary format where 'suspicious' is -1 and 'benign' is 1
df_http['alert'] = df_http['alert'].replace({'suspicious': -1, 'benign': 1})

In [28]:
# I need to write a function to change the string values to int numbers as the output of the Isolation forest prediction will be -1 or 1.
df_http['http.content_length_header'] = df_http['http.content_length_header'].apply(convert_to_int)

In [36]:
df_http['http.content_length_header'] = df_http['http.content_length_header'].apply(convert_to_int_from_float)

In [37]:
non_int_values = df_http[df_http['http.content_length_header'].apply(lambda x: not isinstance(x, int))]

In [38]:
non_int_values['http.content_length_header'].value_counts()

http.content_length_header
10.20.30.101                         468
application/x-www-form-urlencoded      6
Name: count, dtype: int64

I checked where this has come from. It is coming from the next cell 'http.content_type' that has leaked into this column. Seems the csv file hasn't properly realised that this is part of the next column. I will drop these rows for now.

In [39]:
df_http = df_http[df_http['http.content_length_header'] != '10.20.30.101']

In [40]:
df_http = df_http[df_http['http.content_length_header'] != 'application/x-www-form-urlencoded']

In [41]:
df_http['http.content_length_header'].info()

<class 'pandas.core.series.Series'>
Index: 332958 entries, 1 to 999999
Series name: http.content_length_header
Non-Null Count   Dtype 
--------------   ----- 
332958 non-null  object
dtypes: object(1)
memory usage: 5.1+ MB


In [43]:
non_int_values_2 = df_http[df_http['http.content_length_header'].apply(lambda x: not isinstance(x, int))]
non_int_values_2['http.content_length_header'].value_counts()

Series([], Name: count, dtype: int64)

In [44]:
x = df_http[chosen_feature].values
y = df_http['alert'].values

In [45]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=42)

In [46]:
params = dict(n_estimators=50,
              max_samples='auto',
              contamination='auto',
              random_state=42,
              verbose=1,
              warm_start=True,
              num_datapoints=len(x),
              n_jobs = -1,
              feature=chosen_feature)
# wandb.init(project='network-traffic-anomaly-detection', config=params,group='single_feature_http.content_length_header')

In [47]:
clf = IsolationForest(n_estimators=params['n_estimators'],
                      max_samples=params['max_samples'],
                      contamination=params['contamination'],
                      random_state=params['random_state'],
                      warm_start=params['warm_start'],
                      verbose=params['verbose'],
                      n_jobs=params['n_jobs'])

In [43]:
# wandb.config.update(clf.get_params())

In [48]:
print(f'X_train shape is {X_train.shape}\ny_train shape is {y_train.shape}' )

X_train shape is (233070,)
y_train shape is (233070,)


In [50]:
clf.fit(X_train.reshape(-1,1))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.2s remaining:    0.7s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    0.2s finished


In [51]:
scores = clf.decision_function(X_train.reshape(-1,1))

In [52]:
# wandb.summary['scores'] = scores
predictions = clf.predict(X_train.reshape(-1,1))

In [53]:
predictions

array([1, 1, 1, ..., 1, 1, 1])

In [58]:
training_precision = precision_score(y_train,predictions)
training_recall = recall_score(y_train.astype('int64'), predictions)
print(f'training precision is : {training_precision} \ntraining recall is: {training_recall} ')

training precision is : 0.0001408672917417959 
training recall is: 0.14285714285714285 


This training precision is very low meaning that training the Isolation Forest model on this one feature is cannot detect much anomalies. The majority of the packets are benign.Next step is that I will try and apply the same approach for the entire dataset although I do not expect it to give me a good result. And then I would like to also try and train the model on more than one feature.

In [59]:
# wandb.finish()