In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score

In [2]:
def convert_to_int_from_str(x):
    """Parse a string into an int, passing everything else through.

    Non-string inputs are returned unchanged. A string that ``int()``
    rejects with ``ValueError`` (for example ``'10.20.30.101'`` or
    ``'896.0'``) is also returned unchanged.
    """
    if not isinstance(x, str):
        return x
    try:
        parsed = int(x)
    except ValueError:
        # Not an integer literal: hand the original string back.
        return x
    return parsed

def convert_to_int_from_float(x):
    """Convert a float to an int, passing everything else through.

    Non-float inputs are returned unchanged. Finite floats are converted
    with ``int()``, which truncates toward zero (``3.7`` becomes ``3``).
    Floats that have no integer representation (NaN and +/-infinity) are
    returned unchanged instead of raising.
    """
    if not isinstance(x, float):
        return x
    try:
        return int(x)
    except (ValueError, OverflowError):
        # int(nan) raises ValueError; int(inf) / int(-inf) raise
        # OverflowError. Both are left as-is so one bad value does not
        # abort an element-wise conversion.
        return x

In [4]:
# Load the raw CSV export as-is; column selection and cleaning are done in
# the cells that follow.
df = pd.read_csv(filepath_or_buffer='../attack-sample-3m.csv')

  df = pd.read_csv('../attack-sample-3m.csv')


In [5]:
# Keep only the feature column and the label column.
selected_columns = ['http.content_length_header', 'alert']
df = df.loc[:, selected_columns]

In [6]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(3000000, 2)

In [7]:
# Discard every row that has no content-length value.
df = df.dropna(
    axis=0,
    subset=['http.content_length_header'],
)

In [8]:
# (rows, columns) remaining once rows without a content length are gone.
df.shape

(999778, 2)

In [9]:
# Frequency of each distinct label value.
df['alert'].value_counts()

alert
suspicious    999053
benign           724
alert              1
Name: count, dtype: int64

In [10]:
# Encode the labels as -1 / 1. Any value that is not a key of the mapping is
# left untouched by replace().
alert_codes = {'suspicious': -1, 'benign': 1}
df['alert'] = df['alert'].replace(alert_codes)

In [11]:
# Frequency of each label value after encoding.
df['alert'].value_counts()

alert
-1       999053
1           724
alert         1
Name: count, dtype: int64

In [12]:
# Summary (non-null count, dtype, memory) of the label column.
df.alert.info()

<class 'pandas.core.series.Series'>
Index: 999778 entries, 0 to 2999996
Series name: alert
Non-Null Count   Dtype 
--------------   ----- 
999778 non-null  object
dtypes: object(1)
memory usage: 15.3+ MB


In [13]:
# Show every row whose label is still not an int after the encoding step.
df.loc[~df['alert'].apply(lambda value: isinstance(value, int))]

Unnamed: 0,http.content_length_header,alert
2042160,http.content_length_header,alert


In [14]:
# Remove the row whose label is the literal text 'alert' (the column name
# appearing as a data value).
df = df.loc[df['alert'].ne('alert')]

In [15]:
# Re-check: rows whose label is not an int (expected to be none now).
df.loc[~df['alert'].apply(lambda value: isinstance(value, int))]

Unnamed: 0,http.content_length_header,alert


In [16]:
# (rows, columns) after the non-integer label row has been removed.
df.shape

(999777, 2)

In [17]:
# Frequency of each distinct content-length value.
df.loc[:, 'http.content_length_header'].value_counts()

http.content_length_header
896       28516
271       18317
0         14538
273       12555
274       12510
          ...  
3140.0        1
378.0         1
471.0         1
863.0         1
706           1
Name: count, Length: 3660, dtype: int64

In [18]:
# Rows whose content-length value is not a Python int.
df.loc[
    ~df['http.content_length_header'].apply(lambda value: isinstance(value, int))
]

Unnamed: 0,http.content_length_header,alert
0,896,-1
1,222,-1
6,896,-1
8,273,-1
11,184,-1
...,...,...
2999986,2232,-1
2999990,273,-1
2999991,11,-1
2999992,22,-1


In [19]:
# Turn string values that hold an integer literal into ints; other values
# are passed through unchanged by the helper.
content_length = df['http.content_length_header']
df['http.content_length_header'] = content_length.apply(convert_to_int_from_str)

In [20]:
# Count the (content length, label) combinations that are still not ints.
df.loc[
    ~df['http.content_length_header'].apply(lambda value: isinstance(value, int))
].value_counts()

http.content_length_header  alert
10.20.30.101                -1       1401
896.0                       -1        465
271.0                       -1        314
0.0                         -1        242
273.0                       -1        216
                                     ... 
84.0                        -1          1
2267.0                      -1          1
2269.0                      -1          1
506.0                        1          1
673.0                       -1          1
Name: count, Length: 752, dtype: int64

In [21]:
# Turn float values into ints; other values are passed through unchanged by
# the helper.
content_length = df['http.content_length_header']
df['http.content_length_header'] = content_length.apply(convert_to_int_from_float)

In [22]:
# Count what is left that is not an int after both conversions.
df.loc[
    ~df['http.content_length_header'].apply(lambda value: isinstance(value, int))
].value_counts()

http.content_length_header         alert
10.20.30.101                       -1       1401
application/x-www-form-urlencoded  -1         22
Name: count, dtype: int64

In [23]:
# Drop the rows whose content-length field holds the text '10.20.30.101'.
df = df[df['http.content_length_header'].ne('10.20.30.101')]

In [24]:
# Drop the rows whose content-length field holds a MIME type string.
df = df[df['http.content_length_header'].ne('application/x-www-form-urlencoded')]

In [25]:
# Final check: an empty result means every content-length value is an int.
df.loc[
    ~df['http.content_length_header'].apply(lambda value: isinstance(value, int))
].value_counts()

Series([], Name: count, dtype: int64)

In [26]:
# Pull the single feature and the labels out of the frame as NumPy arrays.
x = df['http.content_length_header'].to_numpy()
y = df['alert'].to_numpy()

In [27]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although one label is very rare
# in the counts shown earlier in this notebook - confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [28]:
# scikit-learn estimators expect a 2-D (n_samples, n_features) matrix, so the
# single feature becomes one column. The held-out split gets the same shape
# so it can be passed to the fitted model without a separate reshape.
X_train = X_train.reshape(-1, 1)
X_test = X_test.reshape(-1, 1)

In [29]:
# Settings for the run. 'num_datapoints' records the dataset size; it is not
# one of the keys handed to the IsolationForest constructor below.
params = {
    'n_estimators': 50,
    'max_samples': 'auto',
    'contamination': 'auto',
    'random_state': 42,
    'verbose': 1,
    'warm_start': True,
    'num_datapoints': len(x),
    'n_jobs': -1,
}

In [30]:
# Build the estimator from the subset of `params` that are constructor
# arguments.
forest_kwargs = {
    key: params[key]
    for key in (
        'n_estimators',
        'max_samples',
        'contamination',
        'random_state',
        'warm_start',
        'verbose',
        'n_jobs',
    )
}
clf = IsolationForest(**forest_kwargs)

In [31]:
# Report the shapes of the training features and labels.
shape_report = f'X_train shape is {X_train.shape}\ny_train shape is {y_train.shape}'
print(shape_report)

X_train shape is (798683, 1)
y_train shape is (798683,)


In [32]:
# Fit on the features only; the labels are not passed to the estimator.
clf.fit(X=X_train)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.3s remaining:    1.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    0.4s finished


In [33]:
# Per-row decision-function score of the training data.
scores = clf.decision_function(X=X_train)

In [34]:
# Display the decision-function scores computed in the previous cell.
scores

array([ 0.13189537,  0.13400081, -0.03229222, ...,  0.14048371,
        0.01691253, -0.07026759])

In [35]:
# Predicted label for every training row.
predictions = clf.predict(X=X_train)

In [36]:
# Type of a single predicted label.
type(predictions[0])

numpy.int64

In [37]:
# The bare type(...) expression is evaluated but never shown: it is neither
# printed nor the last expression of the cell.
type(predictions)
# Values, shape and dtype of the predictions, one per line.
print(predictions, predictions.shape, predictions.dtype, sep='\n')

[ 1  1 -1 ...  1  1 -1]
(798683,)
int64


In [38]:
# The bare type(...) expression is evaluated but never shown: it is neither
# printed nor the last expression of the cell.
type(y_train)
# Values, shape and dtype of the training labels, one per line.
print(y_train, y_train.shape, y_train.dtype, sep='\n')

[-1 -1 -1 ... -1 -1 -1]
(798683,)
object


In [39]:
# Confirm that the labels can be cast to a 64-bit integer dtype.
y_train_as_int = y_train.astype('int64')
print(y_train_as_int.dtype)

int64


In [40]:
training_precision = precision_score(y_train.astype('int64'),predictions)

In [41]:
training_recall = recall_score(y_train.astype('int64'), predictions)

In [42]:
# Print both training metrics.
metrics_report = f'training precision is : {training_precision} \ntraining recall is: {training_recall} '
print(metrics_report)

training precision is : 8.857499544590825e-05 
training recall is: 0.0899830220713073 


Precision improved massively, but recall is very low.