In [1]:
import warnings
# Install a blanket 'ignore' filter so no Python warnings are shown from here on.
warnings.filterwarnings('ignore')

In [4]:
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer, InputExample, losses, models, util
from torch.utils.data import DataLoader
from torch import nn
from tqdm.auto import tqdm
from DLAIUtils import Utils
import torch
import time
import os

In [5]:
# Obtain the Pinecone API key through the DLAIUtils helper instead of hard-coding it.
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()

In [6]:
# Build the index name with the course helper and open a Pinecone client.
INDEX_NAME = utils.create_dlai_index_name('dl-ai')
pinecone = Pinecone(api_key=PINECONE_API_KEY)

# Drop an index of the same name if one already exists, so the notebook
# always starts from an empty index.
existing_names = [idx.name for idx in pinecone.list_indexes()]
if INDEX_NAME in existing_names:
    pinecone.delete_index(INDEX_NAME)

# dimension=256 corresponds to the 256-feature Dense output layer of the
# embedding model built later in this notebook.
serverless_spec = ServerlessSpec(cloud='aws', region='us-west-2')
pinecone.create_index(
    name=INDEX_NAME,
    dimension=256,
    metric='cosine',
    spec=serverless_spec,
)
index = pinecone.Index(INDEX_NAME)

In [7]:
# Download the lesson archive from Dropbox and save it locally as training.tar.zip.
!wget -q --show-progress -O training.tar.zip "https://www.dropbox.com/scl/fi/rihfngx4ju5pzjzjj7u9z/lesson6.tar.zip?rlkey=rct9a9bo8euqgshrk8wiq2orh&dl=1"





In [8]:
# Unpack the downloaded archive; per the cell output this yields lesson6.tar.
!tar -xzvf training.tar.zip

x lesson6.tar


In [9]:
# Unpack the inner archive; per the cell output this yields sample.log and training.txt.
!tar -xzvf lesson6.tar

x sample.log
x training.txt


In [10]:
# Preview the first five log lines that will later be embedded and indexed.
!head -5 sample.log

Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 by access-group "acl_dmz" [0xe3aab522, 0x0]
Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 type 3, code 0, by access-group "acl_dmz" [0xe3aab522, 0x0]
Apr 15 2014 09:34:34 EDT: %ASA-session-5-106100: access-list acl_in permitted tcp inside/10.1.2.16(2241) -> outside/192.0.0.89(2000) hit-cnt 1 first hit [0x71a87d94, 0x0]
Apr 24 2013 16:00:28 INT-FW01 : %ASA-6-106100: access-list inside denied udp inside/172.29.2.101(1039) -> outside/192.0.2.10(53) hit-cnt 1 first hit [0xd820e56a, 0x0]
Apr 24 2013 16:00:27 INT-FW01 : %ASA-6-106100: access-list inside permitted udp inside/172.29.2.3(1065) -> outside/192.0.2.57(53) hit-cnt 144 300-second interval [0xe982c7a4, 0x0]


In [11]:
# Preview the training pairs; each line reads "<log a> ^ <log b> ^ <similarity score>".
!head -5 training.txt

Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 by access-group "acl_dmz" [0xe3aab522, 0x0] ^ Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 by access-group "acl_dmz" [0xe3aab522, 0x0] ^ 1.0
Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 type 3, code 0, by access-group "acl_dmz" [0xe3aab522, 0x0] ^ Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 by access-group "acl_dmz" [0xe3aab522, 0x0] ^ 0.9
Apr 15 2014 09:34:34 EDT: %ASA-session-5-106100: access-list acl_in permitted tcp inside/10.1.2.16(2241) -> outside/192.0.0.89(2000) hit-cnt 1 first hit [0x71a87d94, 0x0] ^ Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 by access-group "acl_dmz" [0xe3aab522, 0x0] ^ 0.8
Apr 24 2013 16:00:28 INT-FW01 : %ASA-6-106100: access-list inside denied udp inside/172.29.2.1

## Setup model
We use the `bert-base-uncased` model, wrapped as a Sentence Transformers model, which maps sentences to a 256-dimensional dense vector space.

In [14]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# bert-base-uncased ships with 512 position embeddings, so the sequence limit
# must not exceed 512: a longer limit would let over-long inputs index past the
# position-embedding table instead of being truncated by the tokenizer.
word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=512)

# Pool the per-token embeddings into a single sentence vector.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

# Project the pooled vector down to 256 features; this is the size the
# Pinecone index in this notebook is created with (dimension=256).
dense_model = models.Dense(
    in_features=pooling_model.get_sentence_embedding_dimension(),
    out_features=256,
    activation_function=nn.Tanh(),
)

# Chain transformer -> pooling -> dense into one sentence-embedding model.
model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model, dense_model],
    device=device,
)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Train the model

In [16]:
# Parse training.txt into sentence-pair examples. Each non-blank line has the
# form "<log a> ^ <log b> ^ <similarity score>".
train_examples = []
with open('./training.txt', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # Strip every field: splitting on '^' alone would leave the spaces of
        # the " ^ " separator attached to both log texts.
        a, b, label = (field.strip() for field in line.split('^'))
        train_examples.append(InputExample(texts=[a, b], label=float(label)))

# Define the data loader and the training loss.
warmup_steps = 100
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

(Note: load_pretrained_model = True): We've saved the trained model and are loading it here for speedier results, allowing you to observe the outcomes faster. Once you've done an initial run, you may set load_pretrained_model to False to train the model yourself. This can take some time to finish, depending on the value you set for the epochs.

In [17]:
import pickle

# True  -> load the previously saved model from ./pretrained_model.
# False -> fine-tune `model` in this notebook (slower).
load_pretrained_model = False
if load_pretrained_model:
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only load a pretrained_model file that comes from a trusted source.
    with open('./pretrained_model', 'rb') as trained_model_file:
        db = pickle.load(trained_model_file)
    # Rebind `model` so the later encode step uses the loaded weights rather
    # than the freshly initialised, untrained network.
    # NOTE(review): assumes the pickle holds the trained SentenceTransformer
    # object itself — confirm against how the file was saved.
    model = db
else:
    # Fine-tune on the sentence pairs using the loader, loss and warm-up
    # length defined in the training-data cell.
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=16,
        warmup_steps=warmup_steps,
    )

# Gather every non-blank line of sample.log, with surrounding whitespace removed.
samples = []
with open('./sample.log', 'r') as f:
    samples.extend(text for text in map(str.strip, f) if text)

Epoch:   0%|          | 0/16 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

## Create embeddings and upsert to pinecone

In [18]:
# Embed every sample log line; emb[i] is the vector for samples[i].
emb = model.encode(samples)

In [19]:
# One record per log line: the id is the line's position (as a string), the
# values are its embedding, and the raw log text is kept as metadata.
prepped = [
    {'id': str(pos), 'values': emb[pos].tolist(), 'metadata': {'log': samples[pos]}}
    for pos in tqdm(range(len(samples)))
]
index.upsert(prepped)

  0%|          | 0/90 [00:00<?, ?it/s]

{'upserted_count': 90}

In [None]:
## Finding the anomaly

In [20]:
# Take the first sample line as the reference "good" log entry to compare against.
good_log_line = samples[0]

In [21]:
# Show the reference log line.
print(good_log_line)

Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 by access-group "acl_dmz" [0xe3aab522, 0x0]


In [22]:
# After the upserts, it might take a few seconds for the index to be ready for
# queries, so poll every two seconds until matches come back. The number of
# attempts is bounded so the cell cannot hang forever if the index never
# returns results.
MAX_QUERY_ATTEMPTS = 30  # 30 attempts x 2 s = roughly one minute
results = []
for _attempt in range(MAX_QUERY_ATTEMPTS):
    time.sleep(2)
    # Query with the embedding of samples[0], the reference log line.
    queried = index.query(
        vector=emb[0].tolist(),
        include_metadata=True,
        top_k=100
    )
    results = queried['matches']
    print(".:. ", end="")
    if len(results) > 0:
        break
else:
    raise TimeoutError(
        f"index returned no matches after {MAX_QUERY_ATTEMPTS} attempts"
    )

.:. 

In [23]:
# Print score and log text for up to ten of the leading matches. Slicing
# avoids an IndexError when the query returned fewer than ten matches.
for match in results[:10]:
  print(f"{round(match['score'], 4)}\t{match['metadata']['log']}")

1.0	Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 by access-group "acl_dmz" [0xe3aab522, 0x0]
0.9782	Apr 15 2013 09:36:50: %ASA-4-106023: Deny tcp src dmz:10.1.2.30/63016 dst outside:192.0.0.8/53 type 3, code 0, by access-group "acl_dmz" [0xe3aab522, 0x0]
0.9644	Apr 30 2013 09:23:41: %ASA-4-106023: Deny tcp src outside:192.0.2.126/53638 dst inside:10.0.0.132/8111 by access-group "acl_out" [0x71761f18, 0x0]
0.9623	Apr 30 2013 09:23:40: %ASA-4-106023: Deny tcp src outside:192.0.2.126/53638 dst inside:10.0.0.132/8111 by access-group "acl_out" [0x71761f18, 0x0]
0.9453	Sep 12 2014 06:53:01 GIFRCHN01 : %ASA-4-106023: Deny tcp src outside:192.0.2.95/24069 dst inside:10.32.112.125/25 by access-group "PERMIT_IN" [0x0, 0x0]"
0.9097	Dec 11 2018 08:01:39 <IP>: %ASA-4-106023: Deny udp src dmz:192.168.1.34/5679 dst outside:192.0.0.12/5000 by access-group "dmz" [0x123a465e, 0x8c20f21]
0.9008	Apr 29 2013 12:59:50: %ASA-6-305011: Built dynamic TCP transl

In [24]:
# Position of the final entry in results — presumably the least similar match,
# since the printed scores above appear in descending order (verify).
last_element = len(results) -1 

In [25]:
# Print the score and log text of the final match, the anomaly candidate.
print(f"{round(results[last_element]['score'], 4)}\t{results[last_element]['metadata']['log']}")

0.3411	dec 31, 2021 09:18:59: %ASA-4-434005: seg fault detected in the matrix
