# Модель парсинга лога (NER)

In [None]:
import spacy
from spacy.training.example import Example
from spacy.tokens import DocBin
from spacy.util import minibatch, compounding
from spacy.scorer import Scorer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import re
import json

In [None]:
# audit.log must be uploaded into the Colab session first; the dataset-preparation cell opens it by that name
from google.colab import files
uploaded = files.upload()

Saving audit.log to audit.log


In [None]:
# Mount Google Drive at /content/drive (the trained model is later saved under /content/drive/MyDrive)
from google.colab import drive
drive.mount('/content/drive')

### Подготовка данных

In [None]:
def extract_parameters(log):
    """Locate every ``key=value`` parameter in a log line.

    A value is either a double-quoted string or a run of non-whitespace
    characters.

    Args:
        log: Text of a single log record.

    Returns:
        A list of ``(start, end, label)`` tuples in order of appearance.
        ``start`` is the offset of the key's first character, ``end`` is the
        offset just past the value (closing quote included), and ``label`` is
        the key upper-cased.
    """
    pattern = r'(\w+)=("([^"]+)"|([^\s]+))'
    return [
        (found.start(1), found.end(2), found.group(1).upper())
        for found in re.finditer(pattern, log)
    ]

# Build the NER dataset from audit.log: one record per line that contains at least one parameter
with open("audit.log", "r", encoding='utf-8') as log_file, open("audit_ner_dataset.json", "w", encoding='utf-8') as json_file:
    json_data = []
    for raw_line in log_file:
        message = raw_line.strip()
        spans = extract_parameters(message)
        # Lines without any key=value pair carry no annotations and are left out
        if not spans:
            continue
        json_data.append({"log_message": message, "entities": spans})
    json.dump(json_data, json_file, ensure_ascii=False, indent=4)

In [None]:
# Read the dataset back from the JSON file
with open("audit_ner_dataset.json", "r", encoding='utf-8') as json_file:
    json_data = json.load(json_file)

# Keep only the first 70 records, reshaped into (text, {"entities": [...]}) pairs
data = [
    (record['log_message'], {'entities': record['entities']})
    for record in json_data[:70]
]

# Notebook display: preview the first two samples
data[:2]

[('type=DAEMON_START msg=audit(1714314919.366:3461): op=start ver=3.0.7 format=enriched kernel=6.5.0-28-generic auid=4294967295 pid=5713 uid=0 ses=4294967295 subj=unconfined  res=success\x1dAUID="unset" UID="root"',
  {'entities': [[0, 17, 'TYPE'],
    [18, 49, 'MSG'],
    [50, 58, 'OP'],
    [59, 68, 'VER'],
    [69, 84, 'FORMAT'],
    [85, 108, 'KERNEL'],
    [109, 124, 'AUID'],
    [125, 133, 'PID'],
    [134, 139, 'UID'],
    [140, 154, 'SES'],
    [155, 170, 'SUBJ'],
    [172, 183, 'RES'],
    [184, 196, 'AUID'],
    [197, 207, 'UID']]}),
 ('type=SYSCALL msg=audit(1714314919.363:537): arch=c000003e syscall=44 success=yes exit=60 a0=3 a1=7ffe69e55e30 a2=3c a3=0 items=0 ppid=5712 pid=5713 auid=4294967295 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=(none) ses=4294967295 comm="auditd" exe="/usr/sbin/auditd" subj=unconfined key=(null) \x1dARCH=x86_64 SYSCALL=sendto AUID="unset" UID="root" GID="root" EUID="root" SUID="root" FSUID="root" EGID="root" SGID="root" FSGID="roo

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the split repeatable
train_data, val_data = train_test_split(data, random_state=42, test_size=0.2)

# Report both split sizes (the held-out part is labelled "Test" in the output)
train_size, held_out_size = len(train_data), len(val_data)
print(f'Train dataset size: {train_size}')
print(f'Test dataset size: {held_out_size}')

Train dataset size: 56
Test dataset size: 14


### Обучение модели

In [None]:
# Start from the pretrained English pipeline (not a blank model)
nlp = spacy.load("en_core_web_sm")

# Reuse the pipeline's NER component; append a new one only if it has none
ner = nlp.get_pipe("ner") if "ner" in nlp.pipe_names else nlp.add_pipe("ner", last=True)

# Register each distinct label from the dataset once, in first-seen order
dataset_labels = dict.fromkeys(
    span[2]
    for _, annotations in data
    for span in annotations.get("entities")
)
for label in dataset_labels:
    ner.add_label(label)

# Training settings: number of passes over the training set and the optimizer
n_iter = 40
optimizer = nlp.resume_training()

# Names of every pipeline component except NER, to be switched off while training
other_pipes = [name for name in nlp.pipe_names if name != "ner"]

def evaluate(ner_model, data):
    """Score a pipeline on annotated samples.

    Args:
        ner_model: Pipeline object; its ``make_doc`` and ``evaluate`` methods
            are used.
        data: Iterable of ``(text, annotations)`` pairs.

    Returns:
        Whatever ``ner_model.evaluate`` returns for the built examples; the
        training loop reads the ``ents_f``, ``ents_p`` and ``ents_r`` keys
        from it.
    """
    examples = []
    for text, annotations in data:
        reference_doc = ner_model.make_doc(text)
        examples.append(Example.from_dict(reference_doc, annotations))
    return ner_model.evaluate(examples)

import random

# Train with every component except NER switched off.
# select_pipes is the spaCy v3 replacement for the deprecated disable_pipes.
with nlp.select_pipes(disable=other_pipes):
    for itn in range(n_iter):
        losses = {}
        # Reshuffle every epoch so the batches differ between passes
        random.shuffle(train_data)
        for batch in minibatch(train_data, size=compounding(4.0, 32.0, 1.001)):
            # One update per batch: updating sample-by-sample would make the
            # minibatching (and the compounding batch size) have no effect
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp.update(examples, drop=0.5, sgd=optimizer, losses=losses)

        # Report validation metrics and the accumulated NER loss every 5th epoch
        if (itn + 1) % 5 == 0:
            scorer = evaluate(nlp, val_data)
            f1_score = round(scorer["ents_f"], 2)
            precision = round(scorer["ents_p"], 2)
            recall = round(scorer["ents_r"], 2)
            loss = round(losses.get("ner", 0), 2)
            print(f"Iteration {itn + 1} - F1 Score: {f1_score} - Precision: {precision} - Recall: {recall} - Losses: {loss}")

# Save the trained pipeline to Google Drive
nlp.to_disk("/content/drive/MyDrive/parse_model")

Iteration 5 - F1 Score: 0.56 - Precision: 0.82 - Recall: 0.43 - Losses: 857.9
Iteration 10 - F1 Score: 0.71 - Precision: 0.78 - Recall: 0.65 - Losses: 358.53
Iteration 15 - F1 Score: 0.76 - Precision: 0.87 - Recall: 0.68 - Losses: 286.09
Iteration 20 - F1 Score: 0.66 - Precision: 0.87 - Recall: 0.53 - Losses: 282.57
Iteration 25 - F1 Score: 0.9 - Precision: 0.91 - Recall: 0.9 - Losses: 149.79
Iteration 30 - F1 Score: 0.92 - Precision: 0.93 - Recall: 0.91 - Losses: 137.68
Iteration 35 - F1 Score: 0.92 - Precision: 0.93 - Recall: 0.91 - Losses: 88.76
Iteration 40 - F1 Score: 0.93 - Precision: 0.93 - Recall: 0.92 - Losses: 78.27


### Тестирование

In [None]:
# Sample logs for a manual check of the trained model.
# They follow the format of the training records: quoted values use double
# quotes and the enriched (upper-case) fields are preceded by the \x1d
# separator. Without the separator "key=(null)" and "ARCH=x86_64" fuse into
# a single token.
new_logs = [
    'type=SYSCALL msg=audit(1714314919.363:537): arch=c000003e syscall=44 success=yes exit=60 a0=3 a1=7ffe69e55e30 a2=3c a3=0 items=0 ppid=5712 pid=5713 auid=4294967295 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=(none) ses=4294967295 comm="auditd" exe="/usr/sbin/auditd" subj=unconfined key=(null) \x1dARCH=x86_64 SYSCALL=sendto AUID="unset" UID="root" GID="root" EUID="root" SUID="root" FSUID="root" EGID="root" SGID="root" FSGID="root"',
    'type=CWD msg=audit(1714314919.419:539): cwd="/"',
    'type=PROCTITLE msg=audit(1714314919.363:538): proctitle="/sbin/auditd"',
]

# Reload the trained pipeline from the Google Drive location it was saved to
model_path = "/content/drive/MyDrive/parse_model"
nlp = spacy.load(model_path)

def analyze_logs(log):
    """Run the loaded pipeline on one log line and report its entities.

    Prints the stripped log line together with the recognised entities.

    Args:
        log: Text of a single log record.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, in document order.
    """
    entities = []
    for ent in nlp(log).ents:
        entities.append((ent.text, ent.label_))
    print(f"Log: {log.strip()}\nNamed Entities: {entities}\n")
    return entities

# Analyse every sample log (each call prints its entities) and keep the results
out = [analyze_logs(log) for log in new_logs]

Log: type=SYSCALL msg=audit(1714314919.363:537): arch=c000003e syscall=44 success=yes exit=60 a0=3 a1=7ffe69e55e30 a2=3c a3=0 items=0 ppid=5712 pid=5713 auid=4294967295 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=(none) ses=4294967295 comm='auditd' exe='/usr/sbin/auditd' subj=unconfined key=(null)ARCH=x86_64 SYSCALL=sendto AUID='unset' UID='root' GID='root' EUID='root' SUID='root' FSUID='root' EGID='root' SGID='root' FSGID='root'
Named Entities: [('type=SYSCALL', 'TYPE'), ('msg=audit(1714314919.363:537):', 'MSG'), ('arch=c000003e', 'ARCH'), ('syscall=44', 'SYSCALL'), ('success=yes', 'SUCCESS'), ('exit=60', 'EXIT'), ('a0=3', 'A0'), ('a1=7ffe69e55e30', 'A1'), ('a2=3c', 'A2'), ('a3=0', 'A3'), ('items=0', 'ITEMS'), ('ppid=5712', 'PPID'), ('pid=5713', 'PID'), ('auid=4294967295', 'AUID'), ('uid=0', 'UID'), ('gid=0', 'GID'), ('euid=0', 'EUID'), ('suid=0', 'SUID'), ('fsuid=0', 'FSUID'), ('egid=0', 'EGID'), ('sgid=0', 'SGID'), ('fsgid=0', 'FSGID'), ('tty=(none)', 'TTY'), ('ses=4

In [None]:
# Print every parsed log as "LABEL: value" lines, followed by a blank separator
for log in out:
    for entity_text, label in log:
        # Entity spans are model predictions and may lack '='. partition()
        # never raises, whereas split('=', 1)[1] would fail with IndexError;
        # in that case the whole span is shown as the value.
        _, separator, value = entity_text.partition('=')
        print(f"{label}: {value if separator else entity_text}")
    print("\n")

TYPE: SYSCALL
MSG: audit(1714314919.363:537):
ARCH: c000003e
SYSCALL: 44
SUCCESS: yes
EXIT: 60
A0: 3
A1: 7ffe69e55e30
A2: 3c
A3: 0
ITEMS: 0
PPID: 5712
PID: 5713
AUID: 4294967295
UID: 0
GID: 0
EUID: 0
SUID: 0
FSUID: 0
EGID: 0
SGID: 0
FSGID: 0
TTY: (none)
SES: 4294967295
EXE: '/usr/sbin/auditd'
SUBJ: unconfined
ARCH: (null)ARCH=x86_64
SYSCALL: sendto
AUID: 'unset'
UID: 'root'
FSGID: 'root'
KEY: 'root'
SADDR: 'root
FSUID: 'root'
FSGID: 'root'
FSGID: 'root'


TYPE: CWD
MSG: audit(1714314919.419:539):
EXE: '/'


TYPE: PROCTITLE
MSG: audit(1714314919.363:538):
PROCTITLE: '/sbin/auditd'


