## 필요 라이브러리 설치

In [None]:
!pip install faker
!pip install openai
!pip install elasticsearch
!pip install sklearn

## 합성로그 생성 함수

In [1]:
import openai
import ast
import json
from elasticsearch import Elasticsearch, helpers
from sklearn.feature_extraction.text import TfidfVectorizer
from faker import Faker
import random

fake = Faker()

# 1. Apache HTTP Server (Common Log Format)
def generate_apache_log():
    return '{RemoteHost} - - [{Timestamp}] "{RequestMethod} {RequestURI} {Protocol}" {StatusCode} {ResponseSize}'.format(
        RemoteHost=fake.ipv4(),
        Timestamp=fake.date_time_this_year().strftime('%d/%b/%Y:%H:%M:%S %z'),
        RequestMethod=fake.http_method(),
        RequestURI=fake.uri(),
        Protocol='HTTP/1.1',
        StatusCode=random.choice([200, 404, 500]),
        ResponseSize=random.randint(100, 10000)
    )

# 2. Nginx (Combined Log Format)
def generate_nginx_log():
    return '{RemoteAddress} - {RemoteUser} [{Timestamp}] "{RequestMethod} {RequestURI} {Protocol}" {StatusCode} {ResponseSize} \
"{Referer}" "{UserAgent}"'.format(
        RemoteAddress=fake.ipv4(),
        RemoteUser='-',
        Timestamp=fake.date_time_this_year().strftime('%d/%b/%Y:%H:%M:%S %z'),
        RequestMethod=fake.http_method(),
        RequestURI=fake.uri(),
        Protocol='HTTP/1.1',
        StatusCode=random.choice([200, 404, 500]),
        ResponseSize=random.randint(100, 10000),
        Referer=fake.uri(),
        UserAgent=fake.user_agent()
    )

# 3. Syslog (RFC 5424)
def generate_syslog():
    return '<{Priority}>{Version} {Timestamp} {Hostname} {AppName} {ProcID} {MsgID} {StructuredData} {Message}'.format(
        Priority=random.randint(1, 191),
        Version=1,
        Timestamp=fake.date_time_this_year().isoformat(),
        Hostname=fake.hostname(),
        AppName=fake.word(),
        ProcID=random.randint(1000, 9999),
        MsgID=random.randint(1000, 9999),
        StructuredData='-',
        Message=fake.sentence()
    )

# 4. AWS CloudTrail
def generate_aws_cloudtrail_log():
    return '{{"eventVersion": "{EventVersion}", "userIdentity": {{"type": "IAMUser", "userName": "{UserName}"}}, \
"eventTime": "{Timestamp}", "eventSource": "{EventSource}", "eventName": "{EventName}", "awsRegion": "{AwsRegion}" , \
"sourceIPAddress": "{SourceIPAddress}", "userAgent": "{UserAgent}", "requestParameters": {{"key": "value"}}, \
"responseElements": {{"key": "value"}}, "requestID": "{RequestId}", "eventID": "{EventId}", "eventType": "AwsApiCall", \
"recipientAccountId": "{RecipientAccountId}"}}'.format(
        EventVersion='1.08',
        UserName=fake.user_name(),
        Timestamp=fake.date_time_this_year().isoformat(),
        EventSource='s3.amazonaws.com',
        EventName='GetObject',
        AwsRegion='us-east-1',
        SourceIPAddress=fake.ipv4(),
        UserAgent=fake.user_agent(),
        RequestId=fake.uuid4(),
        EventId=fake.uuid4(),
        RecipientAccountId=fake.random_number(digits=12)
    )

# 5. Microsoft Windows Event Log
def generate_windows_event_log():
    return '<Event xmlns="http://schemas.microsoft.com/win/2004/08/events/event"><System><Provider Name="{ProviderName}"/>\
<EventID>{EventID}</EventID><Level>{Level}</Level><TimeCreated SystemTime="{Timestamp}"/><SourceName>{SourceName}</SourceName>\
<Computer>{Computer}</Computer></System><EventData>{Message}</EventData></Event>'.format(
        ProviderName=fake.word(),
        EventID=random.randint(1000, 9999),
        Level=random.randint(1, 5),
        Timestamp=fake.date_time_this_year().isoformat(),
        SourceName=fake.word(),
        Computer=fake.hostname(),
        Message=fake.sentence()
    )

# 6. Linux Audit Log
def generate_linux_audit_log():
    return 'type={AuditType} msg=audit({Timestamp}): {Message}'.format(
        AuditType=fake.word(),
        Timestamp=fake.date_time_this_year().isoformat(),
        Message=fake.sentence()
    )

def generate_logs(sources, total_logs, random_logs):
    # 함수이름과 로그의 종류를 맵핑합니다.
    source_to_function = {
        'apache': generate_apache_log,
        'nginx': generate_nginx_log,
        'syslog': generate_syslog,
        'aws_cloudtrail': generate_aws_cloudtrail_log,
        'windows_event': generate_windows_event_log,
        'linux_audit': generate_linux_audit_log,
    }
    
    # 각각의 로그 종류 별로 생성할 로그의 수를 계산합니다.
    num_sources = len(sources)
    logs_per_source = [total_logs // num_sources] * num_sources
    if random_logs:
        for i in range(total_logs % num_sources):
            logs_per_source[i] += 1
        random.shuffle(logs_per_source)
    
    # 로그를 생성하고 리스트에 생성된 합성로그를 추가합니다.
    generated_logs = []
    for source, num_logs in zip(sources, logs_per_source):
        log_function = source_to_function[source]
        for _ in range(num_logs):
            generated_logs.append(log_function())
    
    return generated_logs



## OpenAI API를 활용한 로그 확장

In [13]:
# 합성 로그 생성
sources_to_use = ['apache']
total_logs_to_generate = 15
random_logs_per_source = True
logs = generate_logs(sources_to_use, total_logs_to_generate, random_logs_per_source)


stringifiedPromptsArray = json.dumps(logs)

print("Logs: ")
print(logs)

prompts = [
    {
    "role": "user",
    "content": stringifiedPromptsArray
    }
]

batchInstruction = {
    "role": "system",
    "content": "Explain what happened for each log line of the array. Return a python array of the explanation. Only the array, no text around it or any extra comment, nothing else than the array should be in the answer. Don't forget in your completion to give the day, date and year of the log. Interpret some of the log content if you can, for example you have to translate what an error code 500."
}

prompts.append(batchInstruction)
print("ChatGPT: ")


# 활용가능한 Open API key를 선언합니다.
openai_api_key = "sk-DeopQ8PE****************************************"

# OpenAI 클라이언트를 초기화 합니다. 
openai.api_key = openai_api_key

stringifiedBatchCompletion = openai.chat.completions.create(model="gpt-3.5-turbo", messages=prompts, max_tokens=1000)
print(stringifiedBatchCompletion.choices[0].message.content)
batchCompletion = ast.literal_eval(stringifiedBatchCompletion.choices[0].message.content)

#batchCompletion

Logs: 
['57.221.222.167 - - [20/Mar/2024:19:36:59 ] "POST https://www.coleman.net/search/tagshome.asp HTTP/1.1" 404 4171', '133.22.159.34 - - [14/Mar/2024:01:16:01 ] "GET http://www.cruz-williams.net/searchsearch.html HTTP/1.1" 404 1245', '113.6.69.211 - - [27/Jan/2024:11:35:08 ] "TRACE https://www.osborne.com/list/tagssearch.jsp HTTP/1.1" 500 3076', '190.144.229.71 - - [27/Feb/2024:09:36:26 ] "TRACE https://thompson.net/tags/posts/categoriesprivacy.jsp HTTP/1.1" 500 8880', '216.190.45.104 - - [01/Mar/2024:14:01:22 ] "PATCH http://www.morris-dominguez.info/tags/wp-contentregister.php HTTP/1.1" 500 6873', '120.56.41.56 - - [06/Jan/2024:23:12:40 ] "OPTIONS https://www.woodward.info/tags/category/tagspost.html HTTP/1.1" 404 5603', '151.29.88.216 - - [03/Jan/2024:12:16:03 ] "DELETE https://www.singh.com/categoryhome.htm HTTP/1.1" 200 100', '44.235.83.16 - - [26/Jan/2024:20:37:27 ] "OPTIONS http://boone.com/categories/categoryfaq.php HTTP/1.1" 200 7777', '74.166.135.10 - - [20/Mar/2024:17:4

## 로그 벡터화

In [None]:
# 일래스틱서치 접속 정보 셋팅
import getpass
es_cloud_id = getpass.getpass('Enter Elastic Cloud ID: ') 
es_api_id = getpass.getpass('Enter cluster API key ID: ') 
es_api_key = getpass.getpass('Enter cluster API key: ')

# 일래스틱 클라우드 접속
es = Elasticsearch(cloud_id=es_cloud_id, api_key=(es_api_id, es_api_key))

# 인덱스 맵핑 설정
index_config = {
  "mappings": {
    "properties": {
      "description_vectorized": {
        "type": "dense_vector",
        "dims": 768,
        "index": True,
        "similarity": "cosine"
      }
    }
  }
}

# 인덱스 생성
response = es.indices.create(index='logs', body=index_config)

#### ※ 일괄 색인 수행 전 7.4.2 모델저장, 7.4.3 수집파이프라인 생성 단계 수행이 필요합니다. 

In [87]:
# 일괄 색인을 위한 JSON 문서 생성 
bulk_index_body = []
for index, log in enumerate(batchCompletion):
    document = {
        "_index": "logs", 
        "pipeline": "vectorize-log",
        "_source": {
            "text_field": log, "log": logs[index]
        }
    }
    bulk_index_body.append(document)

# 일괄 색인 문서 확인 
print("Bulk request: ")
print(bulk_index_body)

try:
    response = helpers.bulk(es, bulk_index_body)
    print ("\nRESPONSE:", response)
except Exception as e:
    print("\nERROR:", e)


Bulk request: 
[{'_index': 'logs', 'pipeline': 'vectorize-log', '_source': {'text_field': "On 20th March 2024, a POST request was made to 'https://www.coleman.net/search/tagshome.asp' but resulted in a 404 error - Not Found", 'log': '57.221.222.167 - - [20/Mar/2024:19:36:59 ] "POST https://www.coleman.net/search/tagshome.asp HTTP/1.1" 404 4171'}}, {'_index': 'logs', 'pipeline': 'vectorize-log', '_source': {'text_field': "On 14th March 2024, a GET request was made to 'http://www.cruz-williams.net/searchsearch.html' also resulting in a 404 error", 'log': '133.22.159.34 - - [14/Mar/2024:01:16:01 ] "GET http://www.cruz-williams.net/searchsearch.html HTTP/1.1" 404 1245'}}, {'_index': 'logs', 'pipeline': 'vectorize-log', '_source': {'text_field': "On 27th January 2024, a TRACE request to 'https://www.osborne.com/list/tagssearch.jsp' resulted in a 500 error - Internal Server Error", 'log': '113.6.69.211 - - [27/Jan/2024:11:35:08 ] "TRACE https://www.osborne.com/list/tagssearch.jsp HTTP/1.1" 5

## 시맨틱 검색

In [97]:

def ESSearch(query_text):
  # 일래스틱서치 BM25와 kNN의 하이브리드 검색
  query = {
    "bool": {
      "filter": [{
        "exists": {
          "field": "description_vectorized"
        }
      }]
    }
  }

  knn = {
    "field": "description_vectorized",
    "k": 1,
    "num_candidates": 20,
    "query_vector_builder": {
      "text_embedding": {
        "model_id": "sentence-transformers__all-distilroberta-v1",
        "model_text": query_text
      }
    },
    "boost": 24
  }

  fields = ["text_field"]
  index = 'logs'
  resp = es.search(index=index,
                   query=query,
                   knn=knn,
                   fields=fields,
                   size=1,
                   source=False)


  #print(resp['hits']['hits'][0]['fields']['text_field'][0])
  return resp['hits']['hits'][0]['fields']['text_field'][0]


ESSearch("Were there any error in March?")

"On 1st March 2024, a PATCH request to 'http://www.morris-dominguez.info/tags/wp-contentregister.php' resulted in a 500 error"

# 7.4.2 모델저장 - 일래스틱서치에 임베딩 모델 로드하기
#### 임베딩에 필요한 모델을 일래스틱서치에 로드합니다.

In [None]:
# 필요시 관련 라이브러리 설치가 필요합니다.  
from pathlib import Path
from eland.ml.pytorch import PyTorchModel
from eland.ml.pytorch.transformers import TransformerModel


hf_model_id='sentence-transformers/all-distilroberta-v1'
tm = TransformerModel(model_id=hf_model_id, task_type="text_embedding")

# 일래스틱서치에서 이름으로 사용할 modelID 설정
es_model_id = tm.elasticsearch_model_id()

# 허깅 페이스에서 모델 다운로드
tmp_path = "models"
Path(tmp_path).mkdir(parents=True, exist_ok=True)
model_path, config, vocab_path = tm.save(tmp_path)

# 일래스틱서치에 모델 저장
ptm = PyTorchModel(es, es_model_id)
ptm.import_model(model_path=model_path, config_path=None, vocab_path=vocab_path, config=config)


In [46]:
from elasticsearch import Elasticsearch
from elasticsearch.client import MlClient

# 모델을 사용가능한 상태로 배포합니다. 
s = MlClient.start_trained_model_deployment(es, model_id=es_model_id)

In [95]:
# 임베딩 모델이 정상 동작하는지 확인합니다. 
docs = [
    {
        "text_field": "Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app."
    }
]

z = MlClient.infer_trained_model(es, model_id=es_model_id, docs=docs)
doc_0_vector = z['inference_results'][0]['predicted_value']
doc_0_vector

[-0.021473683416843414,
 -0.020153144374489784,
 -0.0673813745379448,
 -0.014762161299586296,
 0.0048216055147349834,
 0.06022338196635246,
 0.02268715389072895,
 -0.0074721998535096645,
 0.01231254544109106,
 -0.026503419503569603,
 -0.004034925252199173,
 -0.027102677151560783,
 0.03775620460510254,
 -0.035329099744558334,
 -0.03595760464668274,
 0.04847604036331177,
 0.06781148910522461,
 -0.03396942839026451,
 0.006926665082573891,
 0.06797844171524048,
 0.047013361006975174,
 -0.028112638741731644,
 -0.03463795408606529,
 0.02090543881058693,
 0.011902746744453907,
 -0.033433735370635986,
 0.04799216240644455,
 0.00035042394301854074,
 0.00023658713325858116,
 -0.02467566914856434,
 0.03689796105027199,
 -0.023514190688729286,
 -0.0036989047657698393,
 0.04288441315293312,
 -0.05215068534016609,
 -0.008066058158874512,
 0.01793322153389454,
 0.0031528400722891092,
 0.01756146177649498,
 -0.01003781147301197,
 0.05079245939850807,
 -0.034011706709861755,
 0.007326971739530563,
 0.0

# 7.4.3 수집파이프라인 생성
#### 아래의 일래스틱서치 API를 키바나에서 수행하여 일괄색인 API에서 활용되는 vectorize-log 파이프라인을 정의합니다. 

In [None]:
#PUT _ingest/pipeline/vectorize-log
#{
#  "description": "ingest pipe for chapter 7",
#    "processors": [
#    {
#      "inference": {
#        "model_id": "sentence-transformers__all-distilroberta-v1",
#        "target_field": "description_vectorized"
#      }
#    },
#    {
#      "set": {
#        "field": "description_vectorized",
#        "copy_from": "description_vectorized.predicted_value"
#      }
#    }
#  ]
#}