In [18]:
import uuid
import pandas as pd
import json

# Name of the case directory (relative to the notebook) that this cell loads
# its "logs" and "tasks" parquet files from.
case_name = "normal_6"
# Columns that conv_uuid() converts from raw bytes into uuid.UUID objects.
uuid_cols_name = ["job_id", "task_id"]

def conv_uuid(target):
    """Load ``<target>.parquet`` and decode its UUID columns.

    Each column whose name appears in the module-level ``uuid_cols_name`` list
    has every value passed through ``uuid.UUID(bytes=...)``; all other columns
    are returned exactly as read.

    Args:
        target: Path of the parquet file without the ``.parquet`` suffix.

    Returns:
        The loaded DataFrame with the listed columns converted.
    """
    frame = pd.read_parquet(f'{target}.parquet', engine='pyarrow')
    # Resolve the columns to convert up front, then rewrite them one by one.
    uuid_columns = [name for name in frame.columns if name in uuid_cols_name]
    for name in uuid_columns:
        frame[name] = frame[name].apply(lambda raw: uuid.UUID(bytes=raw))
    return frame

def unlimit_print():
    """Set pandas' row, column, width and column-width display options to ``None``."""
    for option in (
        'display.max_rows',
        'display.max_columns',
        'display.width',
        'display.max_colwidth',
    ):
        pd.set_option(option, None)


def limit_print():
    """Reset pandas' row, column, width and column-width display options to their defaults."""
    for option in (
        'display.max_rows',
        'display.max_columns',
        'display.width',
        'display.max_colwidth',
    ):
        pd.reset_option(option)

# Lift pandas' display limits while this cell runs. The ``finally`` clause
# restores the defaults even if loading or merging raises, so a failed run does
# not leave the kernel with unlimited display settings.
unlimit_print()
try:
    # Load the log and task tables of the selected case.
    log_df = conv_uuid(f"./{case_name}/logs")
    task_df = conv_uuid(f"./{case_name}/tasks")
    print(f"total log count: {len(log_df)}")

    # Keep only rows that carry a non-nil job id and whose destination port is not 53.
    log_df = log_df[
        (log_df["job_id"] != uuid.UUID("00000000-0000-0000-0000-000000000000")) &
        (log_df["dst_port"] != 53)]

    # Attach the task columns to every remaining log row and keep the columns
    # that the comparison in the next cell reads.
    tl_df = pd.merge(log_df, task_df, on=['job_id', 'task_id'], how='inner')[[
        'task_id',
        'src_endpoint', 'dst_endpoint',
        'range_begin', 'range_end',
        'src_ip', 'src_port',
        'dst_ip', 'dst_port',
        'packet_size',
    ]]
    display(len(tl_df), len(task_df), len(log_df))
    print(f"remained log count: {len(log_df)}")
finally:
    limit_print()

total log count: 6210


2254

100

2254

remained log count: 2254


In [19]:
import re
import json

def normalize_range(port_range_str):
    """Split a ``"<begin>_to_<end>"`` string into a pair of integers.

    Args:
        port_range_str: Text holding two integers separated by ``_to_``.

    Returns:
        A ``(begin, end)`` tuple of ints.

    Raises:
        ValueError: If the text does not contain exactly two integer parts.
    """
    begin_text, end_text = port_range_str.split('_to_')
    return int(begin_text), int(end_text)

# Rows whose dst_ip equals this address are skipped by the check below.
source = "10.2.0.11"
# dst_ip whose rows are compared against the objects in normal-bucket.json.
normal_dst = "54.180.115.209"
# dst_ip whose rows are compared against the objects in malicious-bucket.json.
mal_dst = "15.164.214.122"

# Object names look like "<job_id>.<task_id>.<ipv4>.<begin>_to_<end>.gzip".
object_name_re = re.compile(
    r"^([0-9a-f\-]{36})\.([0-9a-f\-]{36})\.((?:\d{1,3}\.){3}\d{1,3})\.(\d+_to_\d+)\.gzip$"
)

# Index the objects of each bucket by task id (kept as the string found in the name).
normal_data = {}
mal_data = {}
for bucket_file, bucket_index in (
    ("normal-bucket.json", normal_data),
    ("malicious-bucket.json", mal_data),
):
    with open(f"./{case_name}/{bucket_file}", "r", encoding='utf-8') as f:
        for obj in json.load(f)["metadata"]:
            match = object_name_re.match(obj["object_name"])
            # The match must be checked BEFORE unpacking: re.match returns None
            # for names that do not follow the layout, and None has no groups().
            if match is None:
                continue
            job_id, task_id, ip_addr, data_range = match.groups()
            data_range = normalize_range(data_range)
            bucket_index[task_id] = {
                "ip": ip_addr,
                "data_begin": data_range[0],
                "data_end": data_range[1],
            }

correct_cnt = 0
tot_cnt = len(tl_df)
skipped_cnt = 0
res = {}
for _, log in tl_df.iterrows():
    # Not of interest: rows addressed to the source itself.
    if log["dst_ip"] == source:
        skipped_cnt += 1
        continue

    # Pick the bucket index that matches the row's destination.
    if log["dst_ip"] == normal_dst:
        res = normal_data
    elif log["dst_ip"] == mal_dst:
        res = mal_data
    else:
        print("=" * 50, "\nAnomaly Occurred:",)
        print(log)
        continue

    task_id = str(log["task_id"])
    # A task without a bucket object is reported as a mismatch instead of
    # aborting the whole loop with a KeyError.
    expected = res.get(task_id)
    if expected is not None \
        and log["src_ip"] == expected["ip"] \
        and int(log["range_begin"]) == expected["data_begin"] \
        and int(log["range_end"]) == expected["data_end"]:
        correct_cnt += 1
        continue

    print("Unmatch Log Exists: ", log, "\n\twith: ", expected)
# Some misses exist: they occur because the task context is applied slightly
# late while an RPC response is being served. Those rows should have carried
# task id 0; because the update lagged, they were attributed to the same task
# context. They can be recognised by the source port being the RPC serving port.
print(f"skipped_cnt: {skipped_cnt} / meaningful cnt: {tot_cnt - skipped_cnt} /  correct_cnt: {correct_cnt} / tot_cnt: {tot_cnt}")
meaningful_cnt = tot_cnt - skipped_cnt
if meaningful_cnt:
    print("Accuracy: ", (correct_cnt / meaningful_cnt) * 100, "%")
else:
    # Every row was skipped (or there were none): avoid dividing by zero.
    print("Accuracy: ", "n/a (no meaningful logs)")


Anomaly Occurred:
task_id                      2126eba5-869d-406c-b97e-2973d8d4cd6b
src_endpoint                   minio.minio-s.svc.cluster.local:80
dst_endpoint    ec2-15-164-214-122.ap-northeast-2.compute.amaz...
range_begin                                                  4500
range_end                                                    4600
src_ip                                                 10.2.2.139
src_port                                                     8080
dst_ip                                                 10.2.1.161
dst_port                                                    48848
packet_size                                                    66
Name: 768, dtype: object
Anomaly Occurred:
task_id                      7fbb96d5-1aeb-41cd-881a-914f071775d9
src_endpoint                   minio.minio-s.svc.cluster.local:80
dst_endpoint    ec2-15-164-214-122.ap-northeast-2.compute.amaz...
range_begin                                                  6100
range_end      

In [81]:
import os
import shutil
from PIL import Image
from itertools import cycle

# Define paths and base output directory
base_image_paths = [f"images/{i}.jpg" for i in range(10)]
output_dir = "out_images"
os.makedirs(output_dir, exist_ok=True)

# Fail before copying anything if a base image is missing, rather than stopping
# part-way through with a half-filled output directory.
missing_images = [path for path in base_image_paths if not os.path.isfile(path)]
if missing_images:
    raise FileNotFoundError(f"missing base images: {missing_images}")

# Number of total images desired
total_images = 10000

# Cycle through base images and copy them with new names (1.jpg .. <total_images>.jpg)
image_cycle = cycle(base_image_paths)
for i, src_path in zip(range(1, total_images + 1), image_cycle):
    # Join the directory and file name as separate components so os.path.join
    # actually builds the path (the single-argument form is a no-op).
    dst_path = os.path.join(output_dir, f"{i}.jpg")
    shutil.copy(src_path, dst_path)


In [16]:
import pandas as pd

# Name of the case directory (relative to the notebook) that this cell loads
# its "logs" and "tasks" parquet files from.
case_name = "anomaly_2"
# Columns that conv_uuid() converts from raw bytes into uuid.UUID objects.
uuid_cols_name = ["job_id", "task_id"]

def conv_uuid(target):
    """Load ``<target>.parquet`` and decode its UUID columns.

    Each column whose name appears in the module-level ``uuid_cols_name`` list
    has every value passed through ``uuid.UUID(bytes=...)``; all other columns
    are returned exactly as read.

    Args:
        target: Path of the parquet file without the ``.parquet`` suffix.

    Returns:
        The loaded DataFrame with the listed columns converted.
    """
    frame = pd.read_parquet(f'{target}.parquet', engine='pyarrow')
    # Resolve the columns to convert up front, then rewrite them one by one.
    uuid_columns = [name for name in frame.columns if name in uuid_cols_name]
    for name in uuid_columns:
        frame[name] = frame[name].apply(lambda raw: uuid.UUID(bytes=raw))
    return frame

def unlimit_print():
    """Set pandas' row, column, width and column-width display options to ``None``."""
    for option in (
        'display.max_rows',
        'display.max_columns',
        'display.width',
        'display.max_colwidth',
    ):
        pd.set_option(option, None)


def limit_print():
    """Reset pandas' row, column, width and column-width display options to their defaults."""
    for option in (
        'display.max_rows',
        'display.max_columns',
        'display.width',
        'display.max_colwidth',
    ):
        pd.reset_option(option)

# Lift pandas' display limits so the whole summary is printed. The ``finally``
# clause restores the defaults even if loading or aggregating raises.
unlimit_print()
try:
    # Load the log and task tables of the selected case.
    log_df = conv_uuid(f"./{case_name}/logs")
    task_df = conv_uuid(f"./{case_name}/tasks")
    print(f"total log count: {len(log_df)}")

    # Keep only rows that carry a non-nil job id and whose destination port is not 53.
    log_df = log_df[
        (log_df["job_id"] != uuid.UUID("00000000-0000-0000-0000-000000000000")) &
        (log_df["dst_port"] != 53)]

    # Per (job, task): time between the first and last log row, and the number
    # of distinct destination addresses. Series.max/min are used instead of the
    # built-in max/min, which iterate element by element in Python and compare
    # missing timestamps unreliably.
    print(log_df.groupby(["job_id", "task_id"]).agg({
        "timestamp": lambda stamps: stamps.max() - stamps.min(),
        "dst_ip": "nunique",
    }))
finally:
    limit_print()

total log count: 556453
                                                                                       timestamp  \
job_id                               task_id                                                       
ca3b703b-cbc0-429d-bf0e-9163380b183f 00887a01-280c-4dcc-8502-4a2fe051284b 0 days 00:00:15.451628   
                                     011d42a5-47ab-4e7e-b9c9-92f253a69c87 0 days 00:00:15.523471   
                                     02780ea0-0b57-4743-909d-25cfe7cb9a7a 0 days 00:00:16.810667   
                                     0445295f-4d7a-44f1-903c-0258eb66ac3b 0 days 00:00:17.485103   
                                     101c9584-fea4-43c8-9b07-274405490f5a 0 days 00:00:15.965396   
                                     10c3a31f-4969-4c2e-8312-6abead596b36 0 days 00:00:16.564341   
                                     1408a43b-3c89-405b-b90f-8f225b26a43c 0 days 00:00:15.387129   
                                     175d9c2e-8606-4a94-950c-344610fd3f08 0 

In [39]:
import uuid
import pandas as pd
import json
import re


# Prefix of the case directory names; the driver loop at the end of this cell
# appends "_<n>" to it.
base_name = "anomaly"
# Host address; not referenced by the code visible in this cell.
source = "10.2.0.11"
# dst_ip that test() records as a task's "normal" traffic.
normal_dst = "54.180.115.209"
# dst_ip that test() records as a task's "anomaly" traffic.
mal_dst = "15.164.214.122"
# Columns that conv_uuid() converts from raw bytes into uuid.UUID objects.
uuid_cols_name = ["job_id", "task_id"]


def conv_uuid(target):
    """Load ``<target>.parquet`` and decode its UUID columns.

    Each column whose name appears in the module-level ``uuid_cols_name`` list
    has every value passed through ``uuid.UUID(bytes=...)``; all other columns
    are returned exactly as read.

    Args:
        target: Path of the parquet file without the ``.parquet`` suffix.

    Returns:
        The loaded DataFrame with the listed columns converted.
    """
    frame = pd.read_parquet(f'{target}.parquet', engine='pyarrow')
    # Resolve the columns to convert up front, then rewrite them one by one.
    uuid_columns = [name for name in frame.columns if name in uuid_cols_name]
    for name in uuid_columns:
        frame[name] = frame[name].apply(lambda raw: uuid.UUID(bytes=raw))
    return frame


def normalize_range(port_range_str):
    """Split a ``"<begin>_to_<end>"`` string into a pair of integers.

    Args:
        port_range_str: Text holding two integers separated by ``_to_``.

    Returns:
        A ``(begin, end)`` tuple of ints.

    Raises:
        ValueError: If the text does not contain exactly two integer parts.
    """
    begin_text, end_text = port_range_str.split('_to_')
    return int(begin_text), int(end_text)

def _load_bucket_index(path):
    """Index the objects listed in a bucket metadata JSON file by task id.

    Args:
        path: Path of a JSON file whose ``"metadata"`` entry is a list of
            objects, each with an ``"object_name"`` of the form
            ``<job_id>.<task_id>.<ipv4>.<begin>_to_<end>.gzip``.

    Returns:
        A dict mapping the task id string to
        ``{"ip": ..., "data_begin": ..., "data_end": ...}``. Object names that
        do not follow the layout are ignored.
    """
    pattern = re.compile(
        r"^([0-9a-f\-]{36})\.([0-9a-f\-]{36})\.((?:\d{1,3}\.){3}\d{1,3})\.(\d+_to_\d+)\.gzip$"
    )
    index = {}
    with open(path, "r", encoding='utf-8') as f:
        for obj in json.load(f)["metadata"]:
            match = pattern.match(obj["object_name"])
            # Check the match BEFORE unpacking: re.match returns None for a
            # non-conforming name, and None has no groups().
            if match is None:
                continue
            _job_id, task_id, ip_addr, data_range = match.groups()
            data_begin, data_end = normalize_range(data_range)
            index[task_id] = {"ip": ip_addr, "data_begin": data_begin, "data_end": data_end}
    return index


def test(case_name):
    """Check one case's logs and bucket listings against its task table.

    For every task in ``./<case_name>/tasks.parquet`` the function verifies that
    traffic to ``normal_dst`` was logged and an object exists in the normal
    bucket. Tasks flagged ``run_as_evil`` must additionally have traffic to
    ``mal_dst`` and an object in the malicious bucket; all other tasks must
    have neither. Each violation is printed with its category, and the number
    of violating tasks is printed at the end.

    Args:
        case_name: Name of the case directory to read from.

    Returns:
        None. Results are reported through ``print``.
    """
    log_df = conv_uuid(f"./{case_name}/logs")
    task_df = conv_uuid(f"./{case_name}/tasks")
    print(f"total log count: {len(log_df)}")
    # Drop rows with a nil job id, rows to destination port 53, and rows sent
    # from source port 8080 (presumably the RPC serving port - TODO confirm).
    log_df = log_df[
        (log_df["job_id"] != uuid.UUID("00000000-0000-0000-0000-000000000000")) &
        (log_df["dst_port"] != 53) &
        (log_df["src_port"] != 8080)]
    print(f"remained log count: {len(log_df)}")

    normal_data = _load_bucket_index(f"./{case_name}/normal-bucket.json")
    mal_data = _load_bucket_index(f"./{case_name}/malicious-bucket.json")

    # Task ids seen sending to each destination. Set membership replaces the
    # row-by-row flag dictionary and also copes with tasks that have no log
    # rows at all (previously a KeyError).
    normal_tasks = set(log_df.loc[log_df["dst_ip"] == normal_dst, "task_id"])
    anomaly_tasks = set(log_df.loc[log_df["dst_ip"] == mal_dst, "task_id"])

    err_cnt = 0
    for _, task_info in task_df.iterrows():
        is_anomaly = bool(task_info["run_as_evil"])
        task_id = task_info["task_id"]
        # Bucket indexes are keyed by the textual task id taken from the object name.
        task_key = str(task_id)

        # Only the first failing check is reported for a task.
        if task_id not in normal_tasks:
            reason = "general_log"
        elif task_key not in normal_data:
            reason = "general_dst"
        elif is_anomaly and task_id not in anomaly_tasks:
            reason = "anomaly_log"
        elif is_anomaly and task_key not in mal_data:
            reason = "anomaly_dst"
        elif not is_anomaly and task_id in anomaly_tasks:
            reason = "normal_log"
        elif not is_anomaly and task_key in mal_data:
            reason = "normal_dst"
        else:
            continue

        print(f"({reason}) Error on", task_id)
        err_cnt += 1
    print(err_cnt)

# Run the check for the cases "<base_name>_2" through "<base_name>_6".
for cname in (f"{base_name}_{case_no}" for case_no in range(2, 7)):
    print(f"Current: {cname} ==================")
    test(cname)

total log count: 556453
remained log count: 393057
0
total log count: 536163
remained log count: 372727
0
total log count: 523746
remained log count: 360324
0
total log count: 518144
remained log count: 354812
0
total log count: 529131
remained log count: 365733
0
