# Import Library

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from google.colab import drive
from tqdm import tqdm
import lightgbm as lgb

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



# Data Load & EDA

In [2]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
path = 'drive/MyDrive/로그 분석을 통한 보안 위험도 예측 AI 경진대회/'
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')
sample_submission = pd.read_csv(path + 'sample_submission.csv')

print("Train shape:", train.shape)
print("Test shape:", test.shape)

Train shape: (472972, 3)
Test shape: (1418916, 2)


In [4]:
train.head()
test.head()

Unnamed: 0,id,full_log
0,1000000,"Feb 8 15:47:26 localhost kibana: {""type"":""err..."
1,1000001,"Sep 24 03:46:39 localhost kibana: {""type"":""err..."
2,1000002,type=SYSCALL msg=audit(1611888200.428:210563):...
3,1000003,"Jan 18 11:24:06 localhost kibana: {""type"":""err..."
4,1000004,type=SYSCALL msg=audit(1603081202.050:46851): ...


In [5]:
train['level'].value_counts()

Unnamed: 0_level_0,count
level,Unnamed: 1_level_1
0,334065
1,132517
3,4141
5,2219
2,12
4,10
6,8


# Data Preprocessing

In [6]:
# 중복 데이터 제거
train_before = train.shape[0]
train = train.drop_duplicates(subset=['full_log'])
print(f"Train 데이터 중 {train_before - train.shape[0]}개의 중복 로그가 제거되었습니다.")

# 텍스트 전처리 함수
def preprocess_text(text):
    text = re.sub(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}\b", 'IP:Port', text)
    text = re.sub(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", 'IP', text)
    text = re.sub(r'[^a-zA-Z가-힣\s]', '', text)
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# 전처리 적용
tqdm.pandas()
train['full_log'] = train['full_log'].progress_apply(preprocess_text)
test['full_log'] = test['full_log'].progress_apply(preprocess_text)

# 전처리 확인
print("전처리 후 훈련 데이터 샘플:")
print(train['full_log'].head())
print("전처리 후 테스트 데이터 샘플:")
print(test['full_log'].head())

Train 데이터 중 68836개의 중복 로그가 제거되었습니다.


100%|██████████| 404136/404136 [00:22<00:00, 18291.13it/s]
100%|██████████| 1418916/1418916 [01:17<00:00, 18395.23it/s]


전처리 후 훈련 데이터 샘플:
0    Sep localhost kibana typeerrortimestampTZtagsw...
1    Feb localhost logstash TINFO logstashoutputsel...
2    Jan localhost kibana typeerrortimestampTZtagsw...
3    Jan localhost kibana typeerrortimestampTZtagsw...
4    typeSYSCALL msgaudit archce syscall successyes...
Name: full_log, dtype: object
전처리 후 테스트 데이터 샘플:
0    Feb localhost kibana typeerrortimestampTZtagsw...
1    Sep localhost kibana typeerrortimestampTZtagsw...
2    typeSYSCALL msgaudit archce syscall successyes...
3    Jan localhost kibana typeerrortimestampTZtagsw...
4    typeSYSCALL msgaudit archce syscall successyes...
Name: full_log, dtype: object


# Feature Engineering & Data Split

In [7]:
# TF-IDF 벡터화
tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1,3)
)

X = train['full_log']
y = train['level']

# 훈련/검증 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF 피처 생성
tfidf.fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(test['full_log'])

# Model Training

In [12]:
model = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=7,
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    class_weight='balanced'
)

model.fit(
    X_train_tfidf, y_train,
    eval_set=[(X_val_tfidf, y_val)],
    eval_metric='multi_logloss',
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(100)
    ]
)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 18.117692 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 197595
[LightGBM] [Info] Number of data points in the train set: 323308, number of used features: 8952
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
Training until validation scores don't improve for 50 rounds
[100]	valid_0's multi_logloss: 0.003394
Early stopping, best iteration is:
[121]	valid_0's multi_logloss: 0.0031049


# Model Evaluation

In [13]:
y_pred_val = model.predict(X_val_tfidf)
print("Validation Classification Report:")
print(classification_report(y_val, y_pred_val))



Validation Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     54235
           1       1.00      1.00      1.00     25417
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00       827
           4       0.67      1.00      0.80         2
           5       0.97      0.99      0.98       343
           6       1.00      0.50      0.67         2

    accuracy                           1.00     80828
   macro avg       0.95      0.93      0.92     80828
weighted avg       1.00      1.00      1.00     80828



# Prediction & Submission

In [14]:
# 테스트 데이터 예측
test_preds_proba = model.predict_proba(X_test_tfidf)
test_preds = np.argmax(test_preds_proba, axis=1)
test_preds_proba_max = np.max(test_preds_proba, axis=1)

# 레벨 7 할당
threshold = 0.5
test_preds_final = test_preds.copy()
test_preds_final[test_preds_proba_max < threshold] = 7

# 최종 예측값 조정
test_preds_final = np.where(
    test_preds_final < 7, test_preds_final, 7
)

# 제출 파일 생성
submission = sample_submission.copy()
submission['level'] = test_preds_final
submission.to_csv('submission.csv', index=False)

print("Submission 파일이 성공적으로 생성되었습니다.")



Submission 파일이 성공적으로 생성되었습니다.
