0: 환경 준비 & Google Drive 마운트

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import GroupShuffleSplit
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score,
    average_precision_score,
    recall_score
)

1: 데이터 로드 (assault + fight 통합)

In [3]:
BASE = "/content/drive/MyDrive/ProcessedDataset"

def load_class(cls):
    X = np.load(os.path.join(BASE, cls, "X.npy"))
    y = np.load(os.path.join(BASE, cls, "y.npy"))
    g = np.load(os.path.join(BASE, cls, "groups.npy"))
    return X, y, g

Xa, ya, ga = load_class("assault")
Xf, yf, gf = load_class("fight")

X = np.concatenate([Xa, Xf], axis=0)
y = np.concatenate([ya, yf], axis=0)
groups = np.concatenate([ga, gf], axis=0)

print("Total X shape:", X.shape)
print("Total y shape:", y.shape)


Total X shape: (2160, 149, 5)
Total y shape: (2160,)


2: 시나리오 단위 Group Split

In [4]:
gss = GroupShuffleSplit(test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups))

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

print("Train windows:", len(y_train))
print("Test windows :", len(y_test))

Train windows: 1728
Test windows : 432


3: Train / Test 분포 출력

In [5]:
print("\n[Train distribution]")
print("Positive:", int(y_train.sum()),
      "\nNegative:", len(y_train) - int(y_train.sum()),
      "\nRatio:", y_train.mean())

print("\n[Test distribution]")
print("Positive:", int(y_test.sum()),
      "\nNegative:", len(y_test) - int(y_test.sum()),
      "\nRatio:", y_test.mean())


[Train distribution]
Positive: 576 
Negative: 1152 
Ratio: 0.3333333333333333

[Test distribution]
Positive: 144 
Negative: 288 
Ratio: 0.3333333333333333


추가: 학습 곡선 (Loss)

In [6]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, average_precision_score, precision_score, recall_score, f1_score

def set_seed(seed=42):
    np.random.seed(seed)
    tf.random.set_seed(seed)

def build_model(units=64, dropout=0.3, input_shape=(149,5)):
    model = Sequential([
        LSTM(units, return_sequences=True, input_shape=input_shape),
        Dropout(dropout),
        LSTM(units//2),
        Dropout(dropout),
        Dense(1, activation="sigmoid")
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

def eval_at_threshold(y_true, y_prob, thr):
    y_pred = (y_prob >= thr).astype(int)
    return {
        "thr": thr,
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "recall": float(recall_score(y_true, y_pred)),
        "f1": float(f1_score(y_true, y_pred, zero_division=0)),
        "cm": confusion_matrix(y_true, y_pred)
    }

# ===== 튜닝 설정 =====
UNITS_LIST = [32, 64, 128]          # 필요하면 128 추가
DROPOUT_LIST = [0.3, 0.5]           # 필요하면 0.5 추가
BATCH_LIST = [16, 32]              # 필요하면 16 추가
POS_W_LIST = [1.5, 3, 5]       # balanced(현재) + 강화

THR_CANDIDATES = [0.5, 0.4, 0.3, 0.2]  # 보고서에 넣기 좋은 최소 sweep

results = []

for units in UNITS_LIST:
    for dr in DROPOUT_LIST:
        for bs in BATCH_LIST:
            for pos_w in POS_W_LIST:
                set_seed(42)

                model = build_model(units=units, dropout=dr, input_shape=X_train.shape[1:])

                early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

                class_weight = {0: 1.0, 1: float(pos_w)}  # 0은 1로 고정, 1만 조절

                history = model.fit(
                    X_train, y_train,
                    validation_split=0.2,
                    epochs=50,
                    batch_size=bs,
                    class_weight=class_weight,
                    callbacks=[early_stop],
                    verbose=0
                )

                best_epoch = int(np.argmin(history.history["val_loss"]) + 1)

                y_prob = model.predict(X_test, verbose=0).ravel()

                roc_auc = float(roc_auc_score(y_test, y_prob))
                pr_auc = float(average_precision_score(y_test, y_prob))

                # threshold들에서 precision/recall/f1 뽑기
                thr_scores = [eval_at_threshold(y_test, y_prob, thr) for thr in THR_CANDIDATES]

                # 예: "Recall >= 0.9" 중 precision 최대인 threshold 선택
                feasible = [d for d in thr_scores if d["recall"] >= 0.90]
                if feasible:
                    best = max(feasible, key=lambda d: d["precision"])
                    pick_rule = "recall>=0.90, max precision"
                else:
                    best = max(thr_scores, key=lambda d: d["f1"])
                    pick_rule = "max f1 (fallback)"

                results.append({
                    "units": units,
                    "dropout": dr,
                    "batch": bs,
                    "pos_weight": float(pos_w),
                    "best_epoch": best_epoch,
                    "roc_auc": roc_auc,
                    "pr_auc": pr_auc,
                    "picked_thr": best["thr"],
                    "precision": best["precision"],
                    "recall": best["recall"],
                    "f1": best["f1"],
                    "pick_rule": pick_rule,
                    "cm": best["cm"]
                })

# 정렬: (1) recall 높은 것 우선, (2) precision, (3) pr_auc
results_sorted = sorted(results, key=lambda r: (r["recall"], r["precision"], r["pr_auc"]), reverse=True)

print("=== TOP 5 SETTINGS ===")
for r in results_sorted[:5]:
    print(r)

  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__init__(**kwargs)
  super().__in

=== TOP 5 SETTINGS ===
{'units': 128, 'dropout': 0.3, 'batch': 16, 'pos_weight': 3.0, 'best_epoch': 6, 'roc_auc': 0.5135513117283951, 'pr_auc': 0.35109869901808965, 'picked_thr': 0.2, 'precision': 0.33410672853828305, 'recall': 1.0, 'f1': 0.5008695652173913, 'pick_rule': 'recall>=0.90, max precision', 'cm': array([[  1, 287],
       [  0, 144]])}
{'units': 32, 'dropout': 0.3, 'batch': 32, 'pos_weight': 3.0, 'best_epoch': 2, 'roc_auc': 0.5665027006172839, 'pr_auc': 0.3875351981565048, 'picked_thr': 0.4, 'precision': 0.3333333333333333, 'recall': 1.0, 'f1': 0.5, 'pick_rule': 'recall>=0.90, max precision', 'cm': array([[  0, 288],
       [  0, 144]])}
{'units': 64, 'dropout': 0.5, 'batch': 16, 'pos_weight': 1.5, 'best_epoch': 1, 'roc_auc': 0.5730372299382716, 'pr_auc': 0.38201257558293567, 'picked_thr': 0.3, 'precision': 0.3333333333333333, 'recall': 1.0, 'f1': 0.5, 'pick_rule': 'recall>=0.90, max precision', 'cm': array([[  0, 288],
       [  0, 144]])}
{'units': 64, 'dropout': 0.3, 'bat