In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from autogluon.multimodal import MultiModalPredictor
from sklearn.utils.class_weight import compute_class_weight

# Install a global filter that suppresses every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below — confirm that is intended.
warnings.filterwarnings('ignore')


In [2]:
# Load the preprocessed train/test tables from CSV.
# Fail fast: the later cells read the 'label' and 'ID' columns of these frames,
# so substituting empty DataFrames would only turn one clear I/O error into a
# chain of misleading follow-up errors. The error is reported and re-raised.
try:
    final_train = pd.read_csv('../../data/preprocessed_data/final_train.csv')
    final_test = pd.read_csv('../../data/preprocessed_data/final_test.csv')
except Exception as e:
    print("데이터 로드 에러:", e)
    raise
print("데이터 로드 성공.")


데이터 로드 성공.


In [3]:
# Declare how each column should be interpreted (text / categorical / numerical).
# A plain dict literal cannot raise, so it needs no try/except and no
# empty-dict fallback.
column_types = {
    'URL': 'text',
    'label': 'categorical',
    'digit_ratio': 'numerical',
    'special_char_count': 'numerical',
    'subdomain_count': 'numerical',
    'length': 'numerical',
}
print("컬럼 타입 설정 성공.")


컬럼 타입 설정 성공.


In [4]:
# Derive 'balanced' per-class weights from the 'label' column of the training
# data, then rescale them so that they add up to 1.
try:
    weights = compute_class_weight(
        'balanced',
        classes=np.unique(final_train['label']),
        y=final_train['label'].values,
    )
    # Normalise to a sum of 1 and turn the array into a list in one step.
    weights = list(weights / weights.sum())
    print("\n계산된 클래스 가중치:", weights)
except Exception as e:
    # Any failure is reported and replaced by an empty weight list.
    print("클래스 가중치 계산 에러:", e)
    weights = []



계산된 클래스 가중치: [0.2237147207970887, 0.7762852792029113]


In [5]:
# Build the AutoGluon MultiModalPredictor for a binary task on the 'label'
# column, using ROC AUC for both evaluation and validation.
# If construction fails, the error is printed and `predictor` is set to None.
try:
    predictor = MultiModalPredictor(
        label='label',
        problem_type='binary',
        eval_metric='roc_auc',
        validation_metric='roc_auc',
    )
except Exception as e:
    print("MultiModalPredictor 생성 에러:", e)
    predictor = None
else:
    print("Predictor 생성 성공.")


Predictor 생성 성공.


In [6]:
# Train the model on the training data; training is skipped when no
# predictor is available. Any error raised during fitting is printed.
try:
    if predictor is None:
        print("Predictor가 None입니다. 모델 학습 건너뜁니다.")
    else:
        predictor.fit(
            train_data=final_train,
            column_types=column_types,
            presets='best_quality',
            time_limit=None,  # no time limit on training
            seed=42,
            hyperparameters={
                # Checkpoint name for the text backbone.
                "model.hf_text.checkpoint_name": "r3ddkahili/final-complete-malicious-url-model",
                "env.per_gpu_batch_size": 64,
                "optimization.patience": 5,
                # Focal loss, with the `weights` list supplied as its alpha.
                "optimization.loss_function": "focal_loss",
                "optimization.focal_loss.alpha": weights,
            },
        )
        print("모델 학습 완료.")
except Exception as e:
    print("predictor.fit 실행 중 에러:", e)


No path specified. Models will be saved in: "AutogluonModels/ag-20250224_031026"
AutoGluon Version:  1.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #53~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Jan 15 19:18:46 UTC 2
CPU Count:          24
Pytorch Version:    2.5.1+cu124
CUDA Version:       12.4
Memory Avail:       52.25 GB / 62.57 GB (83.5%)
Disk Space Avail:   1530.74 GB / 1831.76 GB (83.6%)

AutoMM starts to create your model. ✨✨✨

To track the learning progress, you can open a terminal and launch Tensorboard:
    ```shell
    # Assume you have installed tensorboard
    tensorboard --logdir /home/lh/Documents/Malicious.URL.Detector/notebooks/inference/AutogluonModels/ag-20250224_031026
    ```

Seed set to 42
GPU Count: 1
GPU Count to be Used: 1
GPU 0 Name: NVIDIA GeForce RTX 3090 Ti
GPU 0 Memory: 0.61GB/23.99GB (Used/Total)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0, global step 27051: 'val_roc_auc' reached 0.97931 (best 0.97931), saving model to '/home/lh/Documents/Malicious.URL.Detector/notebooks/inference/AutogluonModels/ag-20250224_031026/epoch=0-step=27051.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0, global step 54102: 'val_roc_auc' reached 0.98090 (best 0.98090), saving model to '/home/lh/Documents/Malicious.URL.Detector/notebooks/inference/AutogluonModels/ag-20250224_031026/epoch=0-step=54102.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1, global step 81154: 'val_roc_auc' reached 0.98175 (best 0.98175), saving model to '/home/lh/Documents/Malicious.URL.Detector/notebooks/inference/AutogluonModels/ag-20250224_031026/epoch=1-step=81154.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1, global step 108205: 'val_roc_auc' reached 0.98264 (best 0.98264), saving model to '/home/lh/Documents/Malicious.URL.Detector/notebooks/inference/AutogluonModels/ag-20250224_031026/epoch=1-step=108205.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 135257: 'val_roc_auc' reached 0.98208 (best 0.98264), saving model to '/home/lh/Documents/Malicious.URL.Detector/notebooks/inference/AutogluonModels/ag-20250224_031026/epoch=2-step=135257.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 162308: 'val_roc_auc' reached 0.98265 (best 0.98265), saving model to '/home/lh/Documents/Malicious.URL.Detector/notebooks/inference/AutogluonModels/ag-20250224_031026/epoch=2-step=162308.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3, global step 189360: 'val_roc_auc' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3, global step 216411: 'val_roc_auc' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4, global step 243463: 'val_roc_auc' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4, global step 270514: 'val_roc_auc' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5, global step 297566: 'val_roc_auc' was not in top 3
Start to fuse 3 checkpoints via the greedy soup algorithm.


Predicting: |          | 0/? [00:00<?, ?it/s]

Predicting: |          | 0/? [00:00<?, ?it/s]

Predicting: |          | 0/? [00:00<?, ?it/s]

AutoMM has created your model. 🎉🎉🎉

To load the model, use the code below:
    ```python
    from autogluon.multimodal import MultiModalPredictor
    predictor = MultiModalPredictor.load("/home/lh/Documents/Malicious.URL.Detector/notebooks/inference/AutogluonModels/ag-20250224_031026")
    ```

If you are not satisfied with the model, try to increase the training time, 
adjust the hyperparameters (https://auto.gluon.ai/stable/tutorials/multimodal/advanced_topics/customization.html),
or post issues on GitHub (https://github.com/autogluon/autogluon/issues).




모델 학습 완료.


In [7]:
# Score the test data with the trained model and keep, for every row, the
# predicted probability of class 1 (malicious URL).
#
# Failures are raised instead of being replaced by an all-zero score vector:
# zeros would still be written to the submission file by the next cell and
# would be indistinguishable from a real result.
if predictor is None:
    raise RuntimeError("Predictor가 None입니다. 예측을 수행할 수 없습니다.")

try:
    test_pred_proba = predictor.predict_proba(final_test)
    # Prefer the column labelled 1; otherwise take the second column by position.
    if 1 in test_pred_proba.columns:
        prediction_scores = test_pred_proba[1]
    else:
        prediction_scores = test_pred_proba.iloc[:, 1]
except Exception as e:
    print("예측 실행 중 에러:", e)
    raise
print("예측 완료.")


Predicting: |          | 0/? [00:00<?, ?it/s]

예측 완료.


In [8]:
# Build the submission table (ID + predicted probability) and write it to CSV.
# Any error while building or writing the file is printed.
try:
    submission_path = '../../submission/FE_multimodal2.csv'
    submission = pd.DataFrame({
        'ID': final_test['ID'],
        'probability': prediction_scores,
    })
    # to_csv does not create missing folders, so make sure the target
    # directory exists before writing (e.g. on a fresh checkout).
    os.makedirs(os.path.dirname(submission_path), exist_ok=True)
    submission.to_csv(submission_path, index=False)
    print("\n제출 파일 생성 완료.")
except Exception as e:
    print("제출 파일 생성 에러:", e)



제출 파일 생성 완료.
