In [None]:
import requests
import json

# ✅ 최신 NVD API URL (1.0 → 2.0 변경)
url = "https://services.nvd.nist.gov/rest/json/cves/2.0"

# User-Agent 추가 (403 오류 방지)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# API 요청 보내기
response = requests.get(url, headers=headers)

# 데이터가 정상적으로 받아졌는지 확인
if response.status_code == 200:
    cve_data = response.json()  # JSON 형태로 변환

    # JSON 파일로 저장
    with open("cve_data.json", "w", encoding="utf-8") as f:
        json.dump(cve_data, f, indent=4)

    print("✅ CVE 데이터 다운로드 완료! (cve_data.json)")
else:
    print(f"❌ 다운로드 실패! 상태 코드: {response.status_code}")
    print(f"응답 내용: {response.text}")

✅ CVE 데이터 다운로드 완료! (cve_data.json)


In [None]:
import json

# JSON 파일 불러오기
with open("cve_data.json", "r", encoding="utf-8") as f:
    cve_data = json.load(f)

# 첫 번째 CVE 항목 출력
print(json.dumps(cve_data["vulnerabilities"][0], indent=4))

{
    "cve": {
        "id": "CVE-1999-0095",
        "sourceIdentifier": "cve@mitre.org",
        "published": "1988-10-01T04:00:00.000",
        "lastModified": "2024-11-20T23:27:50.607",
        "vulnStatus": "Modified",
        "cveTags": [],
        "descriptions": [
            {
                "lang": "en",
                "value": "The debug command in Sendmail is enabled, allowing attackers to execute commands as root."
            },
            {
                "lang": "es",
                "value": "El comando de depuraci\u00f3n de Sendmail est\u00e1 activado, permitiendo a atacantes ejecutar comandos como root."
            }
        ],
        "metrics": {
            "cvssMetricV2": [
                {
                    "source": "nvd@nist.gov",
                    "type": "Primary",
                    "cvssData": {
                        "version": "2.0",
                        "vectorString": "AV:N/AC:L/Au:N/C:C/I:C/A:C",
                        "baseScore": 10.

In [None]:
import json
import pandas as pd

# JSON 파일 로드
with open("cve_data.json", "r", encoding="utf-8") as f:
    cve_data = json.load(f)

# 전처리할 데이터 리스트
dataset = []

# CVE 데이터에서 필요한 정보만 추출
for item in cve_data["vulnerabilities"]:
    cve_id = item["cve"]["id"]  # CVE ID
    descriptions = item["cve"]["descriptions"]

    # 영어 설명 찾기
    description = next((d["value"] for d in descriptions if d["lang"] == "en"), "No description available")

    # CVSS 점수 추출 (위험도 평가)
    base_score = item.get("metrics", {}).get("cvssMetricV2", [{}])[0].get("cvssData", {}).get("baseScore", "N/A")

    # CWE 취약점 유형 가져오기
    weaknesses = item.get("weaknesses", [])
    cwe = weaknesses[0]["description"][0]["value"] if weaknesses else "N/A"

    # 취약한 소프트웨어 목록
    configurations = item.get("configurations", [])
    affected_software = []
    for config in configurations:
        for node in config.get("nodes", []):
            for match in node.get("cpeMatch", []):
                if match["vulnerable"]:
                    affected_software.append(match["criteria"])

    # 참고 링크 추출
    references = item.get("references", [])
    reference_links = [ref["url"] for ref in references]

    # 데이터 저장
    dataset.append({
        "CVE_ID": cve_id,
        "Description": description,
        "Base_Score": base_score,
        "CWE": cwe,
        "Affected_Software": "; ".join(affected_software),
        "References": "; ".join(reference_links)
    })

# 데이터프레임으로 변환
df = pd.DataFrame(dataset)

# CSV 파일로 저장
df.to_csv("cve_preprocessed.csv", index=False)
print("✅ 데이터 전처리 완료! (cve_preprocessed.csv)")

✅ 데이터 전처리 완료! (cve_preprocessed.csv)


In [None]:
import pandas as pd

# CSV 파일 로드
df = pd.read_csv("cve_preprocessed.csv")

# 상위 5개 데이터 출력
print(df.head())

          CVE_ID                                        Description  \
0  CVE-1999-0095  The debug command in Sendmail is enabled, allo...   
1  CVE-1999-0082      CWD ~root command in ftpd allows root access.   
2  CVE-1999-1471  Buffer overflow in passwd in BSD based operati...   
3  CVE-1999-1122  Vulnerability in restore in SunOS 4.0.3 and ea...   
4  CVE-1999-1467  Vulnerability in rcp on SunOS 4.0.x allows rem...   

   Base_Score  CWE  Affected_Software  References  
0         NaN  NaN                NaN         NaN  
1         NaN  NaN                NaN         NaN  
2         NaN  NaN                NaN         NaN  
3         NaN  NaN                NaN         NaN  
4         NaN  NaN                NaN         NaN  


In [None]:
import pandas as pd

# CSV 파일 로드
df = pd.read_csv("cve_preprocessed.csv")

# NaN 값을 'N/A'로 채움
df.fillna("N/A", inplace=True)

# 수정된 데이터 저장
df.to_csv("cve_preprocessed_fixed.csv", index=False)

# 확인
print(df.head())

          CVE_ID                                        Description  \
0  CVE-1999-0095  The debug command in Sendmail is enabled, allo...   
1  CVE-1999-0082      CWD ~root command in ftpd allows root access.   
2  CVE-1999-1471  Buffer overflow in passwd in BSD based operati...   
3  CVE-1999-1122  Vulnerability in restore in SunOS 4.0.3 and ea...   
4  CVE-1999-1467  Vulnerability in rcp on SunOS 4.0.x allows rem...   

  Base_Score  CWE Affected_Software References  
0        N/A  N/A               N/A        N/A  
1        N/A  N/A               N/A        N/A  
2        N/A  N/A               N/A        N/A  
3        N/A  N/A               N/A        N/A  
4        N/A  N/A               N/A        N/A  


  df.fillna("N/A", inplace=True)


In [None]:
from transformers import AutoTokenizer
import pandas as pd

# CodeBERT 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

# 전처리된 CSV 데이터 로드
df = pd.read_csv("cve_preprocessed_fixed.csv")

# Description 컬럼을 CodeBERT 토큰으로 변환
df["tokenized"] = df["Description"].apply(lambda x: tokenizer(x, padding="max_length", truncation=True, max_length=128, return_tensors="pt")["input_ids"])

# 변환된 데이터 확인
print(df.head())

# CSV로 저장
df.to_csv("cve_tokenized.csv", index=False)
print("✅ 토큰화 완료! (cve_tokenized.csv)")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

          CVE_ID                                        Description  \
0  CVE-1999-0095  The debug command in Sendmail is enabled, allo...   
1  CVE-1999-0082      CWD ~root command in ftpd allows root access.   
2  CVE-1999-1471  Buffer overflow in passwd in BSD based operati...   
3  CVE-1999-1122  Vulnerability in restore in SunOS 4.0.3 and ea...   
4  CVE-1999-1467  Vulnerability in rcp on SunOS 4.0.x allows rem...   

   Base_Score  CWE  Affected_Software  References  \
0         NaN  NaN                NaN         NaN   
1         NaN  NaN                NaN         NaN   
2         NaN  NaN                NaN         NaN   
3         NaN  NaN                NaN         NaN   
4         NaN  NaN                NaN         NaN   

                                           tokenized  
0  [[tensor(0), tensor(133), tensor(47423), tenso...  
1  [[tensor(0), tensor(347), tensor(18101), tenso...  
2  [[tensor(0), tensor(49334), tensor(34391), ten...  
3  [[tensor(0), tensor(846), tenso

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# ✅ CodeBERT 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

# ✅ 토큰화된 CSV 데이터 로드
df = pd.read_csv("cve_tokenized.csv")

# ✅ 모델이 필요로 하는 필드 추가
def format_for_model(row):
    tokens = tokenizer(row["Description"], padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    return {
        "input_ids": tokens["input_ids"].squeeze().tolist(),
        "attention_mask": tokens["attention_mask"].squeeze().tolist(),
        "labels": 1 if "Injection" in row["Description"] else 0  # Binary classification (1: Injection, 0: Other)
    }

# ✅ 데이터 변환
formatted_data = df.apply(format_for_model, axis=1).tolist()

# ✅ Hugging Face Dataset 변환
dataset = Dataset.from_list(formatted_data)

# ✅ 변환된 데이터 확인
print(dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2000
})


In [None]:
# ✅ CodeBERT 모델 로드 (2개 클래스: Injection vs Other)
model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=2)

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir="./codebert_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,  # 🔥 로그를 10 스텝마다 출력
    learning_rate=2e-5,
    per_device_train_batch_size=16,  # 🔥 8 → 16으로 증가 (속도 향상)
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none",  # W&B 비활성화
    fp16=True  # 🔥 Mixed Precision Training 활성화 (속도 향상)
)



In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset
)

# ✅ 모델 학습 실행
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0004,0.00015
2,0.0002,8.7e-05
3,0.0002,7.4e-05


TrainOutput(global_step=375, training_loss=0.011374398589134216, metrics={'train_runtime': 201.8744, 'train_samples_per_second': 29.721, 'train_steps_per_second': 1.858, 'total_flos': 394666583040000.0, 'train_loss': 0.011374398589134216, 'epoch': 3.0})

In [None]:
# 모델 가중치 저장
model.save_pretrained("/content/codebert_model")

# ✅ 학습된 모델 다시 저장
model.save_pretrained("/content/codebert_model")

# ✅ 모델과 토크나이저 함께 저장
model.save_pretrained("/content/codebert_model")
tokenizer.save_pretrained("/content/codebert_model")

# ✅ 모델 압축 후 다운로드
import shutil
from google.colab import files

shutil.make_archive("/content/codebert_model", 'zip', "/content/codebert_model")
files.download("/content/codebert_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>