In [None]:
import os

# Check whether the model directory is present on the Colab VM's local disk.
local_model_dir = "/content/codebert_model"
model_dir_found = os.path.exists(local_model_dir)
print(model_dir_found)  # True: the path exists, False: it does not

False


In [None]:
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint directory on Google Drive that holds both tokenizer and model files.
model_path = "/content/drive/MyDrive/models/codebert_model"

# Drive has to be mounted before the checkpoint path becomes readable.
drive.mount('/content/drive')

# Tokenizer is loaded first, then the sequence-classification model,
# both from the same directory.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_path),
    AutoModelForSequenceClassification.from_pretrained(model_path),
)

print("✅ 모델 로드 완료!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ 모델 로드 완료!


In [None]:
import torch
# Report the installed PyTorch version and whether CUDA is usable.
for caption, value in (
    ("PyTorch 버전:", torch.__version__),
    ("CUDA 사용 가능:", torch.cuda.is_available()),
):
    print(caption, value)

PyTorch 버전: 2.6.0+cu124
CUDA 사용 가능: False


In [None]:
import torch

def analyze_code(code_snippet):
    """Classify a code snippet with the notebook's loaded sequence classifier.

    The snippet is tokenized to exactly 128 tokens (truncated or padded),
    passed through the module-level ``model`` with gradient tracking
    disabled, and the arg-max class index decides which result is returned.

    Args:
        code_snippet: Source code text to analyze.

    Returns:
        dict with three keys: ``prediction`` (display message), ``label``
        (1 when the predicted class index is 1, otherwise 0) and
        ``security_score`` (50 for label 1, 100 otherwise).
    """
    encoded = tokenizer(
        code_snippet,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    with torch.no_grad():
        class_index = torch.argmax(model(**encoded).logits, dim=1).item()

    # Every class index other than 1 is reported as safe.
    if class_index != 1:
        return {
            "prediction": "✅ 안전한 코드",
            "label": 0,
            "security_score": 100
        }

    return {
        "prediction": "⚠️ 취약점 있음 (예: Injection)",
        "label": 1,
        "security_score": 50
    }

In [None]:
# Sample snippet: a request parameter concatenated directly into a SQL string.
test_code = """
user_input = request.GET.get('id')
query = "SELECT * FROM users WHERE id = " + user_input
"""

result = analyze_code(test_code)

# Print the verdict first, then the score.
for caption, field in (("🔍 분석 결과:", "prediction"), ("🔐 보안 점수:", "security_score")):
    print(caption, result[field])

🔍 분석 결과: ✅ 안전한 코드
🔐 보안 점수: 100


In [None]:
def analyze_code(code_snippet):
    """Classify a snippet using the model plus a keyword-based fallback.

    The snippet is tokenized to exactly 128 tokens (truncated or padded) and
    run through the module-level ``model`` with gradient tracking disabled.
    Independently, the snippet is scanned case-insensitively for a fixed
    list of SQL keywords and string-concatenation fragments. The snippet is
    reported as risky when the model predicts class 1 OR any fragment is
    present.

    Args:
        code_snippet: Source code text to analyze.

    Returns:
        dict with keys ``prediction`` (display message), ``label`` (1 for
        risky, 0 for safe) and ``security_score`` (50 for risky, 100 for
        safe).
    """
    # CodeBERT classification
    inputs = tokenizer(code_snippet, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    with torch.no_grad():
        logits = model(**inputs).logits
        predicted = torch.argmax(logits, dim=1).item()

    # Supplementary check: plain substring scan for SQL keywords and
    # concatenation fragments.
    risky_keywords = ["SELECT", "DROP", "INSERT", "DELETE", "UPDATE", "WHERE", "' +", '" +', "+ user_input"]

    # Both sides are upper-cased. Previously only the snippet was, so the
    # mixed-case entry "+ user_input" could never match. The snippet is also
    # upper-cased once instead of once per keyword.
    snippet_upper = code_snippet.upper()
    risk_found = any(kw.upper() in snippet_upper for kw in risky_keywords)

    # NOTE: this is still a substring heuristic, so a parameterized query that
    # merely contains SELECT/WHERE is flagged as well.

    # Combined verdict: model prediction or keyword hit
    if predicted == 1 or risk_found:
        return {"prediction": "⚠️ Injection 위험 가능성 있음", "label": 1, "security_score": 50}
    else:
        return {"prediction": "✅ 안전한 코드", "label": 0, "security_score": 100}

In [None]:
# Same concatenated-SQL sample, evaluated with the current analyze_code.
test_code = """
user_input = request.GET.get('id')
query = "SELECT * FROM users WHERE id = " + user_input
"""

result = analyze_code(test_code)
prediction_text, score = result["prediction"], result["security_score"]

print("🔍 분석 결과:", prediction_text)
print("🔐 보안 점수:", score)

🔍 분석 결과: ⚠️ Injection 위험 가능성 있음
🔐 보안 점수: 50


In [None]:
# SQL injection (vulnerable)
code_1 = """
user_input = request.args.get('username')
query = "SELECT * FROM users WHERE name = '" + user_input + "'"
"""

# Hardcoded password (vulnerable)
code_2 = """
password = "admin123"
login(user, password)
"""

# XSS (vulnerable)
code_3 = """
@app.route('/search')
def search():
    keyword = request.args.get('q')
    return "<p>" + keyword + "</p>"
"""

# Parameterized query that prevents SQL injection (not vulnerable)
code_4 = """
cursor.execute("SELECT * FROM users WHERE id = %s", (user_id,))
"""

# Escaped output that prevents XSS (not vulnerable)
code_5 = """
@app.route('/search')
def search():
    keyword = escape(request.args.get('q'))
    return f"<p>{keyword}</p>"
"""

test_cases = [code_1, code_2, code_3, code_4, code_5]

# Analyze the samples in order and print one numbered summary per case,
# each followed by a blank line.
for idx in range(1, len(test_cases) + 1):
    code = test_cases[idx - 1]
    result = analyze_code(code)
    print(f"[Test Case {idx}] 🔍 결과: {result['prediction']} | 🔐 점수: {result['security_score']}\n")

[Test Case 1] 🔍 결과: ⚠️ Injection 위험 가능성 있음 | 🔐 점수: 50

[Test Case 2] 🔍 결과: ✅ 안전한 코드 | 🔐 점수: 100

[Test Case 3] 🔍 결과: ⚠️ Injection 위험 가능성 있음 | 🔐 점수: 50

[Test Case 4] 🔍 결과: ⚠️ Injection 위험 가능성 있음 | 🔐 점수: 50

[Test Case 5] 🔍 결과: ✅ 안전한 코드 | 🔐 점수: 100



In [None]:
# 위에서 잘못된 결과가 몇 개 나와서 아무래도 데이터셋을 좀 더 학습시키는 게 좋을 듯
# 지금 상태는 그냥 껍데기만 얼추 기워낸 수준이라...
# 우선 데이터셋 추가로 찾아서 라벨링해서 학습시키고 점수 체계도 좀 더 세분화하기

In [None]:
from google.colab import files

# Ask Colab for a file upload and keep the return value of files.upload() in
# `uploaded`. In the recorded run the uploaded file was labeled_vuln_data.csv.
uploaded = files.upload()

Saving labeled_vuln_data.csv to labeled_vuln_data.csv


In [None]:
import pandas as pd

# Read the uploaded CSV file into a DataFrame.
csv_name = "labeled_vuln_data.csv"
df = pd.read_csv(csv_name)

# Confirm the load and preview the first five rows.
print("✅ CSV 로드 완료!")
preview = df.head(5)
print(preview)

✅ CSV 로드 완료!
                                                code  label
0  user_input = request.args.get('id')\nquery = '...      1
1  cursor.execute('SELECT * FROM users WHERE id =...      0
2                              password = 'admin123'      2
3                    API_KEY = 'sk-1234567890abcdef'      2
4               <p> + request.args.get("q") + "</p>"      3


In [None]:
!pip install datasets
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the CodeBERT tokenizer and a sequence-classification model
# configured for 5 output classes from the same base checkpoint.
base_checkpoint = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    base_checkpoint,
    num_labels=5,
)

# Tokenization function applied to each dataset row
def tokenize(example):
    """Tokenize one row's code and attach its class id under ``labels``.

    Args:
        example: Mapping with a ``code`` entry (text to tokenize) and a
            ``label`` entry (class id).

    Returns:
        The tokenizer output for ``example["code"]``, truncated or padded to
        128 tokens, with an added ``labels`` entry copied from
        ``example["label"]``.
    """
    encoded = tokenizer(
        example["code"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    encoded["labels"] = example["label"]
    return encoded

# Convert the DataFrame to a Hugging Face Dataset and tokenize every row.
dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(tokenize)

# Hold out 20% of the rows for evaluation. Previously the same dataset was
# passed as both train_dataset and eval_dataset, so the reported validation
# loss was measured on the training rows. The fixed seed makes the split
# reproducible across runs.
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# Training configuration
# NOTE(review): newer transformers releases name the first strategy argument
# `eval_strategy` — confirm against the installed version before renaming.
training_args = TrainingArguments(
    output_dir="./codebert_finetuned",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # The recorded run finished after 3 optimizer steps, so logging every 10
    # steps never reported a training loss ("No log"). Log every step.
    logging_steps=1,
    report_to="none"
)

# Trainer: fit on the training split, evaluate on the held-out split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"]
)

# Run fine-tuning. This stays the last expression so the cell still displays
# the returned TrainOutput.
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/15 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,1.596481
2,No log,1.581815
3,No log,1.574679


TrainOutput(global_step=3, training_loss=1.6101417541503906, metrics={'train_runtime': 124.6309, 'train_samples_per_second': 0.361, 'train_steps_per_second': 0.024, 'total_flos': 2960079102720.0, 'train_loss': 1.6101417541503906, 'epoch': 3.0})

In [None]:
# Save the further-trained model and its tokenizer into one directory.
save_dir = "./codebert_finetuned"
model.save_pretrained(save_dir)
# Kept as the cell's last expression so the returned file paths are displayed.
tokenizer.save_pretrained(save_dir)

('./codebert_finetuned/tokenizer_config.json',
 './codebert_finetuned/special_tokens_map.json',
 './codebert_finetuned/vocab.json',
 './codebert_finetuned/merges.txt',
 './codebert_finetuned/added_tokens.json',
 './codebert_finetuned/tokenizer.json')

In [None]:
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Directory the fine-tuned model was saved to.
model_path = "./codebert_finetuned"

# Reload the tokenizer and the model from that directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

num_classes = model.config.num_labels
print("✅ 모델과 토크나이저 불러오기 성공!")
print("모델의 클래스 수:", num_classes)

# List the files that ended up in the save directory.
saved_files = os.listdir(model_path)
print("📁 저장된 파일 목록:", saved_files)

✅ 모델과 토크나이저 불러오기 성공!
모델의 클래스 수: 5
📁 저장된 파일 목록: ['model.safetensors', 'tokenizer.json', 'config.json', 'special_tokens_map.json', 'merges.txt', 'vocab.json', 'checkpoint-3', 'tokenizer_config.json']
