In [2]:
from datasets import load_dataset

# CodeXGLUE defect-detection dataset; later cells index the result by
# split name ("train", "validation", "test").
DATASET_ID = "google/code_x_glue_cc_defect_detection"
dataset = load_dataset(DATASET_ID)


In [3]:
import pandas as pd

# Materialise every split as a DataFrame first ...
df_train, df_valid, df_test = (
    pd.DataFrame(dataset[split_name])
    for split_name in ("train", "validation", "test")
)

# ... then write each one to its own CSV file, without the index column.
for frame, csv_name in (
    (df_train, "devign_train.csv"),
    (df_valid, "devign_valid.csv"),
    (df_test, "devign_test.csv"),
):
    frame.to_csv(csv_name, index=False)


In [4]:
from transformers import RobertaTokenizer, RobertaModel
import torch

# Tokenizer and encoder are both loaded from the same CodeBERT checkpoint.
CODEBERT_CHECKPOINT = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(CODEBERT_CHECKPOINT)
model = RobertaModel.from_pretrained(CODEBERT_CHECKPOINT)

# Inference only: switch to eval mode. Kept as the cell's last expression,
# so the notebook also displays the module summary.
model.eval()


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [17]:
# Function to embed one C function using CodeBERT
def embed_codebert(func_str, tokenizer, model, max_length=256):
    """Embed a single source-code string with CodeBERT.

    Args:
        func_str: Source code of one function, as a string.
        tokenizer: Callable tokenizer; invoked with ``return_tensors="pt"``,
            ``truncation=True`` and ``max_length``.
        model: ``torch.nn.Module`` whose output exposes ``last_hidden_state``.
        max_length: Maximum number of tokens kept after truncation.

    Returns:
        np.ndarray: Hidden state of the token at position 0, with the leading
        batch axis (size 1 for a single string) removed.
    """
    # Run on whatever device the model currently lives on. Another cell moves
    # the model to CUDA; CPU inputs would then fail with a device mismatch.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Tokenize, then move every input tensor next to the model weights.
    inputs = tokenizer(func_str, return_tensors="pt", truncation=True, max_length=max_length)
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Position 0 is used as the sequence-level ([CLS]-style) representation.
    # Squeeze only the batch axis, and copy to host memory before converting:
    # .numpy() is not available on a CUDA tensor.
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze(0)
    return cls_embedding.cpu().numpy()


In [7]:
import numpy as np
from tqdm import tqdm

# Work on a small slice first (the first 1000 functions) to keep this quick.
sample_size = 1000
train_split = dataset["train"]
funcs = train_split["func"][:sample_size]
labels = train_split["target"][:sample_size]

# Embed the functions one at a time, collecting one vector per function.
embeddings = []
for func in tqdm(funcs, desc="Embedding functions"):
    try:
        vec = embed_codebert(func, tokenizer, model)
    except Exception as e:
        # Keep rows aligned with `labels`: report the failure and substitute
        # an all-zero vector for this function.
        print("Error embedding function:", e)
        vec = np.zeros(768)  # fallback if error
    embeddings.append(vec)

# Stack into the feature matrix and the label vector.
X = np.vstack(embeddings)
y = np.array(labels)


Embedding functions: 100%|█████████████████████████████████████████████████████████| 1000/1000 [10:34<00:00,  1.58it/s]


In [5]:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Hold out 20% of the samples for internal validation (fixed seed).
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

# Fit the logistic-regression baseline; fit() hands back the estimator.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out split and print the per-class report.
y_pred = clf.predict(X_val)
report = classification_report(y_val, y_pred)
print(report)


NameError: name 'X' is not defined

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Fit a 100-tree random forest (fixed seed) on the training split.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Validation predictions, then a header line followed by the report.
y_rf_pred = rf_clf.predict(X_val)
print("📊 Random Forest Evaluation:\n")
rf_report = classification_report(y_val, y_rf_pred)
print(rf_report)


In [None]:
def embed_functions_batched(functions, tokenizer, model, batch_size=32, max_len=256):
    """
    Embed a list of C/C++ functions using CodeBERT in batches.

    Args:
        functions: Sequence of source-code strings (must support len() and slicing).
        tokenizer: Callable tokenizer; invoked with padding and truncation enabled.
        model: ``torch.nn.Module`` whose output exposes ``last_hidden_state``.
        batch_size: Number of functions encoded per forward pass.
        max_len: Maximum number of tokens kept per function.

    Returns: np.ndarray of shape (n_samples, hidden_size) holding the hidden
    state at position 0 of every function (hidden_size is 768 for the model
    loaded in this notebook). Empty input yields an array with zero rows.
    """
    model.eval()

    # Follow the model's current device instead of assuming the CPU, so the
    # function keeps working after the model has been moved to a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    all_embeddings = []

    for i in range(0, len(functions), batch_size):
        # Hand the tokenizer a plain list regardless of the container type.
        batch = list(functions[i:i + batch_size])
        inputs = tokenizer(batch, return_tensors="pt", padding=True,
                           truncation=True, max_length=max_len)
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # Extract [CLS] embeddings (position 0) and bring them back to host memory.
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(cls_embeddings)

    if not all_embeddings:
        # np.vstack([]) raises; return a well-formed empty matrix instead.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 768)
        return np.empty((0, hidden_size), dtype=np.float32)

    return np.vstack(all_embeddings)


In [None]:
import xgboost as xgb
from sklearn.metrics import classification_report

# Gather the classifier settings in one place, then build and fit the model.
xgb_params = dict(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(X_train, y_train)

# Predictions for the validation split.
y_xgb_pred = xgb_clf.predict(X_val)

# Header line, then the per-class report.
print("📊 XGBoost Evaluation:\n")
xgb_report = classification_report(y_val, y_xgb_pred)
print(xgb_report)



In [None]:
# Take the complete function and label columns of the training split.
train_split = dataset["train"]
funcs_train_full, labels_train_full = train_split["func"], train_split["target"]

print("Total training functions:", len(funcs_train_full))


In [13]:
import json

# Point this at the notebook file to inspect.
notebook_path = "model.ipynb"

# The notebook file is parsed as JSON.
with open(notebook_path, "r", encoding="utf-8") as f:
    notebook = json.load(f)

# Keep code cells only, gluing each cell's source fragments into one string.
code_blocks = [
    "".join(cell["source"])
    for cell in notebook["cells"]
    if cell["cell_type"] == "code"
]

# Print all code blocks, numbered from 1.
for i, block in enumerate(code_blocks):
    print(f"# Block {i+1}\n{block}\n")


# Block 1
from datasets import load_dataset

# Load the CodeXGLUE vulnerability detection dataset
dataset = load_dataset("google/code_x_glue_cc_defect_detection")


# Block 2
import pandas as pd

# Convert and save
df_train = pd.DataFrame(dataset["train"])
df_valid = pd.DataFrame(dataset["validation"])
df_test  = pd.DataFrame(dataset["test"])

df_train.to_csv("devign_train.csv", index=False)
df_valid.to_csv("devign_valid.csv", index=False)
df_test.to_csv("devign_test.csv", index=False)


# Block 3
from transformers import RobertaTokenizer, RobertaModel
import torch

# Load CodeBERT tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")

# Set model to eval mode (no training)
model.eval()


# Block 4
# Function to embed one C function using CodeBERT
def embed_codebert(func_str, tokenizer, model, max_length=256):
    # Tokenize
    inputs = tokenizer(func_str, return_tensors="pt", truncat

In [None]:
# Step 1: embed every function of the training split, batch by batch.
train_split = dataset["train"]
funcs_train_full = train_split["func"]
labels_train_full = train_split["target"]

print(f"Total training functions: {len(funcs_train_full)}")

embeddings_full = embed_functions_batched(
    funcs_train_full,
    tokenizer,
    model,
    batch_size=32,
    max_len=256,
)
print(f"Embeddings shape: {embeddings_full.shape}")


In [None]:
import torch
import numpy as np

def embed_functions_batched(functions, tokenizer, model, batch_size=16, max_len=256):
    """
    Embed a list of C/C++ functions using CodeBERT in batches.

    Args:
        functions: Sequence of source-code strings (must support len() and slicing).
        tokenizer: Callable tokenizer; invoked with padding and truncation enabled.
        model: ``torch.nn.Module`` whose output exposes ``last_hidden_state``.
        batch_size: Number of functions encoded per forward pass.
        max_len: Maximum number of tokens kept per function.

    Returns: np.ndarray of shape (n_samples, hidden_size) holding the hidden
    state at position 0 of every function (hidden_size is 768 for the model
    loaded in this notebook). Empty input yields an array with zero rows.
    """
    model.eval()

    # Send inputs to the device the model is actually on. A hard-coded 'cuda'
    # fails on CPU-only machines and whenever the model was left on the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    on_gpu = device.type == "cuda"

    all_embeddings = []

    for i in range(0, len(functions), batch_size):
        # Hand the tokenizer a plain list regardless of the container type.
        batch = list(functions[i:i + batch_size])
        inputs = tokenizer(batch, return_tensors="pt", padding=True,
                           truncation=True, max_length=max_len)
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # Position-0 ([CLS]-style) vectors, copied back to host memory.
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(cls_embeddings)

        # Clear cache after each batch to avoid OOM (only meaningful on GPU)
        if on_gpu:
            torch.cuda.empty_cache()

    if not all_embeddings:
        # np.vstack([]) raises; return a well-formed empty matrix instead.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 768)
        return np.empty((0, hidden_size), dtype=np.float32)

    return np.vstack(all_embeddings)


In [16]:
import torch
from tqdm import tqdm
import numpy as np

# Prefer the GPU when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Place the already-loaded model on that device and keep it in eval mode.
model.to(device)
model.eval()

def embed_functions_batched_safe(functions, tokenizer, model, batch_size=8, max_len=256):
    """
    Embed a list of C/C++ functions using CodeBERT in batches.

    A batch that raises during tokenization or the forward pass is reported
    on stdout and replaced by all-zero rows, so the output stays row-aligned
    with ``functions``.

    Args:
        functions: Sequence of source-code strings (must support len() and slicing).
        tokenizer: Callable tokenizer; invoked with padding and truncation enabled.
        model: ``torch.nn.Module`` whose output exposes ``last_hidden_state``.
        batch_size: Number of functions encoded per forward pass.
        max_len: Maximum number of tokens kept per function.

    Returns: np.ndarray of shape (n_samples, hidden_size); hidden_size is read
    from ``model.config.hidden_size`` and falls back to 768 when absent.
    """
    # Use the device of the model that was passed in rather than a notebook
    # global, so the function does not depend on another cell having run.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    hidden_size = getattr(getattr(model, "config", None), "hidden_size", 768)

    all_embeddings = []

    for i in tqdm(range(0, len(functions), batch_size), desc="Embedding batches"):
        # Hand the tokenizer a plain list regardless of the container type.
        batch = list(functions[i:i + batch_size])

        try:
            inputs = tokenizer(batch, return_tensors="pt", padding=True,
                               truncation=True, max_length=max_len)
            inputs = {k: v.to(model_device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)

            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embeddings.append(cls_embeddings)

            # Optional: clear cache if you face memory issues
            # torch.cuda.empty_cache()

        except Exception as e:
            # Deliberate best-effort: keep going, but leave a visible trace.
            print(f"Error at batch starting index {i}: {e}")
            # float32 placeholder of the model's width: a float64 block would
            # silently upcast the whole stacked result.
            all_embeddings.append(np.zeros((len(batch), hidden_size), dtype=np.float32))

    if not all_embeddings:
        # np.vstack([]) raises; return a well-formed empty matrix instead.
        return np.empty((0, hidden_size), dtype=np.float32)

    return np.vstack(all_embeddings)

# Usage: embed the complete training split, eight functions per batch.
train_split = dataset["train"]
funcs_train_full = train_split["func"]

embeddings_full = embed_functions_batched_safe(
    funcs_train_full,
    tokenizer,
    model,
    batch_size=8,
)

print(f"Embedding complete. Shape: {embeddings_full.shape}")


Embedding batches: 100%|███████████████████████████████████████████████████████████| 2732/2732 [55:52<00:00,  1.23s/it]


Embedding complete. Shape: (21854, 768)


In [9]:
import numpy as np
import pandas as pd

# Read the embedding matrix and the label column back from disk.
# NOTE(review): no visible cell writes these two files — confirm where they
# are produced before relying on Restart & Run All.
embeddings_path = "codebert_embeddings.npy"
labels_path = "codebert_labels.csv"

embeddings_full = np.load(embeddings_path)
labels_frame = pd.read_csv(labels_path)
labels_train_full = labels_frame["label"].values

print("✅ Loaded embeddings and labels from disk.")
print(f"Embeddings shape: {embeddings_full.shape}")
print(f"Labels shape: {labels_train_full.shape}")


✅ Loaded embeddings and labels from disk.
Embeddings shape: (21854, 768)
Labels shape: (21854,)


In [10]:
import numpy as np
import pandas as pd

# Feature matrix and label vector, read from the saved files.
X = np.load("codebert_embeddings.npy")
saved_labels = pd.read_csv("codebert_labels.csv")
y = saved_labels["label"].values

print(f"Loaded embeddings: {X.shape}, labels: {y.shape}")


Loaded embeddings: (21854, 768), labels: (21854,)


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Stratified 80/20 split of the full data (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    embeddings_full,
    labels_train_full,
    test_size=0.2,
    random_state=42,
    stratify=labels_train_full,
)

# Build the logistic-regression model and fit it; fit() hands back the estimator.
clf = LogisticRegression(max_iter=1000, n_jobs=-1).fit(X_train, y_train)

# Validation predictions, then a header line followed by the report.
y_pred = clf.predict(X_val)
print("Logistic Regression Performance on Full Dataset:\n")
lr_report = classification_report(y_val, y_pred)
print(lr_report)


Logistic Regression Performance on Full Dataset:

              precision    recall  f1-score   support

       False       0.61      0.74      0.67      2367
        True       0.59      0.44      0.50      2004

    accuracy                           0.60      4371
   macro avg       0.60      0.59      0.58      4371
weighted avg       0.60      0.60      0.59      4371



In [12]:
from sklearn.ensemble import RandomForestClassifier

# Random forest settings: 100 trees, fixed seed, all cores.
rf_params = dict(n_estimators=100, random_state=42, n_jobs=-1)
rf_clf = RandomForestClassifier(**rf_params)
rf_clf.fit(X_train, y_train)

# Validation predictions, then a header line followed by the report.
y_rf_pred = rf_clf.predict(X_val)
print("Random Forest Performance on Full Dataset:\n")
rf_report = classification_report(y_val, y_rf_pred)
print(rf_report)


Random Forest Performance on Full Dataset:

              precision    recall  f1-score   support

       False       0.59      0.74      0.66      2367
        True       0.56      0.40      0.47      2004

    accuracy                           0.58      4371
   macro avg       0.58      0.57      0.56      4371
weighted avg       0.58      0.58      0.57      4371



In [13]:
import xgboost as xgb

# Gather the XGBoost settings in one place, then build and fit the model.
xgb_params = dict(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
)
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(X_train, y_train)

# Predictions for the validation split.
y_xgb_pred = xgb_clf.predict(X_val)

print("📊 XGBoost Performance on Full Dataset:\n")
xgb_report = classification_report(y_val, y_xgb_pred)
print(xgb_report)




📊 XGBoost Performance on Full Dataset:

              precision    recall  f1-score   support

       False       0.59      0.65      0.62      2367
        True       0.53      0.48      0.50      2004

    accuracy                           0.57      4371
   macro avg       0.56      0.56      0.56      4371
weighted avg       0.57      0.57      0.57      4371



In [14]:
import joblib

# Persist the fitted XGBoost classifier with joblib.
model_path = "vuln_xgb_model.pkl"
joblib.dump(xgb_clf, model_path)

print("✅ Model saved as vuln_xgb_model.pkl")


✅ Model saved as vuln_xgb_model.pkl


In [15]:
from sklearn.metrics import classification_report, accuracy_score

# Score the XGBoost classifier on the validation split.
y_pred = xgb_clf.predict(X_val)

# Per-class report first, overall accuracy last.
print("📊 Evaluation Results:")
eval_report = classification_report(y_val, y_pred)
print(eval_report)
accuracy = accuracy_score(y_val, y_pred)
print("✅ Accuracy:", accuracy)



📊 Evaluation Results:
              precision    recall  f1-score   support

       False       0.59      0.65      0.62      2367
        True       0.53      0.48      0.50      2004

    accuracy                           0.57      4371
   macro avg       0.56      0.56      0.56      4371
weighted avg       0.57      0.57      0.57      4371

✅ Accuracy: 0.5696636925188744


In [16]:
from transformers import RobertaTokenizer, RobertaModel
import torch
import joblib
import numpy as np

# Tokenizer and encoder from the same CodeBERT checkpoint, in eval mode.
CODEBERT_CHECKPOINT = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(CODEBERT_CHECKPOINT)
model = RobertaModel.from_pretrained(CODEBERT_CHECKPOINT)
model.eval()

# Restore the trained XGBoost model written by joblib.dump.
# NOTE(review): joblib.load unpickles the file — only load files you trust.
xgb_clf = joblib.load("vuln_xgb_model.pkl")

# Prefer the GPU when PyTorch reports one as available.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Last expression of the cell: moves the model and displays its summary.
model.to(device)


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [23]:
def predict_vulnerability(code_snippet: str) -> str:
    """
    Classify one C/C++ snippet as vulnerable or safe with CodeBERT + XGBoost.

    Relies on the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``xgb_clf``. Returns a formatted line holding the label and the
    probability the classifier assigns to the predicted class.
    """
    # Encode the snippet and move the tensors to the model's device.
    encoded = tokenizer(code_snippet, return_tensors="pt", padding=True, truncation=True, max_length=256)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Forward pass without gradients; keep the position-0 ([CLS]) vector.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        embedding = hidden_states[:, 0, :].cpu().numpy()  # shape: (1, 768)

    # Class decision and per-class probabilities from the XGBoost model.
    prediction = xgb_clf.predict(embedding)[0]
    proba = xgb_clf.predict_proba(embedding)[0]

    if prediction == 1:
        label = "VULNERABLE"
    else:
        label = "SAFE"
    confidence = proba[prediction]

    return f"🔍 Prediction: {label} (Confidence: {confidence:.2f})"


In [26]:
# Deliberately unsafe C sample: it reads input with gets() into a local
# 64-byte buffer. The doubled backslash in "\\n" keeps the two-character
# escape sequence in the C text instead of inserting a real newline.
unsafe_code = """
#include <stdio.h>

void vulnerable_function() {
    char buffer[64];
    printf("Enter your name: ");
    gets(buffer);  // ❌ Unsafe: allows buffer overflow
    printf("Hello, %s\\n", buffer);
}
"""


In [27]:
import torch
import joblib
import numpy as np

def predict_vulnerability(code_snippet, model, tokenizer, clf_model_path="vuln_xgb_model.pkl"):
    """
    Predict if the given code snippet is vulnerable using CodeBERT and trained ML model.

    Args:
        code_snippet: Source code to classify, as a string.
        model: ``torch.nn.Module`` encoder whose output exposes
            ``last_hidden_state``; it is moved to the selected device and put
            in eval mode as a side effect.
        tokenizer: Callable tokenizer matching ``model``.
        clf_model_path: Path of the joblib file holding a classifier with
            ``predict`` and ``predict_proba``.

    Returns:
        str: The result line, which is also printed, e.g.
        ``"🔍 Prediction: SAFE (confidence: 0.73)"``. Returning it lets
        callers use the outcome instead of parsing stdout.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Tokenize and embed
    model.to(device)
    model.eval()

    inputs = tokenizer(code_snippet, return_tensors="pt", padding=True, truncation=True, max_length=256)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        # Position-0 ([CLS]) vector, copied to host memory for the classifier.
        cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # shape (1, 768)

    # Load classifier
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code — only pass paths to model files you trust.
    clf = joblib.load(clf_model_path)

    # Predict: class decision, then the probability of that same class.
    pred = clf.predict(cls_embedding)[0]
    proba = clf.predict_proba(cls_embedding)[0][pred]

    label = "VULNERABLE" if pred == 1 else "SAFE"
    message = f"🔍 Prediction: {label} (confidence: {proba:.2f})"
    print(message)
    return message
