In [9]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [10]:
secure_df = pd.read_csv("secure.csv")
vulnerable_df = pd.read_csv("vulnerable.csv")

In [11]:
df = pd.concat([secure_df, vulnerable_df], ignore_index=True)
print(len(df))

36671


In [12]:
contract_codes = []
for hash_id in df['hash_id']:
    file_path = f"source/{hash_id}.sol"  
    with open(file_path, 'r', encoding='utf-8') as file:
        contract_codes.append(file.read())

df['contract_code'] = contract_codes

In [6]:
import torch
print(torch.__version__)

2.6.0+cpu


In [13]:
from transformers import RobertaTokenizer, RobertaModel
import torch

tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")

def get_codebert_embedding(code):
    inputs = tokenizer(code, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()  

df['code_embedding'] = df['contract_code'].apply(get_codebert_embedding)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


KeyboardInterrupt: 

In [15]:
def get_codebert_embedding_batch(codes):
    inputs = tokenizer(codes, return_tensors="pt", max_length=512, truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

batch_size = 8  
embeddings = []
for i in range(0, len(df), batch_size):
    batch = df['contract_code'].iloc[i:i+batch_size]
    embeddings.extend(get_codebert_embedding_batch(batch))
df['code_embedding'] = embeddings

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [3]:
print(torch.cuda.is_available())

False


In [16]:
from transformers import RobertaTokenizer, RobertaModel
import torch
import numpy as np
from tqdm import tqdm

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model (Optimized)
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base", use_fast=True)
model = RobertaModel.from_pretrained("microsoft/codebert-base").to(device)
model.eval()  # Set to evaluation mode

# Set batch size (Optimized for P100 GPU)
BATCH_SIZE = 32  # Increase if GPU memory allows

def batch_get_codebert_embedding(texts):
    """Processes a batch of contract codes into embeddings."""
    inputs = tokenizer(texts, return_tensors="pt", max_length=512, truncation=True, padding="max_length").to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

# Process in batches
all_embeddings = []
contracts = df['contract_code'].tolist()  # Convert DataFrame column to list

for i in tqdm(range(0, len(contracts), BATCH_SIZE), desc="Processing Batches"):
    batch = contracts[i:i + BATCH_SIZE]  # Get batch
    batch_embeddings = batch_get_codebert_embedding(batch)  # Compute embeddings
    all_embeddings.extend(batch_embeddings)  # Store results

df['code_embedding'] = list(all_embeddings)


Processing Batches:   3%|▎         | 32/1146 [05:15<3:03:03,  9.86s/it]


KeyboardInterrupt: 