# Load lib and setup

In [105]:
import pandas as pd
import json
import os

from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [106]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline


# Load data

In [107]:
DATAPATH = [
    "./OutputOpcodes/reentrancy/",
    "./OutputOpcodes/gaslimit/",
    "./OutputOpcodes/integeroverflow/",
]
data_list = []
for path in DATAPATH:
    for file in os.listdir(path):
        if file.endswith(".json"):
            with open(os.path.join(path, file), 'r') as f:
                data = json.load(f)
                data_list.append({
                    "opcodes": data["opcodes"],
                    "label": path.split("/")[-2]
                })
                
# Create a DataFrame from the list of dictionaries
data = pd.DataFrame(data_list)
data


Unnamed: 0,opcodes,label
0,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",reentrancy
1,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",reentrancy
2,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",reentrancy
3,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",reentrancy
4,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",reentrancy
...,...,...
709,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",integeroverflow
710,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",integeroverflow
711,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",integeroverflow
712,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",integeroverflow


# EDA

In [108]:
# Import plotly
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter

# 1. Distribution of labels
label_counts = data['label'].value_counts()
fig = px.bar(x=label_counts.index, y=label_counts.values,
             title='Distribution of Vulnerability Types',
             labels={'x': 'Vulnerability Type', 'y': 'Count'})
fig.show()


In [109]:
# Create balanced dataset with 199 samples per label
balanced_data = pd.concat([
    data[data['label'] == label].sample(n=199, random_state=42) 
    for label in data['label'].unique()
])

# Shuffle the balanced dataset
data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

label_counts = data['label'].value_counts()
fig = px.bar(x=label_counts.index, y=label_counts.values,
             title='Distribution of Vulnerability Types',
             labels={'x': 'Vulnerability Type', 'y': 'Count'})
fig.show()


In [110]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 597 entries, 0 to 596
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   opcodes  597 non-null    object
 1   label    597 non-null    object
dtypes: object(2)
memory usage: 9.5+ KB


In [111]:
type(data['opcodes'][0])


list

In [112]:
# Clean opcodes
def clean_opcodes(opcode_list):
    # Remove UNKNOWN and INVALID opcodes
    cleaned = [op for op in opcode_list if not (op.startswith('UNKNOWN_') or op.startswith('INVALID_'))]
    
    # Remove numeric values after opcodes (e.g., PUSH1, PUSH2 -> PUSH)
    cleaned = [op.rstrip('0123456789') for op in cleaned]
    
    return cleaned


In [113]:
data['opcodes'] = data['opcodes'].apply(clean_opcodes)
data['opcodes'] = data['opcodes'].apply(lambda x: ' '.join(x))
data['opcodes'][0]

'PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI PUSH DUP REVERT JUMPDEST JUMPDEST PUSH DUP PUSH PUSH CODECOPY PUSH RETURN STOP PUSH PUSH MSTORE PUSH PUSH PUSH CALLDATALOAD DIV AND PUSH DUP EQ PUSH JUMPI JUMPDEST PUSH DUP REVERT JUMPDEST CALLVALUE ISZERO PUSH JUMPI PUSH DUP REVERT JUMPDEST PUSH PUSH PUSH CALLDATALOAD AND PUSH JUMP JUMPDEST STOP JUMPDEST PUSH DUP AND PUSH DUP DUP MSTORE PUSH DUP SWAP MSTORE PUSH DUP DUP SHA DUP SLOAD SWAP SWAP SSTORE SWAP SWAP SWAP DUP SWAP MLOAD PUSH PUSH MLOAD DUP DUP SUB DUP DUP DUP PUSH GAS SUB CALL SWAP POP POP POP ISZERO ISZERO PUSH JUMPI PUSH DUP REVERT JUMPDEST JUMPDEST POP POP JUMP STOP LOG PUSH SHA PUSH RETURN DUP SWAP DUP STOP'

In [114]:
data

Unnamed: 0,opcodes,label
0,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,reentrancy
1,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,integeroverflow
2,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,reentrancy
3,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,reentrancy
4,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,gaslimit
...,...,...
592,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,reentrancy
593,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,reentrancy
594,PUSH PUSH MSTORE PUSH DUP MLOAD SWAP DUP ADD P...,gaslimit
595,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,integeroverflow


# Data preprocess

In [116]:
# Split data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['opcodes'], data['label'], test_size=0.2, random_state=42)

X_train.shape,X_test.shape,y_train.shape,y_test.shape


((477,), (120,), (477,), (120,))

In [117]:
# Vectorize the opcodes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
vectorizer = CountVectorizer()
labelProcess = LabelEncoder()
y_train_encoded = labelProcess.fit_transform(y_train)
y_test_encoded = labelProcess.transform(y_test)

# Modeling

In [127]:
y_test_encoded

array([2, 1, 2, 1, 1, 1, 2, 1, 0, 1, 0, 1, 1, 2, 2, 2, 1, 2, 2, 0, 2, 1,
       1, 2, 0, 0, 1, 0, 2, 1, 2, 1, 1, 1, 1, 0, 2, 1, 2, 2, 1, 1, 1, 2,
       1, 1, 0, 2, 0, 0, 2, 2, 0, 0, 0, 2, 1, 1, 0, 1, 2, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 2, 2, 0, 1, 0, 0, 2, 0, 2, 2, 2, 2, 2, 0, 1, 0, 2, 0,
       1, 0, 2, 0, 2, 2, 0, 0, 2, 2, 1, 0, 0, 0, 0, 0, 2, 2, 0, 2, 2, 2,
       0, 0, 2, 2, 0, 1, 0, 1, 1, 2])

In [140]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
hyperParams = {
    'classifier__n_estimators': [100, 150, 200],
    'classifier__learning_rate': [0.1, 0.2, 0.3],
    'classifier__max_depth': [3, 5, 7],
    'classifier__subsample': [0.8, 0.9, 1.0],
    'classifier__min_samples_split': [2, 4, 6]
}
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', GradientBoostingClassifier())
])
grid_search = GridSearchCV(pipeline, hyperParams, cv=5, n_jobs=-1, verbose=1)
print(grid_search)
grid_search.fit(X_train, y_train_encoded)
score = grid_search.score(X_test, y_test_encoded)
print(f"Accuracy: {score:.6}")

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vectorizer', CountVectorizer()),
                                       ('classifier',
                                        GradientBoostingClassifier())]),
             n_jobs=-1,
             param_grid={'classifier__learning_rate': [0.1, 0.2, 0.3],
                         'classifier__max_depth': [3, 5, 7],
                         'classifier__min_samples_split': [2, 4, 6],
                         'classifier__n_estimators': [100, 150, 200],
                         'classifier__subsample': [0.8, 0.9, 1.0]},
             verbose=1)
Fitting 5 folds for each of 243 candidates, totalling 1215 fits


Accuracy: 0.8


In [141]:
# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report

# Get predictions
y_pred = grid_search.predict(X_test)

# Calculate accuracy using encoded labels
accuracy = accuracy_score(y_test_encoded, y_pred)

# Generate classification report using encoded labels
report = classification_report(
    y_test_encoded, 
    y_pred,
    target_names=labelProcess.classes_  # Use original label names in the report
)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)

Accuracy: 0.80

Classification Report:
                 precision    recall  f1-score   support

       gaslimit       1.00      0.98      0.99        41
integeroverflow       0.69      0.67      0.68        36
     reentrancy       0.71      0.74      0.73        43

       accuracy                           0.80       120
      macro avg       0.80      0.80      0.80       120
   weighted avg       0.80      0.80      0.80       120



In [142]:
# save the model
import joblib
joblib.dump(grid_search, 'model.joblib')


['model.joblib']