# Load libraries

In [8]:
import pandas as pd
import json
import os


# Load Data

In [9]:
# Directories to read; each one holds the JSON files for a single label.
DATAPATH = [
    "./filtered_results/reentrancy/",
    "./filtered_results/gaslimit/",
    "./filtered_results/integeroverflow/",
]

# One record per JSON file: its "opcodes" entry plus a label taken from the
# name of the directory the file was found in.
data_list = []
for path in DATAPATH:
    # normpath drops a trailing slash, so the label is the directory name
    # whether or not the DATAPATH entry ends with "/".
    label = os.path.basename(os.path.normpath(path))
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and any later seeded sampling) stable across runs and machines.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".json"):
            continue
        with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
            # Distinct name so the per-file dict does not shadow the final
            # `data` DataFrame built below.
            record = json.load(f)
        data_list.append({
            "opcodes": record["opcodes"],
            "label": label,
        })

# Create a DataFrame from the list of dictionaries
data = pd.DataFrame(data_list)
data


Unnamed: 0,opcodes,label
0,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",reentrancy
1,"[PUSH1, PUSH1, MSTORE, PUSH1, DUP1, SLOAD, PUS...",reentrancy
2,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",reentrancy
3,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",reentrancy
4,"[PUSH1, PUSH1, MSTORE, CALLVALUE, DUP1, ISZERO...",reentrancy
...,...,...
42272,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",integeroverflow
42273,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",integeroverflow
42274,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",integeroverflow
42275,"[PUSH1, PUSH1, MSTORE, PUSH1, DUP1, SLOAD, PUS...",integeroverflow


# EDA

## Clean data

In [10]:
# Plotting dependencies used by the EDA cells
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter

# 1. Distribution of labels: one bar per label, bar height = number of rows
label_counts = data['label'].value_counts()
fig = px.bar(
    x=label_counts.index,
    y=label_counts.values,
    title='Distribution of Vulnerability Types',
    labels=dict(x='Vulnerability Type', y='Count'),
)
fig.show()


In [11]:
# Create a balanced dataset by drawing the same number of rows for every label.
# DataFrame.sample draws without replacement by default, so this raises
# ValueError if any label has fewer than SAMPLES_PER_LABEL rows.
SAMPLES_PER_LABEL = 600
RANDOM_STATE = 42

balanced_data = pd.concat([
    data[data['label'] == label].sample(n=SAMPLES_PER_LABEL, random_state=RANDOM_STATE)
    for label in data['label'].unique()
])

# Shuffle the balanced dataset so the labels are interleaved
data = balanced_data.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# Sanity check: every label now has exactly SAMPLES_PER_LABEL rows
assert (data['label'].value_counts() == SAMPLES_PER_LABEL).all()

label_counts = data['label'].value_counts()
fig = px.bar(x=label_counts.index, y=label_counts.values,
             title='Distribution of Vulnerability Types',
             labels={'x': 'Vulnerability Type', 'y': 'Count'})
fig.show()


In [12]:
# Clean opcodes
def clean_opcodes(opcode_list, numbered_families=("PUSH", "DUP", "SWAP", "LOG")):
    """Drop placeholder opcodes and collapse numbered opcode families.

    Entries starting with ``UNKNOWN_`` or ``INVALID_`` are removed. For the
    remaining entries, a mnemonic made of one of ``numbered_families``
    followed only by digits is replaced by the bare family name
    (e.g. ``PUSH1`` and ``PUSH32`` both become ``PUSH``).

    Only the listed families are collapsed. Stripping trailing digits from
    every mnemonic would also rewrite names whose digits are part of the
    name itself, e.g. ``SHA3`` -> ``SHA``, and would merge ``CREATE2`` into
    ``CREATE``.

    Args:
        opcode_list: Iterable of opcode mnemonics (strings).
        numbered_families: Mnemonic prefixes whose numeric suffix is dropped.

    Returns:
        A new list of cleaned mnemonics in their original order.
    """
    families = tuple(numbered_families)

    def collapse(op):
        # Return the family name when `op` is exactly "<family><digits>".
        for family in families:
            suffix = op[len(family):]
            if op.startswith(family) and suffix.isascii() and suffix.isdigit():
                return family
        return op

    return [
        collapse(op)
        for op in opcode_list
        # Remove UNKNOWN and INVALID placeholder opcodes
        if not op.startswith(('UNKNOWN_', 'INVALID_'))
    ]


In [13]:
# Clean each opcode list and join it into one space-separated string.
# Rows that are already strings are left untouched, so re-running this cell
# is a no-op; otherwise clean_opcodes would iterate over the joined string
# character by character and corrupt the column.
data['opcodes'] = data['opcodes'].apply(
    lambda ops: ops if isinstance(ops, str) else ' '.join(clean_opcodes(ops))
)
data

Unnamed: 0,opcodes,label
0,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI J...,integeroverflow
1,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,gaslimit
2,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,gaslimit
3,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,reentrancy
4,PUSH PUSH MSTORE CALLVALUE DUP ISZERO PUSH JUM...,integeroverflow
...,...,...
1795,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,gaslimit
1796,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,integeroverflow
1797,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,gaslimit
1798,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,integeroverflow


## Feature Extraction