# Load lib

In [1]:
!uv     pip install pandas torch scikit-learn-intelex matplotlib plotly torchviz 
!uv pip install hiddenlayer networkx graphviz


[2mUsing Python 3.9.20 environment at: C:\Users\Zh4g3Z\Desktop\MLCrypto\.venv[0m
[2mAudited [1m6 packages[0m [2min 9ms[0m[0m
[2mUsing Python 3.9.20 environment at: C:\Users\Zh4g3Z\Desktop\MLCrypto\.venv[0m
[2mAudited [1m3 packages[0m [2min 5ms[0m[0m


In [29]:
import pandas as pd
import json
import os
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.metrics import classification_report
import joblib
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import plotly.graph_objects as go
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# Load Data

In [3]:
DATAPATH = [
    "./filtered_results/reentrancy/",
    "./filtered_results/gaslimit/", 
    "./filtered_results/integeroverflow/",
]

# Load and process clean data first
cleandata = pd.read_csv("../CreateCleanDataset/CleanOpcodes.csv")


# Convert clean data - remove extra spaces and brackets
cleandata['opcodes'] = cleandata['opcodes'].apply(lambda x: ' '.join(eval(x)).replace("' '", " ").split())

# Load vulnerability data
data_list = []
for path in DATAPATH:
    for file in os.listdir(path):
        if file.endswith(".json"):
            with open(os.path.join(path, file), 'r') as f:
                data = json.load(f)
                data_list.append({
                    "opcodes": data["opcodes"],
                    "label": path.split("/")[-2]
                })

vuln_data = pd.DataFrame(data_list)

# Concatenate after format verification
final_data = pd.concat([vuln_data, cleandata], axis=0, ignore_index=True)
data = final_data.copy()
data

Unnamed: 0,opcodes,label
0,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",reentrancy
1,"[PUSH1, PUSH1, MSTORE, CALLVALUE, DUP1, ISZERO...",reentrancy
2,"[PUSH1, PUSH1, MSTORE, CALLVALUE, DUP1, ISZERO...",reentrancy
3,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",reentrancy
4,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",reentrancy
...,...,...
42968,"[PUSH1, PUSH1, MSTORE, PUSH1, CALLDATASIZE, LT...",clean
42969,"[PUSH1, PUSH1, MSTORE, PUSH1, CALLDATASIZE, LT...",clean
42970,"[PUSH1, PUSH1, MSTORE, PUSH1, CALLDATASIZE, LT...",clean
42971,"[PUSH20, ADDRESS, EQ, PUSH1, PUSH1, MSTORE, PU...",clean


# EDA

## Clean data

In [4]:
# Import plotly
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter

# 1. Distribution of labels
label_counts = data['label'].value_counts()
fig = px.bar(x=label_counts.index, y=label_counts.values,
             title='Distribution of Vulnerability Types',
             labels={'x': 'Vulnerability Type', 'y': 'Count'})
fig.show()


In [5]:
# Create balanced dataset with 199 samples per label
balanced_data = pd.concat([
    data[data['label'] == label].sample(n=600, random_state=42) 
    for label in data['label'].unique()
])

# Shuffle the balanced dataset
data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

label_counts = data['label'].value_counts()
fig = px.bar(x=label_counts.index, y=label_counts.values,
             title='Distribution of Vulnerability Types',
             labels={'x': 'Vulnerability Type', 'y': 'Count'})
fig.show()


In [6]:
# Clean opcodes
def clean_opcodes(opcode_list):
    # Remove UNKNOWN and INVALID opcodes
    cleaned = [op for op in opcode_list if not (op.startswith('UNKNOWN_') or op.startswith('INVALID_'))]
    
    # Remove numeric values after opcodes (e.g., PUSH1, PUSH2 -> PUSH)
    cleaned = [op.rstrip('0123456789') for op in cleaned]
    
    return cleaned


In [7]:
data['opcodes'] = data['opcodes'].apply(clean_opcodes)
data['opcodes'] = data['opcodes'].apply(lambda x: ' '.join(x))
data

Unnamed: 0,opcodes,label
0,PUSH PUSH MSTORE CALLDATASIZE ISZERO PUSH JUMP...,clean
1,PUSH PUSH MSTORE CALLVALUE DUP ISZERO PUSH JUM...,clean
2,PUSH PUSH MSTORE PUSH DUP SLOAD PUSH PUSH PUSH...,gaslimit
3,PUSH PUSH MSTORE PUSH PUSH SSTORE CALLVALUE PU...,integeroverflow
4,PUSH PUSH MSTORE CALLVALUE DUP ISZERO PUSH JUM...,clean
...,...,...
2395,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI J...,integeroverflow
2396,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,gaslimit
2397,PUSH PUSH MSTORE PUSH PUSH DUP SWAP MSTORE PUS...,gaslimit
2398,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI J...,integeroverflow


In [8]:
data['opcodes'][1]

'PUSH PUSH MSTORE CALLVALUE DUP ISZERO PUSH JUMPI PUSH DUP REVERT JUMPDEST POP PUSH CALLDATASIZE LT PUSH JUMPI PUSH CALLDATALOAD PUSH SHR DUP PUSH GT PUSH JUMPI DUP PUSH GT PUSH JUMPI DUP PUSH EQ PUSH JUMPI DUP PUSH EQ PUSH JUMPI DUP PUSH EQ PUSH JUMPI PUSH JUMP JUMPDEST DUP PUSH EQ PUSH JUMPI DUP PUSH EQ PUSH JUMPI DUP PUSH EQ PUSH JUMPI PUSH JUMP JUMPDEST DUP PUSH GT PUSH JUMPI DUP PUSH EQ PUSH JUMPI DUP PUSH EQ PUSH JUMPI DUP PUSH EQ PUSH JUMPI PUSH JUMP JUMPDEST DUP PUSH EQ PUSH JUMPI DUP PUSH EQ PUSH JUMPI JUMPDEST PUSH DUP REVERT JUMPDEST PUSH PUSH JUMP JUMPDEST PUSH MLOAD PUSH SWAP SWAP PUSH JUMP JUMPDEST PUSH MLOAD DUP SWAP SUB SWAP RETURN JUMPDEST PUSH PUSH CALLDATASIZE PUSH PUSH JUMP JUMPDEST PUSH JUMP JUMPDEST PUSH MLOAD PUSH SWAP SWAP PUSH JUMP JUMPDEST PUSH PUSH JUMP JUMPDEST PUSH MLOAD PUSH SWAP SWAP PUSH JUMP JUMPDEST PUSH PUSH CALLDATASIZE PUSH PUSH JUMP JUMPDEST PUSH JUMP JUMPDEST PUSH PUSH JUMP JUMPDEST PUSH MLOAD PUSH SWAP SWAP PUSH JUMP JUMPDEST PUSH PUSH CALLDATASI

## Feature Extraction

In [9]:
from collections import defaultdict

def extract_reentrancy_features(opcodes):
    # Tokenize the opcode sequence
    tokens = opcodes.split()

    # Initialize block features
    nodes = set(tokens)
    edges = defaultdict(list)
    in_degree = defaultdict(int)
    out_degree = defaultdict(int)

    # Initialize attribute features
    features = {
        'external_call_presence': False,
        'state_change_after_call': False,
        'recursive_call_potential': False,
        'gas_forwarding': False,
        'arithmetic_count': 0,
        'logical_count': 0,
        'comparison_count': 0,
        'storage_count': 0,
        'control_flow_count': 0
    }

    # Opcode categories
    external_calls = {'CALL', 'CALLCODE', 'DELEGATECALL', 'STATICCALL'}
    state_changes = {'SSTORE', 'MSTORE'}
    gas_management = {'GAS', 'CALLGAS'}
    arithmetic_ops = {'ADD', 'SUB', 'MUL', 'DIV', 'EXP'}
    logical_ops = {'AND', 'OR', 'XOR', 'NOT'}
    comparison_ops = {'EQ', 'LT', 'GT', 'SLT', 'SGT'}
    control_flow = {'JUMP', 'JUMPI', 'JUMPDEST'}

    # Process opcodes to extract features
    for i, op in enumerate(tokens):
        # External call presence
        if op in external_calls:
            features['external_call_presence'] = True

        # State change after external call
        if op in state_changes:
            if any(tokens[j] in external_calls for j in range(i)):
                features['state_change_after_call'] = True

        # Recursive call potential
        if op in control_flow:
            features['recursive_call_potential'] = True

        # Gas forwarding
        if op in gas_management:
            features['gas_forwarding'] = True

        # Count opcode categories
        if op in arithmetic_ops:
            features['arithmetic_count'] += 1
        elif op in logical_ops:
            features['logical_count'] += 1
        elif op in comparison_ops:
            features['comparison_count'] += 1
        elif op in state_changes:
            features['storage_count'] += 1
        elif op in control_flow:
            features['control_flow_count'] += 1

        # Build edges for block features
        if i < len(tokens) - 1:
            edges[op].append(tokens[i + 1])
            out_degree[op] += 1
            in_degree[tokens[i + 1]] += 1

    # Maximum in/out degree
    max_in_degree = max(in_degree.values()) if in_degree else 0
    max_out_degree = max(out_degree.values()) if out_degree else 0

    block_features = {
        'nodes': list(nodes),
        'edges': dict(edges),
        'max_in_degree': max_in_degree,
        'max_out_degree': max_out_degree,
    }

    return block_features, features


In [10]:
def extract_gas_limit_features(opcodes):
    # Tokenize the opcode sequence
    tokens = opcodes.split()

    # Initialize block features
    nodes = set(tokens)
    edges = defaultdict(list)
    in_degree = defaultdict(int)
    out_degree = defaultdict(int)

    # Initialize attribute features
    features = {
        'gas_related_opcodes': 0,
        'expensive_operations_count': 0,
        'state_change_count': 0,
        'loop_count': 0,
        'arithmetic_count': 0,
        'logical_count': 0,
        'comparison_count': 0,
        'control_flow_count': 0
    }

    # Opcode categories
    gas_related = {'GAS', 'CALLGAS'}
    state_changes = {'SSTORE', 'MSTORE'}
    expensive_ops = {'EXP', 'SHA3', 'LOG0', 'LOG1', 'LOG2', 'LOG3', 'LOG4'}
    arithmetic_ops = {'ADD', 'SUB', 'MUL', 'DIV', 'EXP'}
    logical_ops = {'AND', 'OR', 'XOR', 'NOT'}
    comparison_ops = {'EQ', 'LT', 'GT', 'SLT', 'SGT'}
    control_flow = {'JUMP', 'JUMPI', 'JUMPDEST'}

    # Process opcodes to extract features
    for i, op in enumerate(tokens):
        # Count gas-related opcodes
        if op in gas_related:
            features['gas_related_opcodes'] += 1

        # Count expensive operations
        if op in expensive_ops:
            features['expensive_operations_count'] += 1

        # Count state-changing opcodes
        if op in state_changes:
            features['state_change_count'] += 1

        # Count loops (control flow opcodes)
        if op in control_flow:
            features['loop_count'] += 1

        # Count opcode categories
        if op in arithmetic_ops:
            features['arithmetic_count'] += 1
        elif op in logical_ops:
            features['logical_count'] += 1
        elif op in comparison_ops:
            features['comparison_count'] += 1

        # Build edges for block features
        if i < len(tokens) - 1:
            edges[op].append(tokens[i + 1])
            out_degree[op] += 1
            in_degree[tokens[i + 1]] += 1

    # Maximum in/out degree
    max_in_degree = max(in_degree.values()) if in_degree else 0
    max_out_degree = max(out_degree.values()) if out_degree else 0

    block_features = {
        'nodes': list(nodes),
        'edges': dict(edges),
        'max_in_degree': max_in_degree,
        'max_out_degree': max_out_degree,
    }

    return block_features, features

In [11]:
def extract_integer_overflow_features(opcodes):
    # Tokenize the opcode sequence
    tokens = opcodes.split()

    # Initialize block features
    nodes = set(tokens)
    edges = defaultdict(list)
    in_degree = defaultdict(int)
    out_degree = defaultdict(int)

    # Initialize attribute features
    features = {
        'node_count': len(nodes),
        'edge_count': 0,
        'max_out_degree': 0,
        'max_in_degree': 0,
        'unary_arithmetic_ratio': 0.0,
        'binary_arithmetic_ratio': 0.0,
        'block_ratio': 0.0,
        'control_flow_ratio': 0.0,
        'environment_ratio': 0.0,
        'system_ratio': 0.0,
        'stack_ratio': 0.0,
        'invalid_ratio': 0.0
    }

    # Opcode categories
    unary_arithmetic_ops = {'ISZERO', 'NOT'}
    binary_arithmetic_ops = {'ADD', 'SUB', 'MUL', 'DIV', 'AND', 'SHA3'}
    block_related_ops = {'NUMBER', 'BLOCKHASH', 'COINBASE', 'TIMESTAMP'}
    control_flow_ops = {'JUMP', 'JUMPI', 'JUMPDEST'}
    environment_ops = {'CALLER', 'CALLDATASIZE', 'ORIGIN'}
    system_ops = {'CALL', 'RETURN', 'REVERT', 'SELFDESTRUCT'}
    stack_ops = {'PUSH', 'POP', 'SWAP'}
    invalid_ops = {'INVALID'}

    # Counters for each category
    opcode_counts = {
        'unary_arithmetic': 0,
        'binary_arithmetic': 0,
        'block_related': 0,
        'control_flow': 0,
        'environment': 0,
        'system': 0,
        'stack': 0,
        'invalid': 0
    }

    # Count opcodes by category
    for i, op in enumerate(tokens):
        if op in unary_arithmetic_ops:
            opcode_counts['unary_arithmetic'] += 1
        elif op in binary_arithmetic_ops:
            opcode_counts['binary_arithmetic'] += 1
        elif op in block_related_ops:
            opcode_counts['block_related'] += 1
        elif op in control_flow_ops:
            opcode_counts['control_flow'] += 1
        elif op in environment_ops:
            opcode_counts['environment'] += 1
        elif op in system_ops:
            opcode_counts['system'] += 1
        elif op in stack_ops:
            opcode_counts['stack'] += 1
        elif op in invalid_ops:
            opcode_counts['invalid'] += 1

        # Build edges for block features
        if i < len(tokens) - 1:
            edges[op].append(tokens[i + 1])
            out_degree[op] += 1
            in_degree[tokens[i + 1]] += 1

    # Calculate block features
    features['edge_count'] = sum(len(v) for v in edges.values())
    features['max_out_degree'] = max(out_degree.values()) if out_degree else 0
    features['max_in_degree'] = max(in_degree.values()) if in_degree else 0

    # Calculate ratios for attribute features
    total_opcodes = len(tokens)
    if total_opcodes > 0:
        features['unary_arithmetic_ratio'] = opcode_counts['unary_arithmetic'] / total_opcodes
        features['binary_arithmetic_ratio'] = opcode_counts['binary_arithmetic'] / total_opcodes
        features['block_ratio'] = opcode_counts['block_related'] / total_opcodes
        features['control_flow_ratio'] = opcode_counts['control_flow'] / total_opcodes
        features['environment_ratio'] = opcode_counts['environment'] / total_opcodes
        features['system_ratio'] = opcode_counts['system'] / total_opcodes
        features['stack_ratio'] = opcode_counts['stack'] / total_opcodes
        features['invalid_ratio'] = opcode_counts['invalid'] / total_opcodes

    return {
        'block_features': {
            'nodes': list(nodes),
            'edges': dict(edges),
            'max_in_degree': features['max_in_degree'],
            'max_out_degree': features['max_out_degree']
        },
        'attribute_features': {
            'unary_arithmetic_ratio': features['unary_arithmetic_ratio'],
            'binary_arithmetic_ratio': features['binary_arithmetic_ratio'],
            'block_ratio': features['block_ratio'],
            'control_flow_ratio': features['control_flow_ratio'],
            'environment_ratio': features['environment_ratio'],
            'system_ratio': features['system_ratio'],
            'stack_ratio': features['stack_ratio'],
            'invalid_ratio': features['invalid_ratio']
        }
    }

In [12]:
# Extract features based on label
def extract_features_by_label(row):
    # Initialize default values
    block_features = None
    attribute_features = None
    
    try:
        if row['label'] == 'reentrancy':
            block_features, attribute_features = extract_reentrancy_features(row['opcodes'])
        elif row['label'] == 'gaslimit':
            block_features, attribute_features = extract_gas_limit_features(row['opcodes'])
        elif row['label'] == 'integeroverflow':
            features = extract_integer_overflow_features(row['opcodes'])
            block_features = features['block_features']
            attribute_features = features['attribute_features']
        elif row['label'] == 'clean':
            # Handle clean samples with default features
            block_features = {
                'nodes': [],
                'edges': {},
                'max_in_degree': 0,
                'max_out_degree': 0
            }
            attribute_features = {
                'external_call_presence': False,
                'state_change_after_call': False,
                'recursive_call_potential': False,
                'gas_forwarding': False,
                'arithmetic_count': 0,
                'logical_count': 0,
                'comparison_count': 0,
                'storage_count': 0,
                'control_flow_count': 0
            }
        else:
            raise ValueError(f"Unknown label: {row['label']}")
            
        return pd.Series({
            'block_features': block_features,
            'attribute_features': attribute_features
        })
        
    except Exception as e:
        print(f"Error processing row with label {row['label']}: {str(e)}")
        # Return default features in case of error
        return pd.Series({
            'block_features': {
                'nodes': [],
                'edges': {},
                'max_in_degree': 0,
                'max_out_degree': 0
            },
            'attribute_features': {
                'external_call_presence': False,
                'state_change_after_call': False,
                'recursive_call_potential': False,
                'gas_forwarding': False,
                'arithmetic_count': 0,
                'logical_count': 0,
                'comparison_count': 0,
                'storage_count': 0,
                'control_flow_count': 0
            }
        })

# Apply the extraction function to each row
features_df = data.apply(extract_features_by_label, axis=1)

# Split the features into separate columns
data['block_features'] = features_df['block_features']
data['attribute_features'] = features_df['attribute_features']

# Display the first few rows to verify
data.head() 

Unnamed: 0,opcodes,label,block_features,attribute_features
0,PUSH PUSH MSTORE CALLDATASIZE ISZERO PUSH JUMP...,clean,"{'nodes': [], 'edges': {}, 'max_in_degree': 0,...","{'external_call_presence': False, 'state_chang..."
1,PUSH PUSH MSTORE CALLVALUE DUP ISZERO PUSH JUM...,clean,"{'nodes': [], 'edges': {}, 'max_in_degree': 0,...","{'external_call_presence': False, 'state_chang..."
2,PUSH PUSH MSTORE PUSH DUP SLOAD PUSH PUSH PUSH...,gaslimit,"{'nodes': ['SLOAD', 'SSTORE', 'ADD', 'SHA', 'L...","{'gas_related_opcodes': 0, 'expensive_operatio..."
3,PUSH PUSH MSTORE PUSH PUSH SSTORE CALLVALUE PU...,integeroverflow,"{'nodes': ['SSTORE', 'SLOAD', 'ADD', 'SHA', 'L...","{'unary_arithmetic_ratio': 0.0, 'binary_arithm..."
4,PUSH PUSH MSTORE CALLVALUE DUP ISZERO PUSH JUM...,clean,"{'nodes': [], 'edges': {}, 'max_in_degree': 0,...","{'external_call_presence': False, 'state_chang..."


In [13]:
data

Unnamed: 0,opcodes,label,block_features,attribute_features
0,PUSH PUSH MSTORE CALLDATASIZE ISZERO PUSH JUMP...,clean,"{'nodes': [], 'edges': {}, 'max_in_degree': 0,...","{'external_call_presence': False, 'state_chang..."
1,PUSH PUSH MSTORE CALLVALUE DUP ISZERO PUSH JUM...,clean,"{'nodes': [], 'edges': {}, 'max_in_degree': 0,...","{'external_call_presence': False, 'state_chang..."
2,PUSH PUSH MSTORE PUSH DUP SLOAD PUSH PUSH PUSH...,gaslimit,"{'nodes': ['SLOAD', 'SSTORE', 'ADD', 'SHA', 'L...","{'gas_related_opcodes': 0, 'expensive_operatio..."
3,PUSH PUSH MSTORE PUSH PUSH SSTORE CALLVALUE PU...,integeroverflow,"{'nodes': ['SSTORE', 'SLOAD', 'ADD', 'SHA', 'L...","{'unary_arithmetic_ratio': 0.0, 'binary_arithm..."
4,PUSH PUSH MSTORE CALLVALUE DUP ISZERO PUSH JUM...,clean,"{'nodes': [], 'edges': {}, 'max_in_degree': 0,...","{'external_call_presence': False, 'state_chang..."
...,...,...,...,...
2395,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI J...,integeroverflow,"{'nodes': ['SSTORE', 'SLOAD', 'ADD', 'SHA', 'L...","{'unary_arithmetic_ratio': 0.0273972602739726,..."
2396,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,gaslimit,"{'nodes': ['SSTORE', 'SLOAD', 'ADD', 'SHA', 'L...","{'gas_related_opcodes': 0, 'expensive_operatio..."
2397,PUSH PUSH MSTORE PUSH PUSH DUP SWAP MSTORE PUS...,gaslimit,"{'nodes': ['SSTORE', 'SLOAD', 'ADD', 'SHA', 'L...","{'gas_related_opcodes': 0, 'expensive_operatio..."
2398,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI J...,integeroverflow,"{'nodes': ['SHA', 'LOG', 'JUMPDEST', 'CREATE',...",{'unary_arithmetic_ratio': 0.08450704225352113...


In [14]:
def create_combined_features(row):
    """
    Combine block_features and attribute_features into a single standardized feature vector.
    """
    features = {}
    
    # Extract block features
    block_feat = row['block_features']
    features.update({
        'num_nodes': len(block_feat['nodes']),
        'num_edges': sum(len(edges) for edges in block_feat['edges'].values()),
        'max_in_degree': block_feat['max_in_degree'],
        'max_out_degree': block_feat['max_out_degree'],
    })
    
    # Extract attribute features
    attr_feat = row['attribute_features']
    
    # Handle different attribute features based on vulnerability type
    if row['label'] == 'reentrancy':
        features.update({
            'external_call': int(attr_feat['external_call_presence']),
            'state_change_after_call': int(attr_feat['state_change_after_call']),
            'recursive_call': int(attr_feat['recursive_call_potential']),
            'gas_forwarding': int(attr_feat['gas_forwarding']),
            'arithmetic_ops': attr_feat['arithmetic_count'],
            'logical_ops': attr_feat['logical_count'],
            'comparison_ops': attr_feat['comparison_count'],
            'storage_ops': attr_feat['storage_count'],
            'control_flow_ops': attr_feat['control_flow_count']
        })
    elif row['label'] == 'gaslimit':
        features.update({
            'gas_related_ops': attr_feat['gas_related_opcodes'],
            'expensive_ops': attr_feat['expensive_operations_count'],
            'state_changes': attr_feat['state_change_count'],
            'loop_count': attr_feat['loop_count'],
            'arithmetic_ops': attr_feat['arithmetic_count'],
            'logical_ops': attr_feat['logical_count'],
            'comparison_ops': attr_feat['comparison_count'],
            'control_flow_ops': attr_feat['control_flow_count']
        })
    elif row['label'] == 'integeroverflow':
        features.update({
            'unary_arithmetic_ratio': attr_feat['unary_arithmetic_ratio'],
            'binary_arithmetic_ratio': attr_feat['binary_arithmetic_ratio'],
            'block_ratio': attr_feat['block_ratio'],
            'control_flow_ratio': attr_feat['control_flow_ratio'],
            'environment_ratio': attr_feat['environment_ratio'],
            'system_ratio': attr_feat['system_ratio'],
            'stack_ratio': attr_feat['stack_ratio'],
            'invalid_ratio': attr_feat['invalid_ratio']
        })
    
    return features

# Create combined features
data['combined_features'] = data.apply(create_combined_features, axis=1)

data

Unnamed: 0,opcodes,label,block_features,attribute_features,combined_features
0,PUSH PUSH MSTORE CALLDATASIZE ISZERO PUSH JUMP...,clean,"{'nodes': [], 'edges': {}, 'max_in_degree': 0,...","{'external_call_presence': False, 'state_chang...","{'num_nodes': 0, 'num_edges': 0, 'max_in_degre..."
1,PUSH PUSH MSTORE CALLVALUE DUP ISZERO PUSH JUM...,clean,"{'nodes': [], 'edges': {}, 'max_in_degree': 0,...","{'external_call_presence': False, 'state_chang...","{'num_nodes': 0, 'num_edges': 0, 'max_in_degre..."
2,PUSH PUSH MSTORE PUSH DUP SLOAD PUSH PUSH PUSH...,gaslimit,"{'nodes': ['SLOAD', 'SSTORE', 'ADD', 'SHA', 'L...","{'gas_related_opcodes': 0, 'expensive_operatio...","{'num_nodes': 35, 'num_edges': 382, 'max_in_de..."
3,PUSH PUSH MSTORE PUSH PUSH SSTORE CALLVALUE PU...,integeroverflow,"{'nodes': ['SSTORE', 'SLOAD', 'ADD', 'SHA', 'L...","{'unary_arithmetic_ratio': 0.0, 'binary_arithm...","{'num_nodes': 27, 'num_edges': 304, 'max_in_de..."
4,PUSH PUSH MSTORE CALLVALUE DUP ISZERO PUSH JUM...,clean,"{'nodes': [], 'edges': {}, 'max_in_degree': 0,...","{'external_call_presence': False, 'state_chang...","{'num_nodes': 0, 'num_edges': 0, 'max_in_degre..."
...,...,...,...,...,...
2395,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI J...,integeroverflow,"{'nodes': ['SSTORE', 'SLOAD', 'ADD', 'SHA', 'L...","{'unary_arithmetic_ratio': 0.0273972602739726,...","{'num_nodes': 23, 'num_edges': 72, 'max_in_deg..."
2396,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,gaslimit,"{'nodes': ['SSTORE', 'SLOAD', 'ADD', 'SHA', 'L...","{'gas_related_opcodes': 0, 'expensive_operatio...","{'num_nodes': 37, 'num_edges': 1357, 'max_in_d..."
2397,PUSH PUSH MSTORE PUSH PUSH DUP SWAP MSTORE PUS...,gaslimit,"{'nodes': ['SSTORE', 'SLOAD', 'ADD', 'SHA', 'L...","{'gas_related_opcodes': 0, 'expensive_operatio...","{'num_nodes': 40, 'num_edges': 1928, 'max_in_d..."
2398,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI J...,integeroverflow,"{'nodes': ['SHA', 'LOG', 'JUMPDEST', 'CREATE',...",{'unary_arithmetic_ratio': 0.08450704225352113...,"{'num_nodes': 23, 'num_edges': 70, 'max_in_deg..."


# Data Preprocessing

In [15]:
newData = data.drop(columns=['opcodes', 'block_features', 'attribute_features'])
newData

Unnamed: 0,label,combined_features
0,clean,"{'num_nodes': 0, 'num_edges': 0, 'max_in_degre..."
1,clean,"{'num_nodes': 0, 'num_edges': 0, 'max_in_degre..."
2,gaslimit,"{'num_nodes': 35, 'num_edges': 382, 'max_in_de..."
3,integeroverflow,"{'num_nodes': 27, 'num_edges': 304, 'max_in_de..."
4,clean,"{'num_nodes': 0, 'num_edges': 0, 'max_in_degre..."
...,...,...
2395,integeroverflow,"{'num_nodes': 23, 'num_edges': 72, 'max_in_deg..."
2396,gaslimit,"{'num_nodes': 37, 'num_edges': 1357, 'max_in_d..."
2397,gaslimit,"{'num_nodes': 40, 'num_edges': 1928, 'max_in_d..."
2398,integeroverflow,"{'num_nodes': 23, 'num_edges': 70, 'max_in_deg..."


In [16]:
from sklearn.preprocessing import LabelEncoder
labelProcess = LabelEncoder()
newData['label'] = labelProcess.fit_transform(newData['label'])
newData


Unnamed: 0,label,combined_features
0,0,"{'num_nodes': 0, 'num_edges': 0, 'max_in_degre..."
1,0,"{'num_nodes': 0, 'num_edges': 0, 'max_in_degre..."
2,1,"{'num_nodes': 35, 'num_edges': 382, 'max_in_de..."
3,2,"{'num_nodes': 27, 'num_edges': 304, 'max_in_de..."
4,0,"{'num_nodes': 0, 'num_edges': 0, 'max_in_degre..."
...,...,...
2395,2,"{'num_nodes': 23, 'num_edges': 72, 'max_in_deg..."
2396,1,"{'num_nodes': 37, 'num_edges': 1357, 'max_in_d..."
2397,1,"{'num_nodes': 40, 'num_edges': 1928, 'max_in_d..."
2398,2,"{'num_nodes': 23, 'num_edges': 70, 'max_in_deg..."


In [17]:
newData["combined_features"][0]

{'num_nodes': 0, 'num_edges': 0, 'max_in_degree': 0, 'max_out_degree': 0}

In [18]:
data = newData['combined_features']
data

0       {'num_nodes': 0, 'num_edges': 0, 'max_in_degre...
1       {'num_nodes': 0, 'num_edges': 0, 'max_in_degre...
2       {'num_nodes': 35, 'num_edges': 382, 'max_in_de...
3       {'num_nodes': 27, 'num_edges': 304, 'max_in_de...
4       {'num_nodes': 0, 'num_edges': 0, 'max_in_degre...
                              ...                        
2395    {'num_nodes': 23, 'num_edges': 72, 'max_in_deg...
2396    {'num_nodes': 37, 'num_edges': 1357, 'max_in_d...
2397    {'num_nodes': 40, 'num_edges': 1928, 'max_in_d...
2398    {'num_nodes': 23, 'num_edges': 70, 'max_in_deg...
2399    {'num_nodes': 36, 'num_edges': 1519, 'max_in_d...
Name: combined_features, Length: 2400, dtype: object

In [19]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Define expected keys for count and ratio features
count_keys = ['num_nodes', 'num_edges', 'max_in_degree', 'max_out_degree']
ratio_keys = [
    'unary_arithmetic_ratio', 'binary_arithmetic_ratio', 'block_ratio',
    'control_flow_ratio', 'environment_ratio', 'system_ratio', 'stack_ratio', 'invalid_ratio'
]

# Initialize lists to store feature values
count_values = []
ratio_values = []

# Iterate over each feature dictionary
for features in data:
    # Extract count features, using 0 as default for missing keys
    count_values.append([features.get(key, 0) for key in count_keys])
    
    # Extract ratio features, using 0 as default for missing keys
    ratio_values.append([features.get(key, 0) for key in ratio_keys])

# Convert lists to NumPy arrays
count_values = np.array(count_values)
ratio_values = np.array(ratio_values)

# Step 2: Normalize count-based features
scaler = MinMaxScaler()
normalized_counts = scaler.fit_transform(count_values)



In [20]:
# Step 4: Combine all features into a single vector
vectorized_data = np.concatenate([normalized_counts, ratio_values], axis=1)

# Print the resulting vector
print("Vectorized Data:", vectorized_data)

Vectorized Data: [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.71428571 0.02681454 0.03349282 ... 0.         0.         0.        ]
 ...
 [0.81632653 0.13533623 0.1225412  ... 0.         0.         0.        ]
 [0.46938776 0.00491366 0.00637959 ... 0.02816901 0.38028169 0.        ]
 [0.73469388 0.10662642 0.0877193  ... 0.         0.         0.        ]]


In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(vectorized_data, newData['label'], test_size=0.2, random_state=42)


In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Define the parameter grid
param_grid = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'svc__kernel': ['rbf', 'linear']
}

# Create a pipeline
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Step to normalize features
    ('svc', SVC(probability=True))  # SVC model
])

# Set up the GridSearchCV
main_pipeline = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
main_pipeline


In [23]:
main_pipeline.fit(X_train, y_train)
print("Best Parameters:", main_pipeline.best_params_)
print("Best Score:", main_pipeline.best_score_)

y_pred = main_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

Best Parameters: {'svc__C': 100, 'svc__gamma': 'scale', 'svc__kernel': 'rbf'}
Best Score: 0.9364583333333334
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       121
           1       0.94      0.63      0.76       117
           2       1.00      1.00      1.00       115
           3       0.74      0.96      0.84       127

    accuracy                           0.90       480
   macro avg       0.92      0.90      0.90       480
weighted avg       0.92      0.90      0.90       480



In [24]:
joblib.dump(pipeline, 'svc_model.pkl')


['svc_model.pkl']

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Define the parameter grid for GradientBoostingClassifier
param_grid = {
    'gbc__n_estimators': [50, 100, 150],
    'gbc__learning_rate': [0.01, 0.1, 0.2],
    'gbc__max_depth': [3, 4, 5],
    'gbc__subsample': [0.8, 1.0]
}

# Create a pipeline
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Step to normalize features
    ('gbc', GradientBoostingClassifier(random_state=42))  # Gradient Boosting Classifier
])

# Set up the GridSearchCV
main_pipeline = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
main_pipeline


In [26]:
# Fit the model
main_pipeline.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best Parameters:", main_pipeline.best_params_)
print("Best Score:", main_pipeline.best_score_)

# Predict and evaluate
y_pred = main_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

Best Parameters: {'gbc__learning_rate': 0.1, 'gbc__max_depth': 3, 'gbc__n_estimators': 50, 'gbc__subsample': 0.8}
Best Score: 0.9546875
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       121
           1       0.96      0.83      0.89       117
           2       1.00      1.00      1.00       115
           3       0.86      0.97      0.91       127

    accuracy                           0.95       480
   macro avg       0.96      0.95      0.95       480
weighted avg       0.95      0.95      0.95       480



In [27]:
# def extract_features_from_dict(data_series):
#     # Define all possible features we expect
#     feature_keys = [
#         'num_nodes', 'num_edges', 'max_in_degree', 'max_out_degree',
#         'unary_arithmetic_ratio', 'binary_arithmetic_ratio', 'block_ratio',
#         'control_flow_ratio', 'environment_ratio', 'system_ratio', 
#         'stack_ratio', 'invalid_ratio'
#     ]
    
#     # Initialize an empty array
#     features = np.zeros((len(data_series), len(feature_keys)))
    
#     # Fill the array with values from dictionaries
#     for i, dict_data in enumerate(data_series):
#         for j, key in enumerate(feature_keys):
#             features[i, j] = dict_data.get(key, 0)  # Use 0 as default if key doesn't exist
            
#     return features
# data = extract_features_from_dict(data)
# data

In [55]:
X_train_tensor = torch.FloatTensor(X_train)
X_test_tensor = torch.FloatTensor(X_test)
y_train_tensor = torch.LongTensor(y_train.values)
y_test_tensor = torch.LongTensor(y_test.values)

# Create datasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create data loaders with batch size of 32
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, drop_last=True)

# Initialize model
class BiLSTMClassifier(nn.Module):
    def __init__(self, input_size=12, hidden_size=128, num_layers=3, output_size=4, dropout_rate=0.3):
        super().__init__()
        
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout_rate
        )
        
        self.fc_layers = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.BatchNorm1d(hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size // 2, output_size)
        )
        
    def forward(self, x):
        # LSTM layers
        lstm_out, _ = self.lstm(x)
        
        # Get final output
        final_out = lstm_out[:, -1, :]
        
        # FC layers
        output = self.fc_layers(final_out)
        return output

# Model parameters
input_size = 12  # From your data dimension
hidden_size = 128
num_layers = 3
output_size = 4  # Number of classes
dropout_rate = 0.3

# Initialize model
model = BiLSTMClassifier(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=output_size,
    dropout_rate=dropout_rate
)
model

BiLSTMClassifier(
  (lstm): LSTM(12, 128, num_layers=3, batch_first=True, dropout=0.3, bidirectional=True)
  (fc_layers): Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=64, out_features=4, bias=True)
  )
)

In [56]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from sklearn.metrics import classification_report
import plotly.graph_objects as go

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to device
model = model.to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)

# Training setup
num_epochs = 200
patience = 10
trigger_times = 3
best_val_loss = float('inf')

# Store metrics
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    
    for batch_data, batch_labels in train_loader:
        # Ensure input is 3D for LSTM
        batch_data = batch_data.unsqueeze(1).to(device)
        batch_labels = batch_labels.to(device)
        
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(batch_data)
        loss = criterion(outputs, batch_labels)
        
        # Backward pass and optimization
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()
        
        # Update training metrics
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_total += batch_labels.size(0)
        train_correct += (predicted == batch_labels).sum().item()
    
    epoch_train_loss = train_loss / len(train_loader)
    epoch_train_acc = 100 * train_correct / train_total
    train_losses.append(epoch_train_loss)
    train_accuracies.append(epoch_train_acc)
    
    # Validation phase
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    
    with torch.no_grad():
        for val_data, val_labels in val_loader:
            # Ensure input is 3D for LSTM
            val_data = val_data.unsqueeze(1).to(device)
            val_labels = val_labels.to(device)
            
            # Forward pass
            val_outputs = model(val_data)
            val_loss += criterion(val_outputs, val_labels).item()
            
            # Update validation metrics
            _, predicted = torch.max(val_outputs, 1)
            val_total += val_labels.size(0)
            val_correct += (predicted == val_labels).sum().item()
    
    epoch_val_loss = val_loss / len(val_loader)
    epoch_val_acc = 100 * val_correct / val_total
    val_losses.append(epoch_val_loss)
    val_accuracies.append(epoch_val_acc)
    
    # Print epoch metrics
    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_acc:.2f}%")
    print(f"Val Loss: {epoch_val_loss:.4f}, Val Acc: {epoch_val_acc:.2f}%")
    
    # Early stopping
    if epoch_val_loss < best_val_loss:
        best_val_loss = epoch_val_loss
        trigger_times = 0
        torch.save(model.state_dict(), "best_model.pth")
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print("Early stopping triggered!")
            break

# Load the best model
model.load_state_dict(torch.load("best_model.pth"))

# Evaluate on validation set
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for val_data, val_labels in val_loader:
        val_data = val_data.unsqueeze(1).to(device)
        val_labels = val_labels.to(device)
        
        val_outputs = model(val_data)
        _, predicted = torch.max(val_outputs, 1)
        
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(val_labels.cpu().numpy())

# Print classification report
print("\nClassification Report:")
print(classification_report(all_labels, all_predictions))


Epoch [1/200]
Train Loss: 0.8411, Train Acc: 64.64%
Val Loss: 1.1600, Val Acc: 28.75%
Epoch [2/200]
Train Loss: 0.3780, Train Acc: 85.00%
Val Loss: 0.2708, Val Acc: 88.54%
Epoch [3/200]
Train Loss: 0.3144, Train Acc: 87.08%
Val Loss: 0.2476, Val Acc: 89.17%
Epoch [4/200]
Train Loss: 0.2799, Train Acc: 88.23%
Val Loss: 0.2618, Val Acc: 87.92%
Epoch [5/200]
Train Loss: 0.2530, Train Acc: 89.79%
Val Loss: 0.2284, Val Acc: 89.38%
Epoch [6/200]
Train Loss: 0.2446, Train Acc: 90.52%
Val Loss: 0.2238, Val Acc: 88.54%
Epoch [7/200]
Train Loss: 0.2287, Train Acc: 90.31%
Val Loss: 0.2316, Val Acc: 89.17%
Epoch [8/200]
Train Loss: 0.2242, Train Acc: 90.52%
Val Loss: 0.2353, Val Acc: 88.75%
Epoch [9/200]
Train Loss: 0.2280, Train Acc: 90.94%
Val Loss: 0.2234, Val Acc: 89.38%
Epoch [10/200]
Train Loss: 0.2131, Train Acc: 91.25%
Val Loss: 0.2236, Val Acc: 90.00%
Epoch [11/200]
Train Loss: 0.2266, Train Acc: 91.46%
Val Loss: 0.2208, Val Acc: 89.58%
Epoch [12/200]
Train Loss: 0.2116, Train Acc: 91.72%

In [57]:
# Combined chart for accuracy and loss
fig = go.Figure()

# Add training loss
fig.add_trace(go.Scatter(
    x=list(range(1, len(train_losses) + 1)),
    y=train_losses,
    mode='lines+markers',
    name='Train Loss',
    line=dict(width=2)
))

# Add validation loss
fig.add_trace(go.Scatter(
    x=list(range(1, len(val_losses) + 1)),
    y=val_losses,
    mode='lines+markers',
    name='Validation Loss',
    line=dict(width=2, dash='dot')
))

# Add training accuracy
fig.add_trace(go.Scatter(
    x=list(range(1, len(train_accuracies) + 1)),
    y=train_accuracies,
    mode='lines+markers',
    name='Train Accuracy',
    line=dict(width=2, color='green'),
    yaxis='y2'  # Use secondary y-axis for accuracy
))

# Add validation accuracy
fig.add_trace(go.Scatter(
    x=list(range(1, len(val_accuracies) + 1)),
    y=val_accuracies,
    mode='lines+markers',
    name='Validation Accuracy',
    line=dict(width=2, dash='dot', color='orange'),
    yaxis='y2'  # Use secondary y-axis for accuracy
))

# Update layout with dual axes
fig.update_layout(
    title="Training and Validation Metrics",
    xaxis_title="Epoch",
    yaxis=dict(
        title="Loss",
        titlefont=dict(color="blue"),
        tickfont=dict(color="blue")
    ),
    yaxis2=dict(
        title="Accuracy (%)",
        overlaying='y',
        side='right',
        titlefont=dict(color="green"),
        tickfont=dict(color="green")
    ),
    legend=dict(x=0.5, y=1.15, orientation="h"),
    template="plotly_white"
)

fig.show()
