# Load lib

In [197]:
!uv     pip install pandas torch scikit-learn-intelex matplotlib plotly torchviz 
!uv pip install hiddenlayer networkx graphviz


[2mAudited [1m6 packages[0m [2min 5ms[0m[0m
[2K[2mResolved [1m3 packages[0m [2min 178ms[0m[0m                                         [0m
[2K[2mPrepared [1m1 package[0m [2min 54ms[0m[0m                                               
[2K[2mInstalled [1m1 package[0m [2min 1ms[0m[0m                                  [0m
 [32m+[39m [1mhiddenlayer[0m[2m==0.3[0m


In [149]:
import pandas as pd
import json
import os
# Visualizations
import matplotlib.pyplot as plt
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.metrics import classification_report
import joblib
import warnings
warnings.filterwarnings("ignore")


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# Load Data

In [150]:
DATAPATH = [
    "./Datapreprocessing/filtered_results/reentrancy/",
    "./Datapreprocessing/filtered_results/gaslimit/",
    "./Datapreprocessing/filtered_results/integeroverflow/",
]
data_list = []
for path in DATAPATH:
    for file in os.listdir(path):
        if file.endswith(".json"):
            with open(os.path.join(path, file), 'r') as f:
                data = json.load(f)
                data_list.append({
                    "opcodes": data["opcodes"],
                    "label": path.split("/")[-2]
                })
                
# Create a DataFrame from the list of dictionaries
data = pd.DataFrame(data_list)
data


Unnamed: 0,opcodes,label
0,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",reentrancy
1,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",reentrancy
2,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",reentrancy
3,"[PUSH1, PUSH1, MSTORE, PUSH1, DUP1, SLOAD, PUS...",reentrancy
4,"[PUSH2, PUSH2, PUSH1, DUP3, DUP3, DUP3, CODECO...",reentrancy
...,...,...
42272,"[PUSH1, PUSH1, MSTORE, PUSH1, DUP1, SLOAD, PUS...",integeroverflow
42273,"[PUSH1, PUSH1, MSTORE, CALLVALUE, ISZERO, PUSH...",integeroverflow
42274,"[PUSH1, PUSH1, MSTORE, PUSH1, DUP1, SLOAD, PUS...",integeroverflow
42275,"[PUSH1, PUSH1, MSTORE, PUSH4, PUSH1, SSTORE, C...",integeroverflow


# EDA

## Clean data

In [151]:
# Import plotly
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter

# 1. Distribution of labels
label_counts = data['label'].value_counts()
fig = px.bar(x=label_counts.index, y=label_counts.values,
             title='Distribution of Vulnerability Types',
             labels={'x': 'Vulnerability Type', 'y': 'Count'})
fig.show()


In [152]:
# Create balanced dataset with 199 samples per label
balanced_data = pd.concat([
    data[data['label'] == label].sample(n=600, random_state=42) 
    for label in data['label'].unique()
])

# Shuffle the balanced dataset
data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

label_counts = data['label'].value_counts()
fig = px.bar(x=label_counts.index, y=label_counts.values,
             title='Distribution of Vulnerability Types',
             labels={'x': 'Vulnerability Type', 'y': 'Count'})
fig.show()


In [153]:
# Clean opcodes
def clean_opcodes(opcode_list):
    # Remove UNKNOWN and INVALID opcodes
    cleaned = [op for op in opcode_list if not (op.startswith('UNKNOWN_') or op.startswith('INVALID_'))]
    
    # Remove numeric values after opcodes (e.g., PUSH1, PUSH2 -> PUSH)
    cleaned = [op.rstrip('0123456789') for op in cleaned]
    
    return cleaned


In [154]:
data['opcodes'] = data['opcodes'].apply(clean_opcodes)
data['opcodes'] = data['opcodes'].apply(lambda x: ' '.join(x))
data

Unnamed: 0,opcodes,label
0,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI J...,integeroverflow
1,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,gaslimit
2,PUSH PUSH MSTORE PUSH DUP MLOAD SWAP DUP ADD P...,gaslimit
3,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,reentrancy
4,PUSH PUSH MSTORE JUMPDEST PUSH DUP SLOAD PUSH ...,integeroverflow
...,...,...
1795,PUSH PUSH MSTORE PUSH PUSH MSTORE PUSH PUSH MS...,gaslimit
1796,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,integeroverflow
1797,PUSH PUSH PUSH DUP DUP DUP CODECOPY DUP MLOAD ...,gaslimit
1798,PUSH PUSH MSTORE CALLVALUE DUP ISZERO PUSH JUM...,integeroverflow


In [155]:
data['opcodes'][1]

'PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI PUSH DUP REVERT JUMPDEST PUSH DUP SLOAD PUSH PUSH PUSH EXP SUB NOT AND CALLER PUSH PUSH PUSH EXP SUB AND OR SWAP SSTORE PUSH DUP MLOAD SWAP DUP ADD PUSH MSTORE PUSH DUP MSTORE PUSH PUSH DUP ADD MSTORE PUSH SWAP DUP MLOAD PUSH SWAP SWAP PUSH ADD SWAP PUSH JUMP JUMPDEST POP PUSH DUP MLOAD SWAP DUP ADD PUSH MSTORE PUSH DUP MSTORE PUSH PUSH DUP ADD MSTORE PUSH SWAP DUP MLOAD PUSH SWAP SWAP PUSH ADD SWAP PUSH JUMP JUMPDEST POP PUSH DUP SLOAD PUSH PUSH NOT SWAP SWAP AND OR SWAP DUP SWAP SSTORE PUSH AND PUSH EXP PUSH MUL PUSH DUP SWAP SSTORE PUSH DUP SLOAD PUSH PUSH PUSH EXP SUB SWAP DUP AND DUP MSTORE PUSH PUSH MSTORE PUSH DUP DUP SHA DUP SWAP SSTORE DUP SLOAD SWAP SWAP AND SWAP PUSH SWAP MLOAD SWAP DUP MSTORE PUSH ADD PUSH MLOAD DUP SWAP SUB SWAP LOG PUSH JUMP JUMPDEST DUP DUP SLOAD PUSH DUP PUSH AND ISZERO PUSH MUL SUB AND PUSH SWAP DIV SWAP PUSH MSTORE PUSH PUSH SHA SWAP PUSH ADD PUSH SWAP DIV DUP ADD SWAP DUP PUSH LT PUSH JUMPI DUP MLOAD PUSH

## Feature Extraction

In [156]:
from collections import defaultdict

def extract_reentrancy_features(opcodes):
    # Tokenize the opcode sequence
    tokens = opcodes.split()

    # Initialize block features
    nodes = set(tokens)
    edges = defaultdict(list)
    in_degree = defaultdict(int)
    out_degree = defaultdict(int)

    # Initialize attribute features
    features = {
        'external_call_presence': False,
        'state_change_after_call': False,
        'recursive_call_potential': False,
        'gas_forwarding': False,
        'arithmetic_count': 0,
        'logical_count': 0,
        'comparison_count': 0,
        'storage_count': 0,
        'control_flow_count': 0
    }

    # Opcode categories
    external_calls = {'CALL', 'CALLCODE', 'DELEGATECALL', 'STATICCALL'}
    state_changes = {'SSTORE', 'MSTORE'}
    gas_management = {'GAS', 'CALLGAS'}
    arithmetic_ops = {'ADD', 'SUB', 'MUL', 'DIV', 'EXP'}
    logical_ops = {'AND', 'OR', 'XOR', 'NOT'}
    comparison_ops = {'EQ', 'LT', 'GT', 'SLT', 'SGT'}
    control_flow = {'JUMP', 'JUMPI', 'JUMPDEST'}

    # Process opcodes to extract features
    for i, op in enumerate(tokens):
        # External call presence
        if op in external_calls:
            features['external_call_presence'] = True

        # State change after external call
        if op in state_changes:
            if any(tokens[j] in external_calls for j in range(i)):
                features['state_change_after_call'] = True

        # Recursive call potential
        if op in control_flow:
            features['recursive_call_potential'] = True

        # Gas forwarding
        if op in gas_management:
            features['gas_forwarding'] = True

        # Count opcode categories
        if op in arithmetic_ops:
            features['arithmetic_count'] += 1
        elif op in logical_ops:
            features['logical_count'] += 1
        elif op in comparison_ops:
            features['comparison_count'] += 1
        elif op in state_changes:
            features['storage_count'] += 1
        elif op in control_flow:
            features['control_flow_count'] += 1

        # Build edges for block features
        if i < len(tokens) - 1:
            edges[op].append(tokens[i + 1])
            out_degree[op] += 1
            in_degree[tokens[i + 1]] += 1

    # Maximum in/out degree
    max_in_degree = max(in_degree.values()) if in_degree else 0
    max_out_degree = max(out_degree.values()) if out_degree else 0

    block_features = {
        'nodes': list(nodes),
        'edges': dict(edges),
        'max_in_degree': max_in_degree,
        'max_out_degree': max_out_degree,
    }

    return block_features, features


In [157]:
def extract_gas_limit_features(opcodes):
    # Tokenize the opcode sequence
    tokens = opcodes.split()

    # Initialize block features
    nodes = set(tokens)
    edges = defaultdict(list)
    in_degree = defaultdict(int)
    out_degree = defaultdict(int)

    # Initialize attribute features
    features = {
        'gas_related_opcodes': 0,
        'expensive_operations_count': 0,
        'state_change_count': 0,
        'loop_count': 0,
        'arithmetic_count': 0,
        'logical_count': 0,
        'comparison_count': 0,
        'control_flow_count': 0
    }

    # Opcode categories
    gas_related = {'GAS', 'CALLGAS'}
    state_changes = {'SSTORE', 'MSTORE'}
    expensive_ops = {'EXP', 'SHA3', 'LOG0', 'LOG1', 'LOG2', 'LOG3', 'LOG4'}
    arithmetic_ops = {'ADD', 'SUB', 'MUL', 'DIV', 'EXP'}
    logical_ops = {'AND', 'OR', 'XOR', 'NOT'}
    comparison_ops = {'EQ', 'LT', 'GT', 'SLT', 'SGT'}
    control_flow = {'JUMP', 'JUMPI', 'JUMPDEST'}

    # Process opcodes to extract features
    for i, op in enumerate(tokens):
        # Count gas-related opcodes
        if op in gas_related:
            features['gas_related_opcodes'] += 1

        # Count expensive operations
        if op in expensive_ops:
            features['expensive_operations_count'] += 1

        # Count state-changing opcodes
        if op in state_changes:
            features['state_change_count'] += 1

        # Count loops (control flow opcodes)
        if op in control_flow:
            features['loop_count'] += 1

        # Count opcode categories
        if op in arithmetic_ops:
            features['arithmetic_count'] += 1
        elif op in logical_ops:
            features['logical_count'] += 1
        elif op in comparison_ops:
            features['comparison_count'] += 1

        # Build edges for block features
        if i < len(tokens) - 1:
            edges[op].append(tokens[i + 1])
            out_degree[op] += 1
            in_degree[tokens[i + 1]] += 1

    # Maximum in/out degree
    max_in_degree = max(in_degree.values()) if in_degree else 0
    max_out_degree = max(out_degree.values()) if out_degree else 0

    block_features = {
        'nodes': list(nodes),
        'edges': dict(edges),
        'max_in_degree': max_in_degree,
        'max_out_degree': max_out_degree,
    }

    return block_features, features

In [158]:
def extract_integer_overflow_features(opcodes):
    # Tokenize the opcode sequence
    tokens = opcodes.split()

    # Initialize block features
    nodes = set(tokens)
    edges = defaultdict(list)
    in_degree = defaultdict(int)
    out_degree = defaultdict(int)

    # Initialize attribute features
    features = {
        'node_count': len(nodes),
        'edge_count': 0,
        'max_out_degree': 0,
        'max_in_degree': 0,
        'unary_arithmetic_ratio': 0.0,
        'binary_arithmetic_ratio': 0.0,
        'block_ratio': 0.0,
        'control_flow_ratio': 0.0,
        'environment_ratio': 0.0,
        'system_ratio': 0.0,
        'stack_ratio': 0.0,
        'invalid_ratio': 0.0
    }

    # Opcode categories
    unary_arithmetic_ops = {'ISZERO', 'NOT'}
    binary_arithmetic_ops = {'ADD', 'SUB', 'MUL', 'DIV', 'AND', 'SHA3'}
    block_related_ops = {'NUMBER', 'BLOCKHASH', 'COINBASE', 'TIMESTAMP'}
    control_flow_ops = {'JUMP', 'JUMPI', 'JUMPDEST'}
    environment_ops = {'CALLER', 'CALLDATASIZE', 'ORIGIN'}
    system_ops = {'CALL', 'RETURN', 'REVERT', 'SELFDESTRUCT'}
    stack_ops = {'PUSH', 'POP', 'SWAP'}
    invalid_ops = {'INVALID'}

    # Counters for each category
    opcode_counts = {
        'unary_arithmetic': 0,
        'binary_arithmetic': 0,
        'block_related': 0,
        'control_flow': 0,
        'environment': 0,
        'system': 0,
        'stack': 0,
        'invalid': 0
    }

    # Count opcodes by category
    for i, op in enumerate(tokens):
        if op in unary_arithmetic_ops:
            opcode_counts['unary_arithmetic'] += 1
        elif op in binary_arithmetic_ops:
            opcode_counts['binary_arithmetic'] += 1
        elif op in block_related_ops:
            opcode_counts['block_related'] += 1
        elif op in control_flow_ops:
            opcode_counts['control_flow'] += 1
        elif op in environment_ops:
            opcode_counts['environment'] += 1
        elif op in system_ops:
            opcode_counts['system'] += 1
        elif op in stack_ops:
            opcode_counts['stack'] += 1
        elif op in invalid_ops:
            opcode_counts['invalid'] += 1

        # Build edges for block features
        if i < len(tokens) - 1:
            edges[op].append(tokens[i + 1])
            out_degree[op] += 1
            in_degree[tokens[i + 1]] += 1

    # Calculate block features
    features['edge_count'] = sum(len(v) for v in edges.values())
    features['max_out_degree'] = max(out_degree.values()) if out_degree else 0
    features['max_in_degree'] = max(in_degree.values()) if in_degree else 0

    # Calculate ratios for attribute features
    total_opcodes = len(tokens)
    if total_opcodes > 0:
        features['unary_arithmetic_ratio'] = opcode_counts['unary_arithmetic'] / total_opcodes
        features['binary_arithmetic_ratio'] = opcode_counts['binary_arithmetic'] / total_opcodes
        features['block_ratio'] = opcode_counts['block_related'] / total_opcodes
        features['control_flow_ratio'] = opcode_counts['control_flow'] / total_opcodes
        features['environment_ratio'] = opcode_counts['environment'] / total_opcodes
        features['system_ratio'] = opcode_counts['system'] / total_opcodes
        features['stack_ratio'] = opcode_counts['stack'] / total_opcodes
        features['invalid_ratio'] = opcode_counts['invalid'] / total_opcodes

    return {
        'block_features': {
            'nodes': list(nodes),
            'edges': dict(edges),
            'max_in_degree': features['max_in_degree'],
            'max_out_degree': features['max_out_degree']
        },
        'attribute_features': {
            'unary_arithmetic_ratio': features['unary_arithmetic_ratio'],
            'binary_arithmetic_ratio': features['binary_arithmetic_ratio'],
            'block_ratio': features['block_ratio'],
            'control_flow_ratio': features['control_flow_ratio'],
            'environment_ratio': features['environment_ratio'],
            'system_ratio': features['system_ratio'],
            'stack_ratio': features['stack_ratio'],
            'invalid_ratio': features['invalid_ratio']
        }
    }

In [159]:
# Extract features based on label
def extract_features_by_label(row):
    if row['label'] == 'reentrancy':
        block_features, attribute_features = extract_reentrancy_features(row['opcodes'])
    elif row['label'] == 'gaslimit':
        block_features, attribute_features = extract_gas_limit_features(row['opcodes'])
    elif row['label'] == 'integeroverflow':
        features = extract_integer_overflow_features(row['opcodes'])
        block_features = features['block_features']
        attribute_features = features['attribute_features']
    
    return pd.Series({
        'block_features': block_features,
        'attribute_features': attribute_features
    })

# Apply the extraction function to each row
features_df = data.apply(extract_features_by_label, axis=1)

# Split the features into separate columns
data['block_features'] = features_df['block_features']
data['attribute_features'] = features_df['attribute_features']

# Display the first few rows to verify
data.head() 

Unnamed: 0,opcodes,label,block_features,attribute_features
0,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI J...,integeroverflow,"{'nodes': ['DIV', 'DUP', 'SWAP', 'MSTORE', 'SM...",{'unary_arithmetic_ratio': 0.04545454545454545...
1,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,gaslimit,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'MSTOR...","{'gas_related_opcodes': 2, 'expensive_operatio..."
2,PUSH PUSH MSTORE PUSH DUP MLOAD SWAP DUP ADD P...,gaslimit,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'BLOCK...","{'gas_related_opcodes': 1, 'expensive_operatio..."
3,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,reentrancy,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'MSTOR...","{'external_call_presence': True, 'state_change..."
4,PUSH PUSH MSTORE JUMPDEST PUSH DUP SLOAD PUSH ...,integeroverflow,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'MSTOR...",{'unary_arithmetic_ratio': 0.04155124653739612...


In [160]:
data

Unnamed: 0,opcodes,label,block_features,attribute_features
0,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI J...,integeroverflow,"{'nodes': ['DIV', 'DUP', 'SWAP', 'MSTORE', 'SM...",{'unary_arithmetic_ratio': 0.04545454545454545...
1,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,gaslimit,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'MSTOR...","{'gas_related_opcodes': 2, 'expensive_operatio..."
2,PUSH PUSH MSTORE PUSH DUP MLOAD SWAP DUP ADD P...,gaslimit,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'BLOCK...","{'gas_related_opcodes': 1, 'expensive_operatio..."
3,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,reentrancy,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'MSTOR...","{'external_call_presence': True, 'state_change..."
4,PUSH PUSH MSTORE JUMPDEST PUSH DUP SLOAD PUSH ...,integeroverflow,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'MSTOR...",{'unary_arithmetic_ratio': 0.04155124653739612...
...,...,...,...,...
1795,PUSH PUSH MSTORE PUSH PUSH MSTORE PUSH PUSH MS...,gaslimit,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'MSTOR...","{'gas_related_opcodes': 1, 'expensive_operatio..."
1796,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,integeroverflow,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'MSTOR...",{'unary_arithmetic_ratio': 0.03448275862068965...
1797,PUSH PUSH PUSH DUP DUP DUP CODECOPY DUP MLOAD ...,gaslimit,"{'nodes': ['DIV', 'DUP', 'SWAP', 'MSTORE', 'ML...","{'gas_related_opcodes': 0, 'expensive_operatio..."
1798,PUSH PUSH MSTORE CALLVALUE DUP ISZERO PUSH JUM...,integeroverflow,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'MSTOR...",{'unary_arithmetic_ratio': 0.02631578947368421...


In [161]:
def create_combined_features(row):
    """
    Combine block_features and attribute_features into a single standardized feature vector.
    """
    features = {}
    
    # Extract block features
    block_feat = row['block_features']
    features.update({
        'num_nodes': len(block_feat['nodes']),
        'num_edges': sum(len(edges) for edges in block_feat['edges'].values()),
        'max_in_degree': block_feat['max_in_degree'],
        'max_out_degree': block_feat['max_out_degree'],
    })
    
    # Extract attribute features
    attr_feat = row['attribute_features']
    
    # Handle different attribute features based on vulnerability type
    if row['label'] == 'reentrancy':
        features.update({
            'external_call': int(attr_feat['external_call_presence']),
            'state_change_after_call': int(attr_feat['state_change_after_call']),
            'recursive_call': int(attr_feat['recursive_call_potential']),
            'gas_forwarding': int(attr_feat['gas_forwarding']),
            'arithmetic_ops': attr_feat['arithmetic_count'],
            'logical_ops': attr_feat['logical_count'],
            'comparison_ops': attr_feat['comparison_count'],
            'storage_ops': attr_feat['storage_count'],
            'control_flow_ops': attr_feat['control_flow_count']
        })
    elif row['label'] == 'gaslimit':
        features.update({
            'gas_related_ops': attr_feat['gas_related_opcodes'],
            'expensive_ops': attr_feat['expensive_operations_count'],
            'state_changes': attr_feat['state_change_count'],
            'loop_count': attr_feat['loop_count'],
            'arithmetic_ops': attr_feat['arithmetic_count'],
            'logical_ops': attr_feat['logical_count'],
            'comparison_ops': attr_feat['comparison_count'],
            'control_flow_ops': attr_feat['control_flow_count']
        })
    elif row['label'] == 'integeroverflow':
        features.update({
            'unary_arithmetic_ratio': attr_feat['unary_arithmetic_ratio'],
            'binary_arithmetic_ratio': attr_feat['binary_arithmetic_ratio'],
            'block_ratio': attr_feat['block_ratio'],
            'control_flow_ratio': attr_feat['control_flow_ratio'],
            'environment_ratio': attr_feat['environment_ratio'],
            'system_ratio': attr_feat['system_ratio'],
            'stack_ratio': attr_feat['stack_ratio'],
            'invalid_ratio': attr_feat['invalid_ratio']
        })
    
    return features

# Create combined features
data['combined_features'] = data.apply(create_combined_features, axis=1)

data

Unnamed: 0,opcodes,label,block_features,attribute_features,combined_features
0,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI J...,integeroverflow,"{'nodes': ['DIV', 'DUP', 'SWAP', 'MSTORE', 'SM...",{'unary_arithmetic_ratio': 0.04545454545454545...,"{'num_nodes': 25, 'num_edges': 87, 'max_in_deg..."
1,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,gaslimit,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'MSTOR...","{'gas_related_opcodes': 2, 'expensive_operatio...","{'num_nodes': 41, 'num_edges': 2111, 'max_in_d..."
2,PUSH PUSH MSTORE PUSH DUP MLOAD SWAP DUP ADD P...,gaslimit,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'BLOCK...","{'gas_related_opcodes': 1, 'expensive_operatio...","{'num_nodes': 43, 'num_edges': 3275, 'max_in_d..."
3,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,reentrancy,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'MSTOR...","{'external_call_presence': True, 'state_change...","{'num_nodes': 32, 'num_edges': 166, 'max_in_de..."
4,PUSH PUSH MSTORE JUMPDEST PUSH DUP SLOAD PUSH ...,integeroverflow,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'MSTOR...",{'unary_arithmetic_ratio': 0.04155124653739612...,"{'num_nodes': 32, 'num_edges': 360, 'max_in_de..."
...,...,...,...,...,...
1795,PUSH PUSH MSTORE PUSH PUSH MSTORE PUSH PUSH MS...,gaslimit,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'MSTOR...","{'gas_related_opcodes': 1, 'expensive_operatio...","{'num_nodes': 36, 'num_edges': 1524, 'max_in_d..."
1796,PUSH PUSH MSTORE CALLVALUE ISZERO PUSH JUMPI P...,integeroverflow,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'MSTOR...",{'unary_arithmetic_ratio': 0.03448275862068965...,"{'num_nodes': 38, 'num_edges': 173, 'max_in_de..."
1797,PUSH PUSH PUSH DUP DUP DUP CODECOPY DUP MLOAD ...,gaslimit,"{'nodes': ['DIV', 'DUP', 'SWAP', 'MSTORE', 'ML...","{'gas_related_opcodes': 0, 'expensive_operatio...","{'num_nodes': 25, 'num_edges': 58, 'max_in_deg..."
1798,PUSH PUSH MSTORE CALLVALUE DUP ISZERO PUSH JUM...,integeroverflow,"{'nodes': ['DIV', 'DUP', 'SWAP', 'EXP', 'MSTOR...",{'unary_arithmetic_ratio': 0.02631578947368421...,"{'num_nodes': 35, 'num_edges': 113, 'max_in_de..."


# Data Preprocessing

In [162]:
newData = data.drop(columns=['opcodes', 'block_features', 'attribute_features'])
newData

Unnamed: 0,label,combined_features
0,integeroverflow,"{'num_nodes': 25, 'num_edges': 87, 'max_in_deg..."
1,gaslimit,"{'num_nodes': 41, 'num_edges': 2111, 'max_in_d..."
2,gaslimit,"{'num_nodes': 43, 'num_edges': 3275, 'max_in_d..."
3,reentrancy,"{'num_nodes': 32, 'num_edges': 166, 'max_in_de..."
4,integeroverflow,"{'num_nodes': 32, 'num_edges': 360, 'max_in_de..."
...,...,...
1795,gaslimit,"{'num_nodes': 36, 'num_edges': 1524, 'max_in_d..."
1796,integeroverflow,"{'num_nodes': 38, 'num_edges': 173, 'max_in_de..."
1797,gaslimit,"{'num_nodes': 25, 'num_edges': 58, 'max_in_deg..."
1798,integeroverflow,"{'num_nodes': 35, 'num_edges': 113, 'max_in_de..."


In [163]:
from sklearn.preprocessing import LabelEncoder
labelProcess = LabelEncoder()
newData['label'] = labelProcess.fit_transform(newData['label'])
newData


Unnamed: 0,label,combined_features
0,1,"{'num_nodes': 25, 'num_edges': 87, 'max_in_deg..."
1,0,"{'num_nodes': 41, 'num_edges': 2111, 'max_in_d..."
2,0,"{'num_nodes': 43, 'num_edges': 3275, 'max_in_d..."
3,2,"{'num_nodes': 32, 'num_edges': 166, 'max_in_de..."
4,1,"{'num_nodes': 32, 'num_edges': 360, 'max_in_de..."
...,...,...
1795,0,"{'num_nodes': 36, 'num_edges': 1524, 'max_in_d..."
1796,1,"{'num_nodes': 38, 'num_edges': 173, 'max_in_de..."
1797,0,"{'num_nodes': 25, 'num_edges': 58, 'max_in_deg..."
1798,1,"{'num_nodes': 35, 'num_edges': 113, 'max_in_de..."


In [164]:
newData["combined_features"][0]

{'num_nodes': 25,
 'num_edges': 87,
 'max_in_degree': 24,
 'max_out_degree': 25,
 'unary_arithmetic_ratio': 0.045454545454545456,
 'binary_arithmetic_ratio': 0.056818181818181816,
 'block_ratio': 0.0,
 'control_flow_ratio': 0.14772727272727273,
 'environment_ratio': 0.0,
 'system_ratio': 0.022727272727272728,
 'stack_ratio': 0.3977272727272727,
 'invalid_ratio': 0.0}

In [165]:
data = newData['combined_features']
data

0       {'num_nodes': 25, 'num_edges': 87, 'max_in_deg...
1       {'num_nodes': 41, 'num_edges': 2111, 'max_in_d...
2       {'num_nodes': 43, 'num_edges': 3275, 'max_in_d...
3       {'num_nodes': 32, 'num_edges': 166, 'max_in_de...
4       {'num_nodes': 32, 'num_edges': 360, 'max_in_de...
                              ...                        
1795    {'num_nodes': 36, 'num_edges': 1524, 'max_in_d...
1796    {'num_nodes': 38, 'num_edges': 173, 'max_in_de...
1797    {'num_nodes': 25, 'num_edges': 58, 'max_in_deg...
1798    {'num_nodes': 35, 'num_edges': 113, 'max_in_de...
1799    {'num_nodes': 35, 'num_edges': 416, 'max_in_de...
Name: combined_features, Length: 1800, dtype: object

In [166]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Define expected keys for count and ratio features
count_keys = ['num_nodes', 'num_edges', 'max_in_degree', 'max_out_degree']
ratio_keys = [
    'unary_arithmetic_ratio', 'binary_arithmetic_ratio', 'block_ratio',
    'control_flow_ratio', 'environment_ratio', 'system_ratio', 'stack_ratio', 'invalid_ratio'
]

# Initialize lists to store feature values
count_values = []
ratio_values = []

# Iterate over each feature dictionary
for features in data:
    # Extract count features, using 0 as default for missing keys
    count_values.append([features.get(key, 0) for key in count_keys])
    
    # Extract ratio features, using 0 as default for missing keys
    ratio_values.append([features.get(key, 0) for key in ratio_keys])

# Convert lists to NumPy arrays
count_values = np.array(count_values)
ratio_values = np.array(ratio_values)

# Step 2: Normalize count-based features
scaler = MinMaxScaler()
normalized_counts = scaler.fit_transform(count_values)



In [167]:
# Step 4: Combine all features into a single vector
vectorized_data = np.concatenate([normalized_counts, ratio_values], axis=1)

# Print the resulting vector
print("Vectorized Data:", vectorized_data)

Vectorized Data: [[0.32432432 0.00458824 0.00400123 ... 0.02272727 0.39772727 0.        ]
 [0.75675676 0.16470216 0.16497384 ... 0.         0.         0.        ]
 [0.81081081 0.25678348 0.25730994 ... 0.         0.         0.        ]
 ...
 [0.32432432 0.00229412 0.00153894 ... 0.         0.         0.        ]
 [0.59459459 0.00664504 0.00615574 ... 0.04385965 0.38596491 0.        ]
 [0.59459459 0.03061467 0.03477993 ... 0.         0.         0.        ]]


In [168]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(vectorized_data, newData['label'], test_size=0.2, random_state=42)


In [169]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Define the parameter grid
param_grid = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'svc__kernel': ['rbf', 'linear']
}

# Create a pipeline
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Step to normalize features
    ('svc', SVC(probability=True))  # SVC model
])

# Set up the GridSearchCV
main_pipeline = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
main_pipeline


In [170]:
main_pipeline.fit(X_train, y_train)
print("Best Parameters:", main_pipeline.best_params_)
print("Best Score:", main_pipeline.best_score_)

y_pred = main_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))



Best Parameters: {'svc__C': 100, 'svc__gamma': 'scale', 'svc__kernel': 'rbf'}
Best Score: 0.90625
              precision    recall  f1-score   support

           0       0.95      0.77      0.85       115
           1       1.00      1.00      1.00       103
           2       0.84      0.96      0.90       142

    accuracy                           0.91       360
   macro avg       0.93      0.91      0.91       360
weighted avg       0.92      0.91      0.91       360





In [171]:
joblib.dump(pipeline, 'svc_model.pkl')


['svc_model.pkl']

In [172]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Define the parameter grid for GradientBoostingClassifier
param_grid = {
    'gbc__n_estimators': [50, 100, 150],
    'gbc__learning_rate': [0.01, 0.1, 0.2],
    'gbc__max_depth': [3, 4, 5],
    'gbc__subsample': [0.8, 1.0]
}

# Create a pipeline
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Step to normalize features
    ('gbc', GradientBoostingClassifier(random_state=42))  # Gradient Boosting Classifier
])

# Set up the GridSearchCV
main_pipeline = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
main_pipeline


In [173]:
# Fit the model
main_pipeline.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best Parameters:", main_pipeline.best_params_)
print("Best Score:", main_pipeline.best_score_)

# Predict and evaluate
y_pred = main_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

Best Parameters: {'gbc__learning_rate': 0.1, 'gbc__max_depth': 3, 'gbc__n_estimators': 50, 'gbc__subsample': 1.0}
Best Score: 0.9319444444444445
              precision    recall  f1-score   support

           0       0.95      0.90      0.92       115
           1       1.00      1.00      1.00       103
           2       0.92      0.96      0.94       142

    accuracy                           0.95       360
   macro avg       0.96      0.95      0.96       360
weighted avg       0.95      0.95      0.95       360



In [174]:
def extract_features_from_dict(data_series):
    # Define all possible features we expect
    feature_keys = [
        'num_nodes', 'num_edges', 'max_in_degree', 'max_out_degree',
        'unary_arithmetic_ratio', 'binary_arithmetic_ratio', 'block_ratio',
        'control_flow_ratio', 'environment_ratio', 'system_ratio', 
        'stack_ratio', 'invalid_ratio'
    ]
    
    # Initialize an empty array
    features = np.zeros((len(data_series), len(feature_keys)))
    
    # Fill the array with values from dictionaries
    for i, dict_data in enumerate(data_series):
        for j, key in enumerate(feature_keys):
            features[i, j] = dict_data.get(key, 0)  # Use 0 as default if key doesn't exist
            
    return features
data = extract_features_from_dict(data)
data

array([[2.50000000e+01, 8.70000000e+01, 2.40000000e+01, ...,
        2.27272727e-02, 3.97727273e-01, 0.00000000e+00],
       [4.10000000e+01, 2.11100000e+03, 5.47000000e+02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.30000000e+01, 3.27500000e+03, 8.47000000e+02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [2.50000000e+01, 5.80000000e+01, 1.60000000e+01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.50000000e+01, 1.13000000e+02, 3.10000000e+01, ...,
        4.38596491e-02, 3.85964912e-01, 0.00000000e+00],
       [3.50000000e+01, 4.16000000e+02, 1.24000000e+02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [175]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import StepLR


In [176]:
y_train

832     2
836     2
1103    2
859     2
567     2
       ..
1130    1
1294    1
860     2
1459    0
1126    0
Name: label, Length: 1440, dtype: int64

In [177]:
X_train = torch.FloatTensor(X_train)
X_test = torch.FloatTensor(X_test)
y_train = torch.LongTensor(y_train.to_numpy())
y_test = torch.LongTensor(y_test.to_numpy())


In [178]:
train_dataset= TensorDataset(X_train.unsqueeze(1), y_train)
val_dataset = TensorDataset(X_test.unsqueeze(1), y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)



In [144]:
# # make a fake data like the real data
# X_train = torch.randn(100, 1, 12)
# y_train = torch.randint(0, 3, (100,))
# train_dataset = TensorDataset(X_train, y_train) 
# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# X_test = torch.randn(100, 1, 12)
# y_test = torch.randint(0, 3, (100,))
# val_dataset = TensorDataset(X_test, y_test)
# val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)



In [182]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_rate=0.3):
        super(BiLSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # LSTM layer with dropout
        self.lstm = nn.LSTM(
            input_size, 
            hidden_size, 
            num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout_rate if num_layers > 1 else 0
        )
        
        # Multiple fully connected layers with batch norm and dropout
        self.fc_layers = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.BatchNorm1d(hidden_size),  # Batch norm after linear layer
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            
            nn.Linear(hidden_size, hidden_size // 2),
            nn.BatchNorm1d(hidden_size // 2),  # Batch norm after linear layer
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            
            nn.Linear(hidden_size // 2, output_size)
        )
    
    def forward(self, x):
        # Add sequence dimension if not present
        if x.dim() == 2:
            x = x.unsqueeze(1)  # (batch_size, 1, features)
        
        # LSTM forward pass
        lstm_out, _ = self.lstm(x)
        
        # Take the output from the last time step
        lstm_out = lstm_out[:, -1, :]
        
        # Pass through fully connected layers
        out = self.fc_layers(lstm_out)
        return out

# Initialize model with proper parameters
model = BiLSTMClassifier(
    input_size=12,          # Number of features
    hidden_size=128,        # Increased hidden size
    num_layers=3,           # Number of LSTM layers
    output_size=3,          # Number of classes
    dropout_rate=0.3
)

# Initialize optimizer and scheduler
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.001,
    weight_decay=0.01
)

criterion = nn.CrossEntropyLoss()

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=3,
    verbose=True
)
model

BiLSTMClassifier(
  (lstm): LSTM(12, 128, num_layers=3, batch_first=True, dropout=0.3, bidirectional=True)
  (fc_layers): Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=64, out_features=3, bias=True)
  )
)

In [183]:
# Training setup
num_epochs = 100
best_val_loss = float('inf')
patience = 7
trigger_times = 0

# Lists to store metrics for plotting
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    
    for batch_data, batch_labels in train_loader:
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(batch_data)
        loss = criterion(outputs, batch_labels)
        
        # Backward pass and optimize
        loss.backward()
        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        # Calculate training metrics
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        train_total += batch_labels.size(0)
        train_correct += (predicted == batch_labels).sum().item()

    # Validation phase
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    
    with torch.no_grad():
        for val_data, val_labels in val_loader:
            # Forward pass
            val_outputs = model(val_data)
            val_loss += criterion(val_outputs, val_labels).item()
            
            # Calculate validation metrics
            _, predicted = torch.max(val_outputs.data, 1)
            val_total += val_labels.size(0)
            val_correct += (predicted == val_labels).sum().item()

    # Calculate epoch metrics
    epoch_train_loss = train_loss / len(train_loader)
    epoch_val_loss = val_loss / len(val_loader)
    epoch_train_acc = 100 * train_correct / train_total
    epoch_val_acc = 100 * val_correct / val_total
    
    # Store metrics for plotting
    train_losses.append(epoch_train_loss)
    val_losses.append(epoch_val_loss)
    train_accuracies.append(epoch_train_acc)
    val_accuracies.append(epoch_val_acc)
    
    # Update learning rate scheduler
    scheduler.step(epoch_val_loss)
    
    # Print epoch results
    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_acc:.2f}%")
    print(f"Val Loss: {epoch_val_loss:.4f}, Val Acc: {epoch_val_acc:.2f}%")
    print("-" * 50)

    # Early stopping check
    if epoch_val_loss < best_val_loss:
        best_val_loss = epoch_val_loss
        trigger_times = 0
        # Save the best model
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': epoch_train_loss,
            'val_loss': epoch_val_loss,
        }, 'best_model.pth')
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print("Early stopping triggered!")
            break

# Plot the training metrics
fig = go.Figure()

# Add training loss
fig.add_trace(go.Scatter(
    x=list(range(1, len(train_losses) + 1)),
    y=train_losses,
    mode='lines+markers',
    name='Train Loss'
))

# Add validation loss
fig.add_trace(go.Scatter(
    x=list(range(1, len(val_losses) + 1)),
    y=val_losses,
    mode='lines+markers',
    name='Validation Loss'
))

# Add training accuracy
fig.add_trace(go.Scatter(
    x=list(range(1, len(train_accuracies) + 1)),
    y=train_accuracies,
    mode='lines+markers',
    name='Train Accuracy'
))

# Add validation accuracy
fig.add_trace(go.Scatter(
    x=list(range(1, len(val_accuracies) + 1)),
    y=val_accuracies,
    mode='lines+markers',
    name='Validation Accuracy'
))

# Customize the layout
fig.update_layout(
    title='Training Metrics Over Epochs',
    xaxis_title='Epoch',
    yaxis_title='Value',
    legend_title='Metrics',
    template='plotly_dark'
)

# Show the plot
fig.show()

# Load the best model and evaluate on test set
best_model = BiLSTMClassifier(
    input_size=12,
    hidden_size=128,
    num_layers=3,
    output_size=3,
    dropout_rate=0.3
)

checkpoint = torch.load('best_model.pth')
best_model.load_state_dict(checkpoint['model_state_dict'])
best_model.eval()

# Evaluate on test set
test_correct = 0
test_total = 0
all_predictions = []
all_labels = []

with torch.no_grad():
    for test_data, test_labels in val_loader:
        outputs = best_model(test_data)
        _, predicted = torch.max(outputs.data, 1)
        test_total += test_labels.size(0)
        test_correct += (predicted == test_labels).sum().item()
        
        all_predictions.extend(predicted.numpy())
        all_labels.extend(test_labels.numpy())

# Print final test accuracy and classification report
test_accuracy = 100 * test_correct / test_total
print(f"\nFinal Test Accuracy: {test_accuracy:.2f}%")
print("\nClassification Report:")
print(classification_report(all_labels, all_predictions))

Epoch [1/100]
Train Loss: 0.7942, Train Acc: 60.21%
Val Loss: 0.9711, Val Acc: 39.72%
--------------------------------------------------
Epoch [2/100]
Train Loss: 0.4143, Train Acc: 82.50%
Val Loss: 0.3508, Val Acc: 87.78%
--------------------------------------------------
Epoch [3/100]
Train Loss: 0.3665, Train Acc: 83.89%
Val Loss: 0.2838, Val Acc: 88.89%
--------------------------------------------------
Epoch [4/100]
Train Loss: 0.3441, Train Acc: 84.79%
Val Loss: 0.2730, Val Acc: 88.89%
--------------------------------------------------
Epoch [5/100]
Train Loss: 0.3286, Train Acc: 85.35%
Val Loss: 0.2550, Val Acc: 89.44%
--------------------------------------------------
Epoch [6/100]
Train Loss: 0.3141, Train Acc: 86.88%
Val Loss: 0.2438, Val Acc: 90.00%
--------------------------------------------------
Epoch [7/100]
Train Loss: 0.2938, Train Acc: 87.15%
Val Loss: 0.2409, Val Acc: 89.17%
--------------------------------------------------
Epoch [8/100]
Train Loss: 0.2978, Train A


Final Test Accuracy: 89.72%

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.72      0.82       115
           1       1.00      1.00      1.00       103
           2       0.81      0.96      0.88       142

    accuracy                           0.90       360
   macro avg       0.92      0.90      0.90       360
weighted avg       0.91      0.90      0.89       360



In [200]:
from prettytable import PrettyTable
import numpy as np

def create_model_table(model):
    # Create tables
    arch_table = PrettyTable()
    arch_table.field_names = ["Layer", "Type", "Output Shape", "Parameters", "Connected to"]
    
    param_table = PrettyTable()
    param_table.field_names = ["Layer Type", "Parameter Count", "% of Total"]
    
    total_params = 0
    layer_params = {}
    
    def get_layer_info(layer):
        params = sum(p.numel() for p in layer.parameters() if p.requires_grad)
        total = sum(p.numel() for p in layer.parameters())
        trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
        return total, trainable

    def format_shape(shape):
        if isinstance(shape, tuple):
            return 'x'.join(str(dim) for dim in shape)
        return str(shape)

    # Track previous layer for connections
    prev_layer = None
    
    # Analyze each layer
    for name, layer in model.named_children():
        if isinstance(layer, nn.Sequential):
            for i, sublayer in enumerate(layer):
                layer_name = f"{name}_{i}"
                layer_type = sublayer.__class__.__name__
                
                # Get parameter counts
                total, trainable = get_layer_info(sublayer)
                total_params += trainable
                
                # Track parameters by layer type
                if layer_type not in layer_params:
                    layer_params[layer_type] = 0
                layer_params[layer_type] += trainable
                
                # Determine output shape
                if hasattr(sublayer, 'out_features'):
                    output_shape = sublayer.out_features
                elif isinstance(sublayer, nn.LSTM):
                    output_shape = f"{sublayer.hidden_size}*2 (bidirectional)"
                else:
                    output_shape = "Same as input"
                
                # Add row to architecture table
                arch_table.add_row([
                    layer_name,
                    layer_type,
                    output_shape,
                    f"{trainable:,}",
                    prev_layer if prev_layer else "Input"
                ])
                
                prev_layer = layer_name
        else:
            layer_type = layer.__class__.__name__
            total, trainable = get_layer_info(layer)
            total_params += trainable
            
            if layer_type not in layer_params:
                layer_params[layer_type] = 0
            layer_params[layer_type] += trainable
            
            # Determine output shape
            if isinstance(layer, nn.LSTM):
                output_shape = f"{layer.hidden_size}*2 (bidirectional)"
            else:
                output_shape = "Same as input"
            
            arch_table.add_row([
                name,
                layer_type,
                output_shape,
                f"{trainable:,}",
                prev_layer if prev_layer else "Input"
            ])
            
            prev_layer = name
    
    # Create parameter distribution table
    for layer_type, params in layer_params.items():
        percentage = (params / total_params) * 100
        param_table.add_row([
            layer_type,
            f"{params:,}",
            f"{percentage:.2f}%"
        ])
    
    # Print tables
    print("\n=== Model Architecture ===")
    print(arch_table)
    print(f"\nTotal Trainable Parameters: {total_params:,}")
    
    print("\n=== Parameter Distribution ===")
    print(param_table)
    
    # Additional Statistics
    print("\n=== Model Statistics ===")
    stats_table = PrettyTable()
    stats_table.field_names = ["Metric", "Value"]
    
    # Calculate model depth
    model_depth = len(list(model.modules())) - 1  # Subtract 1 to exclude the model itself
    
    stats_table.add_row(["Model Depth", model_depth])
    stats_table.add_row(["Total Parameters", f"{total_params:,}"])
    stats_table.add_row(["Trainable Parameters", f"{sum(p.numel() for p in model.parameters() if p.requires_grad):,}"])
    stats_table.add_row(["Non-trainable Parameters", f"{sum(p.numel() for p in model.parameters() if not p.requires_grad):,}"])
    
    # Memory estimation (rough approximation)
    memory_estimate = total_params * 4 / (1024 * 1024)  # 4 bytes per parameter, convert to MB
    stats_table.add_row(["Estimated Model Size", f"{memory_estimate:.2f} MB"])
    
    print(stats_table)

# Use the function
create_model_table(model)


=== Model Architecture ===
+-------------+-------------+-----------------------+------------+--------------+
|    Layer    |     Type    |      Output Shape     | Parameters | Connected to |
+-------------+-------------+-----------------------+------------+--------------+
|     lstm    |     LSTM    | 128*2 (bidirectional) |  935,936   |    Input     |
| fc_layers_0 |    Linear   |          128          |   32,896   |     lstm     |
| fc_layers_1 | BatchNorm1d |     Same as input     |    256     | fc_layers_0  |
| fc_layers_2 |     ReLU    |     Same as input     |     0      | fc_layers_1  |
| fc_layers_3 |   Dropout   |     Same as input     |     0      | fc_layers_2  |
| fc_layers_4 |    Linear   |           64          |   8,256    | fc_layers_3  |
| fc_layers_5 | BatchNorm1d |     Same as input     |    128     | fc_layers_4  |
| fc_layers_6 |     ReLU    |     Same as input     |     0      | fc_layers_5  |
| fc_layers_7 |   Dropout   |     Same as input     |     0      | fc_