# [Web3 Phishing Scammer Detection Competition](https://www.kaggle.com/competitions/forta-protect-web3) Neural Network Models
Inspired by the Benchmark Model: EasyEnsemble with One-Order Neighbors' Features

In [2]:
from psutil import cpu_count
import os

import dask.dataframe as dd
from dask.distributed import Client

import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
client = Client()
client

In [32]:
# Get transaction data
data_dir = '../input/forta-protect-web3'


TX_DTYPES = {'value': 'object'}
COLUMNS = ['nonce', 'to_address', 'from_address', 'value', 'block_number', 'gas', 'gas_price', 'max_fee_per_gas', 'max_priority_fee_per_gas', 'transaction_type']

train_dfs = dd.read_csv(f'{data_dir}/eoa_tx_train/eoa_tx_train/*.csv', dtype=TX_DTYPES, usecols=COLUMNS)
test_dfs = dd.read_csv(f'{data_dir}/eoa_tx_test/eoa_tx_test/*.csv', dtype=TX_DTYPES, usecols=COLUMNS)

In [33]:
# Get training ground truth values and ids
train_ids = pd.read_csv(f'{data_dir}/train.csv')
test_ids = pd.read_csv(f'{data_dir}/test.csv')

In [None]:
train_dfs.head(5)

In [None]:
train_dfs.describe()

In [34]:
first_tx_train = pd.to_datetime(train_ids['first_tx_timestamp'])
last_tx_train = pd.to_datetime(train_ids['last_tx_timestamp'])
account_age_train = last_tx_train - first_tx_train
first_tx_test = pd.to_datetime(test_ids['first_tx_timestamp'])
last_tx_test = pd.to_datetime(test_ids['last_tx_timestamp'])
account_age_test = last_tx_test - first_tx_test

In [35]:
# add account age to training ground truth set
train_ids['account_age']=account_age_train.astype('timedelta64[D]')
test_ids['account_age']=account_age_test.astype('timedelta64[D]')

Convert account age to days

In [None]:
train_ids.head(5)

In [36]:
test_ids.head(5)

Unnamed: 0,address,first_tx_timestamp,last_tx_timestamp,Id,account_age
0,0x8808273c346152c4df2ce49a3299079c87215298,2021-09-01 06:27:20 UTC,2022-05-26 00:38:09 UTC,1,266.0
1,0xec203218855498cb08d47ec6f585c9c263d2663c,2021-04-25 13:53:18 UTC,2022-03-10 11:54:40 UTC,2,318.0
2,0x849c93a693d2c1e81daa40723199ece968cce590,2022-02-02 04:55:54 UTC,2022-02-14 23:50:36 UTC,3,12.0
3,0x5586e22d0c78735e769c25a56ef46116ccac37e5,2022-04-30 16:45:01 UTC,2022-05-12 09:11:34 UTC,4,11.0
4,0x678155075c95608d60b472e77a4616bb91371156,2022-02-23 05:34:06 UTC,2022-06-03 18:31:34 UTC,5,100.0


In [None]:
train_ids['account_age'].describe()

Create visualization to explore extent to which account_age may contribute to EOA phisher status.

In [None]:
f, ax = plt.subplots(figsize=(7, 5))
sns.despine(f)

sns.histplot(
    train_ids,
    x="account_age", hue="target",
    bins=7,
    multiple="stack",
    log_scale=(False,True),
)

It doesn't look like the account age is concentrated at any particular range enough to make a significant difference in the classification. Let's confirm that by building a Logistic Regression model.

In [None]:
X = train_ids['account_age']
y = train_ids['target']
X = X.values
X = X.reshape(-1,1)
print(X.shape)
print(X.dtype)
print(y.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

log_reg = LogisticRegression(random_state=42, class_weight='balanced')  #balanced to account for the large class imbalance
log_reg.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score

cross_val_score(log_reg, X_train, y_train, cv=3, scoring="accuracy")

Indeed, the accuracy of this model is quite low. Let's keep this feature and continue building out a robust model with many more features.

In [30]:
def generate_features(tx_df, labels):
    # Convert value from WEI to ETH and gas prices to GWEI
    tx_df['value_eth'] = tx_df['value'].str.zfill(18).astype(float) / 1e18
    tx_df['gas_price_gwei'] = tx_df['gas_price'] / 1e9
    tx_df['max_fee_per_gas_gwei'] = tx_df['max_fee_per_gas'] / 1e9
    tx_df['max_priority_fee_per_gas_gwei'] = tx_df['max_priority_fee_per_gas'] / 1e9
    incoming = tx_df.merge(labels, how='left', left_on='to_address', right_on='address')
    incoming = incoming[incoming['address'].notnull()].compute()
    
    incoming_agg = incoming.groupby('to_address').agg({'block_number': 'std', 
                                                       'from_address': ['count', 'nunique'],
                                                       'value_eth': ['max', 'mean', 'min', 'sum']}).reset_index()
    incoming_agg.columns = ["_".join(filter(None, name)) for name in incoming_agg.columns.to_flat_index()]
    # in_block_number_std: standard deviation of blockNumber of all incoming transactions for an EOA
    # in_value_max: maximum value of all incoming transactions for an EOA
    # in_value_mean: mean value of all incoming transactions for an EOA
    # in_value_min: minimum value of all incoming transactions for an EOA
    # in_value_sum: sum of all incoming transactions for an EOA
    # from_address_count: total incoming transaction count for an EOA
    # from_address_nunique: total incoming transaction from unique address for an EOA
    incoming_agg = incoming_agg.rename(columns={'block_number_std': 'in_block_number_std',
                                                'value_eth_max': 'in_value_max',
                                                'value_eth_mean': 'in_value_mean',
                                                'value_eth_min': 'in_value_min',
                                                'value_eth_sum': 'in_value_sum'})
    # from_address_count_unique_ratio: of all incoming transactions, what ratio is unique?
    incoming_agg['from_address_count_unique_ratio'] = incoming_agg['from_address_nunique'] / incoming_agg['from_address_count']
    # for std with 1 transaction, fill na with 0
    # https://stackoverflow.com/questions/32130954/pandas-standard-deviation-returns-nan
    incoming_agg['in_block_number_std'] = incoming_agg['in_block_number_std'].fillna(0)
    incoming_2 = incoming.merge(incoming_agg, how='left', left_on='to_address', right_on='to_address')
    incoming_2['transaction_type'] = incoming_2['transaction_type'].fillna(3)
    incoming_2 = incoming_2.drop(['value', 'gas_price', 'max_fee_per_gas', 'max_priority_fee_per_gas', 'from_address', 'first_tx_timestamp', 'last_tx_timestamp', 'address', 'account_age'], axis=1)
    dt = labels.merge(incoming_2, how='left', left_on='address', right_on='to_address')
    dt = dt.drop(['to_address', 'first_tx_timestamp', 'last_tx_timestamp'], axis=1)
    dt = dt.fillna(0)
    
    return dt

In [None]:
train_data = generate_features(train_dfs, train_ids)

In [None]:
train_data

In [None]:
train_data.info()

In [None]:
train_data.describe()

In [None]:
os.remove("/kaggle/working/train_data.csv") #remove legacy train_data.csv file

In [None]:
train_data.to_csv('train_data.csv',index=False) #save file to disk

In [8]:
train_data = pd.read_csv("/kaggle/working/train_data.csv") #add train_data.csv file to workspace

Setup function to extract and prepare data for model training.

In [8]:
def get_features_and_labels(features, no_labels=False):
    EXCLUDE_COLS = {'target_x', 'target_y', 'target', 'address', 'Id_x', 'Id_y', 'Id'}
    x_cols = [col for col in features.columns if col not in EXCLUDE_COLS]

    X = features[x_cols] 
    y = None if no_labels else features['target']
    
    #convert to numpy arrays
    X = X.to_numpy() 
    y = y if no_labels else y.to_numpy()
    
    return X, y

In [10]:
x_train, y_train = get_features_and_labels(train_data)

In [None]:
x_train.shape

In [None]:
y_train.shape

In [11]:
# Create a Normalization layer and set its internal state using the training data
normalizer = layers.Normalization()
normalizer.adapt(x_train)
input_shape = x_train.shape[1:]

2022-11-08 21:28:52.221494: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [12]:
METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
      keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]
EPOCHS = 20
BATCH_SIZE = 2048

In [13]:
# Create a model that includes the normalization layer
inputs = keras.Input(shape=input_shape)
x = normalizer(inputs)
y = layers.Dense(16, activation='relu')(x)
z = keras.layers.Dropout(0.5)(y)
outputs = layers.Dense(1, activation='sigmoid')(z)
model_a = keras.Model(inputs, outputs)

In [14]:
model_a.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 17)]              0         
_________________________________________________________________
normalization (Normalization (None, 17)                35        
_________________________________________________________________
dense (Dense)                (None, 16)                288       
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 340
Trainable params: 305
Non-trainable params: 35
_________________________________________________________________


In [15]:
model_a.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), 
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=METRICS)

In [16]:
model_a.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7ef741206b10>

While the accuracy and precision may look nice, the recall is abysmal. This is likely due to the massive class imbalance between accounts identified as phishers (positive cases) and those that were not (negative cases). Let's examine this class imbalance.

In [17]:
neg, pos = np.bincount(train_data['target'])
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

Examples:
    Total: 5378274
    Positive: 113029 (2.10% of total)



There are only 113,029 examples of phishing accounts in this corpus, or 2.10%. Let's set an accurate bias initialization to account for this imbalance per https://www.tensorflow.org/tutorials/structured_data/imbalanced_data

In [18]:
initial_bias = np.log([pos/neg])

In [19]:
output_bias = tf.keras.initializers.Constant(initial_bias)

#make the model again with the bias in the output layer
inputs = keras.Input(shape=input_shape)
x = normalizer(inputs)
y = layers.Dense(16, activation='relu')(x)
z = keras.layers.Dropout(0.5)(y)
outputs = layers.Dense(1, activation='sigmoid', bias_initializer=output_bias)(z)
model_b = keras.Model(inputs, outputs)

In [20]:
model_b.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 17)]              0         
_________________________________________________________________
normalization (Normalization (None, 17)                35        
_________________________________________________________________
dense_2 (Dense)              (None, 16)                288       
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 340
Trainable params: 305
Non-trainable params: 35
_________________________________________________________________


Let's also weight the positive samples more heavily so the classifier pays more attention to this class.

In [21]:
# Scaling by total/2 helps keep the loss to a similar magnitude.
# The sum of the weights of all examples stays the same.
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Weight for class 0: 0.51
Weight for class 1: 23.79


In [22]:
model_b.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), 
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=METRICS)
model_b.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, 
          shuffle=True, class_weight=class_weight)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7ef74127e210>

Let's try with no bias in the model.

In [23]:
inputs = keras.Input(shape=input_shape)
x = normalizer(inputs)
y = layers.Dense(16, activation='relu')(x)
z = keras.layers.Dropout(0.5)(y)
outputs = layers.Dense(1, activation='sigmoid')(z)
model_c = keras.Model(inputs, outputs)

In [24]:
model_c.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), 
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=METRICS)
model_c.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, 
          shuffle=True, class_weight=class_weight)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7ef73d64ffd0>

Let's try with no dropout and no bias; only the class weighting.

In [25]:
inputs = keras.Input(shape=input_shape)
x = normalizer(inputs)
y = layers.Dense(16, activation='relu')(x)
# z = keras.layers.Dropout(0.5)(y)
outputs = layers.Dense(1, activation='sigmoid')(y)
model_d = keras.Model(inputs, outputs)

In [26]:
model_d.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), 
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=METRICS)
model_d.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, 
          shuffle=True, class_weight=class_weight)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7ef733f8a290>

In [63]:
model_a.save('my_model_a.h5')
model_b.save('my_model_b.h5')
model_c.save('my_model_c.h5')
model_d.save('my_model_d.h5')

In [3]:
from keras.models import load_model

# returns a compiled model
# identical to the previous one
model_a = load_model('my_model_a.h5')
model_b = load_model('my_model_b.h5')
model_c = load_model('my_model_c.h5')
model_d = load_model('my_model_d.h5')

In [4]:
model_b.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 17)]              0         
_________________________________________________________________
normalization (Normalization (None, 17)                35        
_________________________________________________________________
dense_2 (Dense)              (None, 16)                288       
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 340
Trainable params: 305
Non-trainable params: 35
_________________________________________________________________


Prepare test submission

In [38]:
test_data = generate_features(test_dfs, test_ids)

In [5]:
test_data = pd.read_csv("/kaggle/working/test_data.csv") #load test_data.csv file from workspace

In [6]:
test_data

Unnamed: 0,address,Id_x,account_age,nonce,gas,block_number,transaction_type,value_eth,gas_price_gwei,max_fee_per_gas_gwei,max_priority_fee_per_gas_gwei,Id_y,in_block_number_std,from_address_count,from_address_nunique,in_value_max,in_value_mean,in_value_min,in_value_sum,from_address_count_unique_ratio
0,0x8808273c346152c4df2ce49a3299079c87215298,1,266.0,1074230,207128,13142587,3.0,0.033500,143.000000,0.000000,0.0,1.0,684032.567655,3,3,0.311684,0.117678,0.007850,0.353034,1.000000
1,0x8808273c346152c4df2ce49a3299079c87215298,1,266.0,1,21000,14325154,2.0,0.007850,23.503169,32.577411,1.5,1.0,684032.567655,3,3,0.311684,0.117678,0.007850,0.353034,1.000000
2,0x8808273c346152c4df2ce49a3299079c87215298,1,266.0,1035871,207128,13138175,3.0,0.311684,123.000000,0.000000,0.0,1.0,684032.567655,3,3,0.311684,0.117678,0.007850,0.353034,1.000000
3,0xec203218855498cb08d47ec6f585c9c263d2663c,2,318.0,2143,60000,12487321,3.0,0.002460,41.000000,0.000000,0.0,2.0,92257.860317,3,1,0.003300,0.002520,0.001800,0.007560,0.333333
4,0xec203218855498cb08d47ec6f585c9c263d2663c,2,318.0,2003,60000,12309851,3.0,0.003300,55.000000,0.000000,0.0,2.0,92257.860317,3,1,0.003300,0.002520,0.001800,0.007560,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1828440,0x5616931a6e5f41f47dbc49fbda2bb16d88e246a2,131719,10.0,2828408,207128,14709615,2.0,0.010640,39.444118,120.000000,2.0,131719.0,26968.994303,5,4,0.191607,0.104463,0.009658,0.522314,0.800000
1828441,0x5616931a6e5f41f47dbc49fbda2bb16d88e246a2,131719,10.0,3537563,207128,14655033,2.0,0.175734,85.180972,120.000000,2.0,131719.0,26968.994303,5,4,0.191607,0.104463,0.009658,0.522314,0.800000
1828442,0x5616931a6e5f41f47dbc49fbda2bb16d88e246a2,131719,10.0,1198649,207128,14654450,2.0,0.009658,73.871326,120.000000,2.0,131719.0,26968.994303,5,4,0.191607,0.104463,0.009658,0.522314,0.800000
1828443,0x815d0053eac89c3eecb7a88aa1fde511ea3e7605,131720,76.0,42144648,250000,14602223,2.0,0.999679,15.931056,32.110723,1.0,131720.0,325740.052462,2,1,0.999837,0.999758,0.999679,1.999516,0.500000


In [40]:
test_data.to_csv('test_data.csv',index=False) #save file to disk

In [9]:
X_test, y_test = get_features_and_labels(test_data, no_labels=True)

In [10]:
X_test.shape

(1828445, 17)

In [11]:
X_test_a = model_a.predict(X_test)
X_test_b = model_b.predict(X_test)
X_test_c = model_c.predict(X_test)
X_test_d = model_d.predict(X_test)

2022-11-09 15:48:17.735066: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [12]:
X_test_a.shape

(1828445, 1)

In [13]:
X_test

array([[2.66000000e+02, 1.07423000e+06, 2.07128000e+05, ...,
        7.85000000e-03, 3.53034500e-01, 1.00000000e+00],
       [2.66000000e+02, 1.00000000e+00, 2.10000000e+04, ...,
        7.85000000e-03, 3.53034500e-01, 1.00000000e+00],
       [2.66000000e+02, 1.03587100e+06, 2.07128000e+05, ...,
        7.85000000e-03, 3.53034500e-01, 1.00000000e+00],
       ...,
       [1.00000000e+01, 1.19864900e+06, 2.07128000e+05, ...,
        9.65773000e-03, 5.22313520e-01, 8.00000000e-01],
       [7.60000000e+01, 4.21446480e+07, 2.50000000e+05, ...,
        9.99678520e-01, 1.99951576e+00, 5.00000000e-01],
       [7.60000000e+01, 4.36707910e+07, 2.50000000e+05, ...,
        9.99678520e-01, 1.99951576e+00, 5.00000000e-01]])

In [17]:
X_test_d

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [0.],
       [0.]], dtype=float32)

In [35]:
submission_a = pd.DataFrame(np.column_stack([test_data['Id_x'], X_test_a]), columns=['Id', 'Predicted'])
submission_a

Unnamed: 0,Id,Predicted
0,1.0,1.0
1,1.0,1.0
2,1.0,1.0
3,2.0,1.0
4,2.0,1.0
...,...,...
1828440,131719.0,1.0
1828441,131719.0,1.0
1828442,131719.0,1.0
1828443,131720.0,1.0


In [38]:
submission_a = submission_a.groupby('Id').agg({'Predicted': 'mean'}).reset_index()

In [42]:
submission_a['Predicted'].describe()

count    131720.000000
mean          0.999987
std           0.003318
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: Predicted, dtype: float64

In [43]:
submission_a.to_csv('benchmark_submission.csv', index=None)

In [46]:
def process_predictions(predictions, labels):
    submission = pd.DataFrame(np.column_stack([labels['Id_x'], predictions]), columns=['Id', 'Predicted'])
    submission = submission.groupby('Id').agg({'Predicted': 'mean'}).reset_index()
    submission.to_csv('benchmark_submission.csv', index=None)
    
    return submission

In [47]:
submission_a = process_predictions(X_test_a, test_data)

In [45]:
os.remove("/kaggle/working/benchmark_submission.csv")

In [77]:
test_sub['predicted'] = X_test_a.reshape(-1,1)

ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

In [None]:
# group by Id, avg predicted

## Areas of Improvement

1. Test with validation dataset
2. Save training history for different models and visualize differences
3. Try out different network architectures
4. Try out different combinations of class weights, biases, and learning rates
5. Try oversampling the positive cases and training on this data
6. Try adding features from other datasets
7. Remove EOA's from the data