### Data Prep and Training

#### Step 1: Load the Data

In [1]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

# Read the three filtered inputs. The features file is read with header=None,
# so its columns start out as integer positions and are labelled just below.
classes_df = pd.read_csv('../data/filtered/filtered_classes.csv')
edgelist_df = pd.read_csv('../data/filtered/filtered_edgelist.csv')
features_df = pd.read_csv(
    '../data/filtered/filtered_features.csv',
    header=None,
)

# Label the first column 'txId' and number the remaining ones
# feature_1 .. feature_N, where N is the column count minus one.
n_feature_cols = features_df.shape[1] - 1
feature_names = [f'feature_{idx}' for idx in range(1, n_feature_cols + 1)]
features_df.columns = ['txId', *feature_names]

# Join each feature row with its class on the shared 'txId' key.
# No `how` is given, so pandas performs its default inner join and rows
# without a match on both sides are dropped.
data = features_df.merge(classes_df, on='txId')

# Preview the merged frame (last expression, so the notebook renders it).
data.head()

Unnamed: 0,txId,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_158,feature_159,feature_160,feature_161,feature_162,feature_163,feature_164,feature_165,feature_166,class
0,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792,2
1,232029206,1,-0.005027,0.578941,-0.091383,4.380281,-0.063725,4.667146,0.851305,-0.163645,...,-0.613614,0.241128,0.241406,0.60412,0.008632,-0.131155,0.333211,-0.120613,-0.119792,2
2,232344069,1,-0.147852,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.137933,...,-0.613614,0.241128,0.241406,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,2
3,27553029,1,-0.151357,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.141519,...,-0.582077,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,2
4,3881097,1,-0.172306,-0.184668,-1.201369,0.028105,-0.043875,-0.02914,0.242712,-0.16364,...,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.084674,-0.05445,-1.760926,-1.760984,2


#### Step 2: Split the Data into Training and Test Sets

In [2]:
# The 'class' column is the label; every other column except the 'txId'
# identifier is used as a model input.
y = data['class']
X = data.drop(columns=['class', 'txId'])

# Hold out 20% of the rows for testing. stratify=y keeps the class ratio
# (and therefore the share of fraudulent nodes) the same in both splits,
# and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the normalised class frequencies of each split.
for heading, split_labels in (
    ("Training set class distribution:", y_train),
    ("\nTest set class distribution:", y_test),
):
    print(heading)
    print(split_labels.value_counts(normalize=True))


Training set class distribution:
class
2    0.902392
1    0.097608
Name: proportion, dtype: float64

Test set class distribution:
class
2    0.902395
1    0.097605
Name: proportion, dtype: float64


#### Step 3: Verify the Split

In [3]:
# Normalised class frequencies of the training labels. The result holds one
# proportion per class value, so the fraud ratio is one entry of this Series.
train_fraud_ratio = y_train.value_counts(normalize=True)
print("Training set fraud ratio:", train_fraud_ratio, sep="\n")

# Same computation for the held-out test labels; the two printouts should
# show near-identical proportions if the stratified split worked.
test_fraud_ratio = y_test.value_counts(normalize=True)
print("\nTest set fraud ratio:", test_fraud_ratio, sep="\n")

Training set fraud ratio:
class
2    0.902392
1    0.097608
Name: proportion, dtype: float64

Test set fraud ratio:
class
2    0.902395
1    0.097605
Name: proportion, dtype: float64


#### Step 4: Save the Training and Test Sets (Optional)

In [5]:
# Save the training and test sets to CSV files in the splits directory.
# to_csv does not create missing parent folders, so make sure the target
# directory exists first; exist_ok=True keeps this cell safe to re-run.
splits_dir = Path('../data/splits')
splits_dir.mkdir(parents=True, exist_ok=True)

# index=False leaves the original row index out of the files, so a features
# file and its labels file correspond by row position only.
X_train.to_csv(splits_dir / 'training_features.csv', index=False)
y_train.to_csv(splits_dir / 'training_labels.csv', index=False)
X_test.to_csv(splits_dir / 'test_features.csv', index=False)
y_test.to_csv(splits_dir / 'test_labels.csv', index=False)