In [41]:
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
import scipy.sparse as sp

import warnings
# Globally silence every warning category (no message/category/module filter is given).
# NOTE(review): this also hides pandas / scikit-learn deprecation and data-quality
# warnings — consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings("ignore")

# ref: https://www.kaggle.com/code/smlopezza/elliptic-data-set-eda-graphs-random-forest

#### Import dataset

In [84]:
# Load the three Elliptic CSVs. The features file is read without a header row,
# so its columns are integer-labelled (0, 1, 2, ...).
df_features = pd.read_csv('data/elliptic_txs_features.csv', header=None)
df_edges = pd.read_csv("data/elliptic_txs_edgelist.csv")
df_classes = pd.read_csv("data/elliptic_txs_classes.csv")

# Recode the string labels as integers: '1' -> 1, '2' -> 0, 'unknown' -> 2.
label_codes = {'unknown': 2, '1': 1, '2': 0}
df_classes['class'] = df_classes['class'].map(label_codes)

# Left-join the labels onto the features (feature column 0 is matched against
# txId), then drop the feature-side join key so only txId remains.
df_merge = (
    df_features
    .merge(df_classes, left_on=0, right_on="txId", how='left')
    .drop(columns=0)
)

# Sanity check: report how many rows repeat an already-seen txId.
n_duplicate_tx = df_merge.duplicated(subset=['txId']).sum()
print("Number of duplicate txId: ", n_duplicate_tx)


Number of duplicate txId:  0


In [85]:
# Preview the merged frame and show the edge-list dimensions.
# Column 1 keeps its integer label; the rename to 'time_step' is currently disabled:
# df_merge.rename(columns={1: 'time_step'}, inplace=True)
display(df_merge.head(), df_edges.shape)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,159,160,161,162,163,164,165,166,txId,class
0,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,-0.167933,...,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,230425980,2
1,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,-0.167948,...,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,5530458,2
2,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,-0.168576,...,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,232022460,2
3,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,-0.115831,...,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792,232438397,0
4,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,0.041399,...,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117,230460314,2


(234355, 2)

In [86]:
# Separate the label vector from the feature matrix; txId is dropped from the
# features together with the label column.
y = df_merge['class']
X = df_merge.drop(columns=['class', 'txId'])

In [87]:
# Row labels of the train / test samples, each read as a headerless
# single-column CSV whose column is named 'id'.
train_idx = pd.read_csv('data/index/train_classified_idx.csv', names=['id'])
test_idx = pd.read_csv('data/index/test_classified_idx.csv', names=['id'])

# Materialise the labels once and reuse them for both X and y.
train_rows = train_idx['id'].tolist()
test_rows = test_idx['id'].tolist()

# Label-based selection on the index shared by X and y.
X_train, y_train = X.loc[train_rows], y.loc[train_rows]
X_test, y_test = X.loc[test_rows], y.loc[test_rows]

In [88]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline (scikit-learn).
# The original paper is cited as using 50 estimators and 50 max features.
# NOTE(review): this notebook trains 100 trees, not 50, so it does NOT exactly
# reproduce the paper's configuration. Set RF_N_ESTIMATORS = 50 to match the paper;
# the value is left at 100 here so existing results are unchanged.
RF_N_ESTIMATORS = 100   # number of trees in the forest
RF_MAX_FEATURES = 50    # features considered when looking for the best split
RF_SEED = 42            # fixed seed so the fitted forest is reproducible

rf = RandomForestClassifier(
    n_estimators=RF_N_ESTIMATORS,
    max_features=RF_MAX_FEATURES,
    random_state=RF_SEED,
)

# Train the classifier
rf.fit(X_train, y_train)

In [89]:
# Score the fitted forest on the test set.
y_pred = rf.predict(X_test)                 # hard class predictions
y_score = rf.predict_proba(X_test)[:, 1]    # second probability column, fed to ROC AUC

# Classification metrics on the test labels.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='binary')
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
rocauc = roc_auc_score(y_test, y_score)

# Labels are padded so the printed values line up in a column.
for label, value in (
    ("Accuracy: ", accuracy),
    ("F1:       ", f1),
    ("Precision:", precision),
    ("Recall:   ", recall),
    ("ROC AUC:  ", rocauc),
):
    print(label, value)

Accuracy:  0.9785242951409718
F1:        0.8145077720207253
Precision: 0.9279811097992916
Recall:    0.7257617728531855
ROC AUC:   0.9110998872619245


In [93]:
# Class counts over the union of the train and test rows.
labelled_rows = train_idx['id'].tolist() + test_idx['id'].tolist()
df_merge.loc[labelled_rows, 'class'].value_counts()

class
0    42019
1     4545
Name: count, dtype: int64