In [1]:
import cudf as cdf
import pandas as pd
from sklearn.model_selection import train_test_split
import itertools as it
import numpy as np
import os

### Load the data, merge both files, and preprocess

In [2]:
# Read both transaction files and stack them into a single frame.
df = pd.read_csv('LI-Small_Trans.csv')
df2 = pd.read_csv('HI-Small_Trans.csv')

# Build one identifier per endpoint by concatenating the bank code (as text)
# with the account string, for the sending and the receiving side.
merged = pd.concat([df, df2]).assign(
    Sending=lambda frame: frame["From Bank"].astype(str) + frame['Account'],
    Receiving=lambda frame: frame["To Bank"].astype(str) + frame['Account.1'],
)

# The four raw bank/account columns are now redundant; drop them and persist
# the result (the frame's index is written as the first CSV column).
processed = merged.drop(columns=['From Bank', 'Account', 'To Bank', 'Account.1'])
processed.to_csv("merged_processed.csv")

### Take a stratified 1% sample of the data and form training and test sets

In [2]:
# Draw 1% of the rows from each 'Is Laundering' class so the sample keeps the
# class proportions of the full data set.
# - Sample from `processed` (raw bank/account columns already dropped) rather
#   than `merged`, so those raw columns do not end up in the feature files.
# - GroupBy.sample replaces groupby(...).apply(lambda x: x.sample(...)), which
#   operates on the grouping column and is deprecated in recent pandas.
# - A fixed random_state makes the sample reproducible across kernel restarts.
sample = processed.groupby('Is Laundering').sample(frac=0.01, random_state=100)
sample.to_csv("sample_merged_processed.csv")

In [3]:
# Separate the label column from the feature columns.
X = sample.drop(columns=['Is Laundering'])
y = sample[['Is Laundering']]

# Hold out 10% of the sample for testing. Stratifying on the label keeps the
# class ratio the same in both splits; without it the ratio of the sampled
# classes can drift between the training and the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=100,
    stratify=y['Is Laundering'],
)

In [4]:
# Persist every split to its own CSV file (the index is written as well).
for csv_name, split_frame in (
    ("sample_X_train.csv", X_train),
    ("sample_y_train.csv", y_train),
    ("sample_X_test.csv", X_test),
    ("sample_y_test.csv", y_test),
):
    split_frame.to_csv(csv_name)

### Get edge lists of both sets

In [9]:
def save_edges_to_csv(edge_list, name):
    """Write an edge list to a CSV file, without duplicate rows.

    Args:
        edge_list: Records handed to ``cdf.DataFrame`` (the callers in this
            notebook pass a list of ``{'source': ..., 'target': ...}`` dicts).
        name: Path of the CSV file to write.

    The frame built from ``edge_list`` is printed before de-duplication, and
    the CSV is written without the index column.
    """
    all_edges = cdf.DataFrame(edge_list)
    print(all_edges)
    all_edges.drop_duplicates().to_csv(name, index=False)

def get_in_common(X):
    """Group the rows of ``X`` by the accounts they involve.

    Args:
        X: Frame with 'Sending' and 'Receiving' columns (a cuDF DataFrame in
            this notebook, since ``to_pandas`` is called on the result of
            ``unique``).

    Returns:
        A list with one NumPy array per distinct account. Each array holds the
        index labels of the rows in which that account appears as sender or
        receiver.
    """
    senders = X['Sending'].unique()
    receivers = X['Receiving'].unique()
    # An account can show up on both sides, so de-duplicate the union again.
    accounts = senders.append(receivers).unique().to_pandas()

    # NOTE: every account triggers a full scan of X.
    return [
        X[(X['Sending'] == account) | (X['Receiving'] == account)].index.to_numpy()
        for account in accounts
    ]

In [10]:
X_train = cdf.read_csv("sample_X_train.csv")
X_train_in_common = get_in_common(X_train)

# Connect every pair of transactions that share an account.
# The previous version incremented `i` inside the inner loop, but `i` was also
# the outer loop index, so `common[i]` shifted on every iteration and only
# consecutive pairs were produced. itertools.combinations yields each
# unordered pair exactly once, and the flush check uses the buffer length
# instead of a hand-maintained counter.
edges = []
dump_num = 1
for common in X_train_in_common:
    for source, target in it.combinations(common, 2):
        edges.append({'source': source, 'target': target})
        # Flush the buffer to a numbered part file once it grows past 15M edges.
        if len(edges) > 15000000:
            save_edges_to_csv(edges, "sample_X_train_edges-pt" + str(dump_num))
            dump_num = dump_num + 1
            edges = []
# Write whatever is left in the buffer as the last part.
save_edges_to_csv(edges, "sample_X_train_edges-pt" + str(dump_num))



In [None]:
X_test = cdf.read_csv("sample_X_test.csv")
X_test_in_common = get_in_common(X_test)

# Connect every pair of test transactions that share an account.
# Two fixes compared with the previous version:
# - iterate over X_test_in_common (it looped over X_train_in_common, so the
#   "test" edge files were built from the training set);
# - the counter no longer reuses the loop index `i`, which had shifted
#   `common[i]` on every inner iteration so only consecutive pairs were
#   emitted. itertools.combinations yields each unordered pair exactly once.
edges = []
dump_num = 1
for common in X_test_in_common:
    for source, target in it.combinations(common, 2):
        edges.append({'source': source, 'target': target})
        # Flush the buffer to a numbered part file once it grows past 15M edges.
        if len(edges) > 15000000:
            save_edges_to_csv(edges, "sample_X_test_edges-pt" + str(dump_num))
            dump_num = dump_num + 1
            edges = []
# Write whatever is left in the buffer as the last part.
save_edges_to_csv(edges, "sample_X_test_edges-pt" + str(dump_num))

### Use PecanPy to learn node embeddings of each transaction in each set

In [6]:
# Run the PecanPy command-line tool once per set. Only the first edge-list
# part ("-pt1") of each set is passed as input.
for edge_file, embedding_file in (
    ("sample_X_train_edges-pt1", "X_train_embeddings"),
    ("sample_X_test_edges-pt1", "X_test_embeddings"),
):
    exit_status = os.system(
        "pecanpy --input " + edge_file
        + " --output " + embedding_file
        + " --mode SparseOTF --implicit_ids --delimiter ,"
    )
    # os.system only reports failure through its return value; stop here so
    # later cells do not run against a missing or stale embedding file.
    if exit_status != 0:
        raise RuntimeError(
            "pecanpy failed for " + edge_file + " (exit status " + str(exit_status) + ")"
        )

0

### Combine the embeddings with their respective sets

In [10]:
def combine_with_embeddings(train_file, embedding_file, target):
    """Attach node embeddings to a feature set and save the result.

    Args:
        train_file: CSV file holding the feature rows (used for both the
            training and the test set in this notebook).
        embedding_file: Space-separated text file whose first line is skipped
            and whose first column is used as the row index (presumably the
            PecanPy output -- confirm the format if the tool changes).
        target: Path of the CSV file the combined frame is written to.

    Returns:
        The combined DataFrame: the feature rows left-joined with the
        embedding columns on the row index, missing values replaced by 0, and
        the bookkeeping/identifier columns removed.
    """
    X = pd.read_csv(train_file)
    embed = pd.read_csv(embedding_file, sep=" ", header=None, index_col=0, skiprows=1)

    # Left join keeps every feature row; rows without an embedding get zeros.
    X_embeddings = pd.merge(
        X, embed, left_index=True, right_index=True, how="left"
    ).replace(np.nan, 0)

    # The 'Unnamed: ...' columns are saved indexes picked up by read_csv; how
    # many exist depends on how often the data was round-tripped through CSV.
    # errors="ignore" drops whichever are present instead of raising KeyError
    # when one is absent.
    X_embeddings = X_embeddings.drop(
        columns=['Unnamed: 0', 'Unnamed: 0.1', 'Timestamp', 'Sending', 'Receiving'],
        errors="ignore",
    )
    X_embeddings.to_csv(target)
    return X_embeddings

In [11]:
# Join each set with its embeddings and write the combined frame to disk.
X_train_embeddings = combine_with_embeddings(
    train_file="sample_X_train.csv",
    embedding_file="X_train_embeddings",
    target='X_train_with_embeddings.csv',
)
X_test_embeddings = combine_with_embeddings(
    train_file="sample_X_test.csv",
    embedding_file="X_test_embeddings",
    target='X_test_with_embeddings.csv',
)

  if (await self.run_code(code, result,  async_=asy)):
