# Anti Money Laundering Python Notebook

In [None]:
import pyantiML

import pandas as pd
import gc
from pathlib import Path
import multiprocessing
import numpy as np

In [None]:
# Function: Turns Full Transactions CSV into Yearly CSVs
def full_to_years_csv(full_pd):
    for year in range(min(full_pd['TIMESTAMP']).year,max(full_pd['TIMESTAMP']).year+1):
        year_pd = full_pd[full_pd['TIMESTAMP'].dt.year == year]
        full_pd[(full_pd['TIMESTAMP']>max(year_pd['TIMESTAMP'])) & (full_pd['TIMESTAMP']<=max(year_pd['TIMESTAMP'])+pd.Timedelta(days=7))]
        year_pd.to_csv(f'{year}.csv', sep = '|', index = False)
    return 1

In [None]:
# Function: Turns Yearly CSVs to Monthly CSVs and appends 7 days of data from the next month
def years_to_months_csv(csv_year_list):
    for i in range(0, len(csv_year_list)):
        csv_year_pd =  pd.read_csv(csv_year_list[i], sep = '|')
        csv_year_pd['TIMESTAMP'] = pd.to_datetime(csv_year_pd['TIMESTAMP'], infer_datetime_format=True, errors='coerce')

        for month in range(1,13):
            print(f'{csv_year_list[i]}file {month}month')
            month_pd = csv_year_pd[csv_year_pd['TIMESTAMP'].dt.month == month]

            if len(month_pd) == 0:
                print(f'{csv_year_list[i]} file {month} month has no length!')
                continue
            #max_val = max(month_pd['TIMESTAMP'])
            #print(f'Max Time stamp PD: {max_val}')
            
            month_pd = month_pd.append(csv_year_pd[(csv_year_pd['TIMESTAMP']>max(month_pd['TIMESTAMP']))&(csv_year_pd['TIMESTAMP']<=max(month_pd['TIMESTAMP'])+pd.Timedelta(days=7))])
            
            month_pd = month_pd[month_pd['RECEIVER'] != month_pd['SENDER']]
            month_pd.to_csv(''.join([str(csv_year_list[i])[0:4],'_',str(month),'.csv']), sep = '|', index = False)


            if (i == len(csv_year_list)-1) & (month == 12):

                month_pd = csv_year_pd[csv_year_pd['TIMESTAMP'].dt.month == 12]       
                month_pd = month_pd[month_pd['RECEIVER']!=month_pd['SENDER']]
                month_pd.to_csv(''.join([csv_year_list[i][0:4],'_',month,'.csv']), sep = '|', index = False)
                continue

            if month == 12:
                month_pd = csv_year_pd[csv_year_pd['TIMESTAMP'].dt.month == 12]
                month_pd = month_pd[month_pd['RECEIVER'] != month_pd['SENDER']]

                next_csv_year_pd =  pd.read_csv(csv_year_list[i+1], sep = '|')
                next_csv_year_pd['TIMESTAMP'] = pd.to_datetime(next_csv_year_pd['TIMESTAMP'], infer_datetime_format=True, errors='coerce')
                if len(month_pd) == 0:
                    continue
                month_pd = month_pd.append(next_csv_year_pd[(next_csv_year_pd['TIMESTAMP']>max(month_pd['TIMESTAMP']))&(next_csv_year_pd['TIMESTAMP']<=max(month_pd['TIMESTAMP'])+pd.Timedelta(days=7))])
                month_pd = month_pd[month_pd['RECEIVER'] != month_pd['SENDER']]
                month_pd.to_csv(''.join([str(csv_year_list[i])[0:4],'_',str(month),'.csv']), sep = '|', index = False)
    return 1 

In [None]:
#Executes Full to Yearly and cleans up memory
full_pd = pd.read_csv('transactions_full.csv', sep = '|')
full_pd['TIMESTAMP'] = pd.to_datetime(full_pd['TIMESTAMP'], infer_datetime_format=True, errors='coerce')
full_to_years_csv(full_pd)
del full_pd
#del month_pd
gc.collect()

In [None]:
#Executes Yearly to Monthly
path = Path('./')
csv_year_list=sorted(path.rglob('20[0-2][0-9].csv'))
years_to_months_csv(csv_year_list)

#Delete Files:
for f in path.rglob('20[0-2][0-9].csv'):
    f.unlink()
with Path('./transactions_full.csv') as f:
    f.unlink()

# Execute Main ML Search in Parallel Processes:

In [None]:
for itter_year in path.rglob('20[0-2][0-9]_*.csv'):
    print(f'Starting ML detection for year {str(itter_year)}')
    myantiml = pyantiML.antiML.from_csv(itter_year)
    gc.collect()
    pool = multiprocessing.Pool(12)
    random_ids = myantiml.unique_ids
    split_nparr_unique_ids = np.array_split(random_ids, 12)
    result = pool.map(myantiml.sus_bridges_pd,split_nparr_unique_ids)
    del myantiml
    gc.collect()
    pool.close()
    pool.join()
    print(result)
    itter_year.unlink()

In [None]:
df_list = []
for x in path.rglob('temp*.csv'):
    print(x)
    df_list.append(pd.read_csv(x, header = None))
    x.unlink()
result_pd = pd.DataFrame({'FROM':pd.concat(df_list)[0],'FROM_TRANS':pd.concat(df_list)[1],'Suspected Bridge':pd.concat(df_list)[2],'TO_TRANS':pd.concat(df_list)[3],'TO':pd.concat(df_list)[4]})

# This Displays All Suspected Bridges with Associated Transactions

In [27]:
# Duplicates must be dropped because monthly overlap at the end of the month
result_pd = result_pd.drop_duplicates()
result_pd

Unnamed: 0,FROM,FROM_TRANS,Suspected Bridge,TO_TRANS,TO
0,ID00001692197092,64310d0c110b7c187fab75ca3878262ae118d24fbd0143...,ID00001692996528,5d3ede6f0de33550c7dfb00e9dc0e0dab4d4535bc9e568...,ID00001692193100
1,ID00001692269656,bb04004be11783789e2df23ea0cccfbe3449eca7c41a7c...,ID00001692334453,d449164734bc7a8aba38f45d7ff58ed986a44406ac342c...,ID00001692269656
0,ID00001693068719,6a21abb2217843e88fe1f2b3bf7ee9c7f864fafa7c945e...,ID00001693034869,b9c89c8a6c0750ad6b3c5da6119419df9a80950c901e54...,ID00001693036867
1,ID00001693050006,89ae28574a478b1b958b415bfa7aa3b55c840f277232fb...,ID00001693034869,2e7e4b88c7e03ca01181ea031c7aff4b8d2ccc9e8be982...,ID00001693069002
2,ID00001692196479,e33eabfe0a8a3ab32fb98c1a938d24247eae1fd94efc7c...,ID00001693034869,b9c89c8a6c0750ad6b3c5da6119419df9a80950c901e54...,ID00001693036867
...,...,...,...,...,...
131,ID00001692272382,edc84174a6166911e83889f9cca776f18c366f8112e097...,ID00001692857736,79427bd94084df82a1d9dc6383a7ec059db216827bc3fe...,ID00001692357908
132,ID00001692272382,edc84174a6166911e83889f9cca776f18c366f8112e097...,ID00001692857736,57077d07b23f574270a7620a90e2aac6cbaeb32f3663d7...,ID00001692357908
133,ID00001692272382,edc84174a6166911e83889f9cca776f18c366f8112e097...,ID00001692857736,73592d621429dafe9d0c8bac98b1ef11f0e40cfe42b66b...,ID00001692357908
134,ID00001692272382,edc84174a6166911e83889f9cca776f18c366f8112e097...,ID00001692857736,d2b34a72536db1b256c2e104c6d577dd0afb79ddae6024...,ID00001692357908


# Send all suspicious transactions to csv file

In [None]:
pd.concat([result_pd['FROM_TRANS'],result_pd['TO_TRANS']]).drop_duplicates().to_csv('suspicious_transactions.csv',index=False, header=False)

# Send all suspicious entities ranked by number of suspicious transactions to csv file

In [159]:
final_sus_entities = pd.DataFrame(pd.concat([result_pd[['FROM_TRANS','Suspected Bridge']].drop_duplicates()['Suspected Bridge'].value_counts(),
result_pd[['FROM','FROM_TRANS']].drop_duplicates()['FROM'].value_counts(),
result_pd[['TO','TO_TRANS']].drop_duplicates()['TO'].value_counts()]).sort_values(ascending=False))

final_sus_entities['Entities'] = final_sus_entities.index
final_sus_entities['Entities'].to_csv('suspicious_entities.csv', header = False, index=False)
final_sus_entities

Unnamed: 0,0,Entities
ID00001692189152,225960,ID00001692189152
ID00001692272382,109308,ID00001692272382
ID00001692199091,82800,ID00001692199091
ID00001692306657,80928,ID00001692306657
ID00001692190076,79008,ID00001692190076
...,...,...
ID00001693066303,12,ID00001693066303
ID00001692662738,12,ID00001692662738
ID00001692957534,12,ID00001692957534
ID00001692469119,12,ID00001692469119


# Entities most frequently engaged in bridging transactions

In [160]:
count_sus = result_pd[['FROM','Suspected Bridge','TO']].groupby(['FROM','Suspected Bridge','TO']).size().reset_index(name = 'Count').sort_values('Count',ascending=False)
count_sus

Unnamed: 0,FROM,Suspected Bridge,TO,Count
12354,ID00001692272382,ID00001692199091,ID00001692189152,538788
21876,ID00001692588632,ID00001692186081,ID00001692309026,315432
21900,ID00001692588632,ID00001692186081,ID00001692346854,304032
20647,ID00001692488287,ID00001692307432,ID00001692308087,171348
21871,ID00001692588632,ID00001692186081,ID00001692187791,168900
...,...,...,...,...
26190,ID00001692659756,ID00001692189152,ID00001692208880,12
26189,ID00001692659754,ID00001692306657,ID00001693044681,12
26188,ID00001692659754,ID00001692306657,ID00001692658858,12
26187,ID00001692659754,ID00001692306657,ID00001692658653,12
