In [1]:
# Connect to Postgres
import os
import psycopg2
def newCursor():
    try:
        connection = psycopg2.connect(user = os.environ["DB_USER"],
                                      host = os.environ["DB_HOST"],
                                      password = os.environ["DB_PASSWORD"],
                                      port = "5432",
                                      database = os.environ["DB_DATABASE"])

        cursor = connection.cursor()
        return cursor

    except (Exception, psycopg2.Error) as error:
        print("Error while connecting to PostgreSQL", error)
        return error

In [2]:
c = newCursor()
q = """
SELECT table_name
  FROM information_schema.tables
 WHERE table_schema='swift'
   AND table_type='BASE TABLE'
"""
c.execute(q)
swift_tables = c.fetchall()
for table in swift_tables:
    print(table[0])

simulated_fin_parsed_hackathon
simulated_data_senders_train_hackathon
simulated_data_senders_test_hackathon


In [3]:
import pandas as pd

c = newCursor()
q = """
SELECT *
FROM swift."simulated_data_senders_train_hackathon"

"""
c.execute(q)
cols = [desc[0] for desc in c.description]
df = pd.DataFrame(c.fetchall(), columns=cols)
df.head()

Unnamed: 0,Transaction Ref,Originator,Sender,Receiver,Beneficiary,Date,Time,Currency,Value,Flag
0,9D7125820D5E,ARPWSTXX,ARPWSTXX,KMZBTCXX,KMZBTCXX,190726,1335,67B,10201,False
1,0D1888782D9A,ARPWSTXX,ARPWSTXX,HUAFSRXX,VLOPPGXX,190429,1259,84D,45806,False
2,D20DA4E657EC,BFNHHRXX,BFNHHRXX,LQGKPNXX,YKMCMCXX,190626,1220,67E,2137157,False
3,AEE73E990B,BCFTZWXX,BCFTZWXX,WGCOPNXX,ZKLYYEXX,191204,915,F8D,406778,False
4,2BEB278530,BSKJPWXX,BSKJPWXX,XDFDCHXX,XEWHINXX,191008,818,41B,65141,False


## 1.0 Install the required libraries
> Category encoders provide functions to convert categoricals

In [4]:
# !pip install category_encoders
# !pip install sklearn
# !pip install imblearn
### below installations not required
# !pip install pycaret[full]

# !pip install xgboost


In [5]:
import pandas as pd
import numpy as np 
from sklearn.metrics import confusion_matrix
import category_encoders as ce
from sklearn.model_selection import train_test_split
import gc
from imblearn.over_sampling import SMOTE
# from imblearn.under_sampling import RandomUnderSampler

In [6]:
df.head()

Unnamed: 0,Transaction Ref,Originator,Sender,Receiver,Beneficiary,Date,Time,Currency,Value,Flag
0,9D7125820D5E,ARPWSTXX,ARPWSTXX,KMZBTCXX,KMZBTCXX,190726,1335,67B,10201,False
1,0D1888782D9A,ARPWSTXX,ARPWSTXX,HUAFSRXX,VLOPPGXX,190429,1259,84D,45806,False
2,D20DA4E657EC,BFNHHRXX,BFNHHRXX,LQGKPNXX,YKMCMCXX,190626,1220,67E,2137157,False
3,AEE73E990B,BCFTZWXX,BCFTZWXX,WGCOPNXX,ZKLYYEXX,191204,915,F8D,406778,False
4,2BEB278530,BSKJPWXX,BSKJPWXX,XDFDCHXX,XEWHINXX,191008,818,41B,65141,False


## 2.0 Create additional features
>We treat the problem as that of binary classification. We'll use sklearn models to train a classifier

In [7]:
df['SR'] = df.Sender+df.Receiver

In [8]:
df['SB']= df.Sender+df.Beneficiary

In [9]:
df['RB']=df.Receiver+df.Beneficiary

In [10]:
df['SC']= df.Sender+df.Currency

In [11]:
df['BC']=df.Beneficiary+df.Currency

> Use the Date and Time fields to create a pythong Datetime field. Using python Datetime fields, we add additional features

In [12]:
df['Datetime'] = df.Date*10000+df.Time
df['Datetime'] = pd.to_datetime(df['Datetime'].astype(str), format='%y%m%d%H%M')
df['mon'] = df.Datetime.dt.month
df['dom'] = df.Datetime.dt.day
df['dow'] = df.Datetime.dt.dayofweek
df['q'] = df.Datetime.dt.quarter
df['h'] = df.Datetime.dt.hour
df['m'] = df.Datetime.dt.minute

In [13]:
df.head()

Unnamed: 0,Transaction Ref,Originator,Sender,Receiver,Beneficiary,Date,Time,Currency,Value,Flag,...,RB,SC,BC,Datetime,mon,dom,dow,q,h,m
0,9D7125820D5E,ARPWSTXX,ARPWSTXX,KMZBTCXX,KMZBTCXX,190726,1335,67B,10201,False,...,KMZBTCXXKMZBTCXX,ARPWSTXX67B,KMZBTCXX67B,2019-07-26 13:35:00,7,26,4,3,13,35
1,0D1888782D9A,ARPWSTXX,ARPWSTXX,HUAFSRXX,VLOPPGXX,190429,1259,84D,45806,False,...,HUAFSRXXVLOPPGXX,ARPWSTXX84D,VLOPPGXX84D,2019-04-29 12:59:00,4,29,0,2,12,59
2,D20DA4E657EC,BFNHHRXX,BFNHHRXX,LQGKPNXX,YKMCMCXX,190626,1220,67E,2137157,False,...,LQGKPNXXYKMCMCXX,BFNHHRXX67E,YKMCMCXX67E,2019-06-26 12:20:00,6,26,2,2,12,20
3,AEE73E990B,BCFTZWXX,BCFTZWXX,WGCOPNXX,ZKLYYEXX,191204,915,F8D,406778,False,...,WGCOPNXXZKLYYEXX,BCFTZWXXF8D,ZKLYYEXXF8D,2019-12-04 09:15:00,12,4,2,4,9,15
4,2BEB278530,BSKJPWXX,BSKJPWXX,XDFDCHXX,XEWHINXX,191008,818,41B,65141,False,...,XDFDCHXXXEWHINXX,BSKJPWXX41B,XEWHINXX41B,2019-10-08 08:18:00,10,8,1,4,8,18


> Combine Currency and Month features

In [14]:
df['Cmon'] = df.Currency+df.mon.astype(str)
df['Cdom'] = df.Currency+df.dom.astype(str)

> Let's perform the train/test spilt Experimentation started with 80/20 split. Finally on a stable and more accurate model we are trying 95/5 split

In [15]:
x_train, x_test, = train_test_split(df,stratify=df['Flag'],random_state=42,test_size=0.05)

In [16]:
df.columns

Index(['Transaction Ref', 'Originator', 'Sender', 'Receiver', 'Beneficiary',
       'Date', 'Time', 'Currency', 'Value', 'Flag', 'SR', 'SB', 'RB', 'SC',
       'BC', 'Datetime', 'mon', 'dom', 'dow', 'q', 'h', 'm', 'Cmon', 'Cdom'],
      dtype='object')

### 2.1 Use count encoder to frequency encode the categorical fields

In [17]:
cat_features=['Sender', 'Receiver', 'Beneficiary', 'Date', 'Currency',  'SR',
       'SB', 'RB', 'SC', 'BC', 'mon', 'dom', 'dow', 'q', 'h','Cmon','Cdom']
count_enc = ce.CountEncoder()

count_encoded_train = count_enc.fit_transform(x_train[cat_features])
count_encoded_test = count_enc.transform(x_test[cat_features])


In [18]:
x_train = x_train.join(count_encoded_train.add_suffix("_count"))
x_test = x_test.join(count_encoded_test.add_suffix("_count"))


In [19]:
x_test.head()

Unnamed: 0,Transaction Ref,Originator,Sender,Receiver,Beneficiary,Date,Time,Currency,Value,Flag,...,RB_count,SC_count,BC_count,mon_count,dom_count,dow_count,q_count,h_count,Cmon_count,Cdom_count
2679805,CBA3D95829ED,BQUSERXX,BQUSERXX,BCBVGHXX,BCBVGHXX,190104,1817,8B9,129421,False,...,100.0,5205,258.0,1,4,4,1,18,3733,1596
273845,9DD08AC11C69,BDCKMLXX,BDCKMLXX,LXZXIQXX,HIVUAWXX,190520,903,97F,1102984,False,...,30.0,9195,310.0,5,20,0,2,9,797,288
3273225,548C54AEC2D9,ATCUIDXX,ATCUIDXX,NQOHIDXX,NQOHIDXX,190604,1139,D9A,137815,False,...,313.0,1197,122.0,6,4,1,2,11,1965,948
2187173,C7BB6CA30EEDE8C9,ARZNGYXX,ARZNGYXX,TYPQINXX,FXGLRSXX,191204,1347,1E6,79005,False,...,145.0,30059,747.0,12,4,2,4,13,6873,3139
1755216,E47CC6A216CA,BFNHHRXX,BFNHHRXX,OHMRHRXX,OHMRHRXX,190320,1628,87F,1526177,False,...,4142.0,30765,3030.0,3,20,2,1,16,2896,1078


### 2.2 Convert the Value field into number. Create more combinations of features on mean of corresponding Value

In [20]:
x_train.Value = x_train.Value.astype(int)
x_test.Value = x_test.Value.astype(int)

In [21]:
x_train['Currency_mean'] = x_train.groupby('Currency')['Value'].transform('mean')
x_train['Bene_mean'] = x_train.groupby('Beneficiary')['Value'].transform('mean')
x_train['SC_count_mean'] = x_train.groupby('SC_count')['Value'].transform('mean')
x_train['BC_count_mean'] = x_train.groupby('BC_count')['Value'].transform('mean')
x_train['Date_mean'] = x_train.groupby('Date')['Value'].transform('mean')

x_test['Currency_mean'] = x_test.groupby('Currency')['Value'].transform('mean')
x_test['Bene_mean'] = x_test.groupby('Beneficiary')['Value'].transform('mean')
x_test['SC_count_mean'] = x_test.groupby('SC_count')['Value'].transform('mean')
x_test['BC_count_mean'] = x_test.groupby('BC_count')['Value'].transform('mean')
x_test['Date_mean'] = x_test.groupby('Date')['Value'].transform('mean')

In [22]:
gc.collect()

42

In [23]:
x_train.columns

Index(['Transaction Ref', 'Originator', 'Sender', 'Receiver', 'Beneficiary',
       'Date', 'Time', 'Currency', 'Value', 'Flag', 'SR', 'SB', 'RB', 'SC',
       'BC', 'Datetime', 'mon', 'dom', 'dow', 'q', 'h', 'm', 'Cmon', 'Cdom',
       'Sender_count', 'Receiver_count', 'Beneficiary_count', 'Date_count',
       'Currency_count', 'SR_count', 'SB_count', 'RB_count', 'SC_count',
       'BC_count', 'mon_count', 'dom_count', 'dow_count', 'q_count', 'h_count',
       'Cmon_count', 'Cdom_count', 'Currency_mean', 'Bene_mean',
       'SC_count_mean', 'BC_count_mean', 'Date_mean'],
      dtype='object')

In [24]:
x_train['h_mean'] = x_train.groupby('h')['Value'].transform('mean')
x_test['h_mean'] = x_test.groupby('h')['Value'].transform('mean')

### 2.3 Create additional feature to capture corresponding risk scores. Target encoder takes the mean of 'Flag' for these features

In [25]:
encoded_columns = ["Date", 'h','Beneficiary','Currency', 'SB', 'BC','SC', 'dow', 'SR', 'RB', 'mon', 'Receiver','dom']
encoderT=ce.TargetEncoder(cols=["Date", 'h','Beneficiary','Currency', 'SB', 'BC','SC', 'dow','SR', 'RB', 'mon', 'Receiver','dom'], smoothing=5 ,return_df=True)
df_train_transformed = encoderT.fit_transform(x_train[encoded_columns], x_train.Flag)
x_train =  x_train.join(df_train_transformed.add_suffix("_tar"))

df_test_transformed = encoderT.transform(x_test[encoded_columns])

x_test =  x_test.join(df_test_transformed.add_suffix("_tar"))


In [26]:
x_train.columns

Index(['Transaction Ref', 'Originator', 'Sender', 'Receiver', 'Beneficiary',
       'Date', 'Time', 'Currency', 'Value', 'Flag', 'SR', 'SB', 'RB', 'SC',
       'BC', 'Datetime', 'mon', 'dom', 'dow', 'q', 'h', 'm', 'Cmon', 'Cdom',
       'Sender_count', 'Receiver_count', 'Beneficiary_count', 'Date_count',
       'Currency_count', 'SR_count', 'SB_count', 'RB_count', 'SC_count',
       'BC_count', 'mon_count', 'dom_count', 'dow_count', 'q_count', 'h_count',
       'Cmon_count', 'Cdom_count', 'Currency_mean', 'Bene_mean',
       'SC_count_mean', 'BC_count_mean', 'Date_mean', 'h_mean', 'Date_tar',
       'h_tar', 'Beneficiary_tar', 'Currency_tar', 'SB_tar', 'BC_tar',
       'SC_tar', 'dow_tar', 'SR_tar', 'RB_tar', 'mon_tar', 'Receiver_tar',
       'dom_tar'],
      dtype='object')

In [27]:
x_test.columns

Index(['Transaction Ref', 'Originator', 'Sender', 'Receiver', 'Beneficiary',
       'Date', 'Time', 'Currency', 'Value', 'Flag', 'SR', 'SB', 'RB', 'SC',
       'BC', 'Datetime', 'mon', 'dom', 'dow', 'q', 'h', 'm', 'Cmon', 'Cdom',
       'Sender_count', 'Receiver_count', 'Beneficiary_count', 'Date_count',
       'Currency_count', 'SR_count', 'SB_count', 'RB_count', 'SC_count',
       'BC_count', 'mon_count', 'dom_count', 'dow_count', 'q_count', 'h_count',
       'Cmon_count', 'Cdom_count', 'Currency_mean', 'Bene_mean',
       'SC_count_mean', 'BC_count_mean', 'Date_mean', 'h_mean', 'Date_tar',
       'h_tar', 'Beneficiary_tar', 'Currency_tar', 'SB_tar', 'BC_tar',
       'SC_tar', 'dow_tar', 'SR_tar', 'RB_tar', 'mon_tar', 'Receiver_tar',
       'dom_tar'],
      dtype='object')

### 2.4 Drop the non numeric fields for which we have already create frequency or target based eoncodings

In [28]:
x_test = x_test.drop(['Transaction Ref', 'Originator', 'Sender', 'Receiver', 'Beneficiary',
       'Date', 'Time', 'Currency','SR', 'SB', 'RB', 'SC', 'BC', 'Datetime','Cmon', 'Cdom'], axis=1)

x_train = x_train.drop(['Transaction Ref', 'Originator', 'Sender', 'Receiver', 'Beneficiary',
       'Date', 'Time', 'Currency','SR', 'SB', 'RB', 'SC', 'BC', 'Datetime','Cmon', 'Cdom'], axis=1)


### 2.5 Separate the target field from other features and create x_train, y_train, x_test and y_test

In [29]:
X_train = x_train.drop(['Flag'],axis=1)
y_train = x_train['Flag']

X_train=X_train.fillna(0)
X_train.shape,y_train.shape

((3492955, 43), (3492955,))

In [30]:
# import gc
gc.collect()

210

In [31]:

over_sm = SMOTE(sampling_strategy=0.03)
X_train, y_train = over_sm.fit_resample(X_train, y_train)
X_train.shape,y_train.shape

((3584663, 43), (3584663,))

In [32]:
X_test = x_test.drop(['Flag'],axis=1)
X_test=X_test.fillna(0)

y_test = x_test['Flag']
X_test.shape,y_test.shape

((183840, 43), (183840,))

In [33]:
gc.collect()

86

### 2.6 [Optional] At this point we can save our pre-processed data and the encoders. Later we can reload them and continue to model fitting. 

> This step helps avoid data losses if kernel crashes. Since we have already pre-processed and saved, we can restart the kernel and resume from sec 2.7. However if sec 2.6 is not executed (i.e. if below cell is commented out) then do not execute sec 2.7 either.

In [34]:
import joblib
joblib.dump(X_train , 'X_train_vTEMP_3.joblib')
joblib.dump(y_train, 'y_trainvTEMP_3.joblib')
joblib.dump(X_test, 'X_testvTEMP_3.joblib')
joblib.dump(y_test, 'y_testvTEMP_3.joblib')
joblib.dump(encoderT, 'encoderTvTEMP_3.joblib')
joblib.dump(count_enc, 'count_encvTEMP_3.joblib')

['count_encvTEMP_3.joblib']

### 2.7 Reload pre-processed data and proceed to model building

In [1]:
import joblib
import category_encoders as ce
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np 
from sklearn.metrics import confusion_matrix

X_train = joblib.load('X_train_vTEMP_3.joblib')
y_train = joblib.load('y_trainvTEMP_3.joblib')
X_test = joblib.load('X_testvTEMP_3.joblib')
y_test = joblib.load('y_testvTEMP_3.joblib')
encoderT = joblib.load('encoderTvTEMP_3.joblib')
count_enc = joblib.load('count_encvTEMP_3.joblib')

In [2]:
# X_train = X_train.drop(['Cmon', 'Cdom'],axis=1)
# X_test = X_test.drop(['Cmon', 'Cdom'],axis=1)

### 3.0 Train and RandomForestClassifier. Save the trained model to file system for later use in test script

In [2]:
from sklearn.ensemble import RandomForestClassifier
#Create a Gaussian Classifier
import time
start_time = time.time()
clf=RandomForestClassifier(n_estimators=200, n_jobs=-1)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)
end_time = time.time()
print('Time taken for one classifier is')
print(end_time - start_time)
joblib.dump(clf, 'random_forest_923_vTEMP_3.joblib')

Time taken for one classifier is
893.5039541721344


['random_forest_923_vTEMP_3.joblib']

In [3]:
y_predrf=clf.predict(X_test)

In [4]:
from sklearn.metrics import classification_report,confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_predrf).ravel()
tn, fp, fn, tp

(183120, 52, 173, 495)

In [5]:
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_predrf)
f1

0.8148148148148149

In [6]:
from sklearn.metrics import roc_curve, auc, roc_auc_score
roc_auc_score(y_test,y_predrf)

0.8703670389441892