## Test script
>This script should be executed only after the training script has been executed.
>This script runs the data preprocessing on the separately provided test data (without 'Flag').
>After data preprocessing, it predicts the 'Flag' on the test data.

In [1]:
# Connect to Postgres
import os
import psycopg2
def newCursor():
    """Open a new PostgreSQL connection and return a cursor on it.

    Connection settings are read from the environment variables ``DB_USER``,
    ``DB_HOST``, ``DB_PASSWORD`` and ``DB_DATABASE``; the port is fixed to
    "5432". Every call opens its own connection, which stays reachable through
    ``cursor.connection`` so the caller can close it when finished.

    Returns:
        A psycopg2 cursor bound to the freshly opened connection.

    Raises:
        Exception: whatever went wrong while connecting (for example a
            ``KeyError`` for a missing environment variable or a psycopg2
            error) is reported on stdout and then re-raised.
    """
    try:
        connection = psycopg2.connect(
            user=os.environ["DB_USER"],
            host=os.environ["DB_HOST"],
            password=os.environ["DB_PASSWORD"],
            port="5432",
            database=os.environ["DB_DATABASE"],
        )
        return connection.cursor()
    except Exception as error:
        print("Error while connecting to PostgreSQL", error)
        # Do not hand the exception object back as if it were a cursor:
        # callers immediately call .execute() on the result, which would
        # only fail later with a misleading AttributeError.
        raise

In [2]:
# List every base table that lives in the `swift` schema.
c = newCursor()
q = """
SELECT table_name
  FROM information_schema.tables
 WHERE table_schema='swift'
   AND table_type='BASE TABLE'
"""
try:
    c.execute(q)
    swift_tables = c.fetchall()
finally:
    # newCursor() opens a dedicated connection per call, so release both the
    # cursor and its connection once the rows are fetched.
    c.close()
    c.connection.close()
for table in swift_tables:
    # Each row is a 1-tuple holding the table name.
    print(table[0])

simulated_fin_parsed_hackathon
simulated_data_senders_train_hackathon
simulated_data_senders_test_hackathon


## 1.0 Install required libraries

In [3]:
# !pip install category_encoders
# !pip install scikit-learn

### below installations not required
# !pip install pycaret[full]

# !pip install xgboost
# !pip install imblearn

In [4]:
import pandas as pd
import numpy as np 
from sklearn.metrics import confusion_matrix
import category_encoders as ce
from sklearn.model_selection import train_test_split
import gc
import joblib

## 2.0 Load the test data from DB

In [5]:

# Read the whole test table into pandas; column names come from the cursor metadata.
c = newCursor()
q = """
SELECT *
FROM swift."simulated_data_senders_test_hackathon"

"""
try:
    c.execute(q)
    cols = [desc[0] for desc in c.description]
    df_TEST = pd.DataFrame(c.fetchall(), columns=cols)
finally:
    # newCursor() opens a dedicated connection per call, so release both the
    # cursor and its connection once the rows are in memory.
    c.close()
    c.connection.close()
df_TEST.head()

Unnamed: 0,Transaction Ref,Originator,Sender,Receiver,Beneficiary,Date,Time,Currency,Value
0,5B3EDDAD9B56,BEOWBIXX,BEOWBIXX,TUWNPGXX,TUWNPGXX,190416,1759,738,751775
1,919B633D36,AEAESTXX,AEAESTXX,WRLHSTXX,WRLHSTXX,190701,1410,34,14974
2,B39EE91CBC,BJXJBIXX,BJXJBIXX,RGFHBJXX,LZJZSLXX,190418,1402,F7E,360742
3,A6AC7BB94BDED2B7,ARZNGYXX,ARZNGYXX,BCIMMDXX,NHLWRSXX,190822,1043,7.00E+05,207961
4,3A83A5ECCB05D3C7,APUUZAXX,APUUZAXX,CBJZAGXX,EGEHAMXX,190917,1354,273,644294


## 3.0 Load the already saved model and encoders

In [6]:
# Load the previously saved model from the working directory.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
clf =  joblib.load('random_forest_923_vTEMP_3.joblib')


In [7]:
# Restore the two fitted encoders that were saved next to the model.
count_enc = joblib.load('count_encvTEMP_3.joblib')
encoderT = joblib.load('encoderTvTEMP_3.joblib')

# Columns handed to `count_enc.transform` during preprocessing (order kept as-is).
cat_features = [
    'Sender', 'Receiver', 'Beneficiary', 'Date', 'Currency',
    'SR', 'SB', 'RB', 'SC', 'BC',
    'mon', 'dom', 'dow', 'q', 'h',
    'Cmon', 'Cdom',
]

# Columns handed to `encoderT.transform` during preprocessing (order kept as-is).
encoded_columns = [
    'Date', 'h', 'Beneficiary', 'Currency', 'SB', 'BC', 'SC',
    'dow', 'SR', 'RB', 'mon', 'Receiver', 'dom',
]

## 3.1 Preprocess the test data

In [8]:
# NOTE: columns are created in a fixed order on purpose; the frame is later
# passed to the model as-is, so do not reorder these statements.

# --- Pairwise crosses: concatenate two columns into one combined key ---
df_TEST['SR'] = df_TEST['Sender'] + df_TEST['Receiver']
df_TEST['SB'] = df_TEST['Sender'] + df_TEST['Beneficiary']
df_TEST['RB'] = df_TEST['Receiver'] + df_TEST['Beneficiary']
df_TEST['SC'] = df_TEST['Sender'] + df_TEST['Currency']
df_TEST['BC'] = df_TEST['Beneficiary'] + df_TEST['Currency']

# --- Timestamp: pack Date and Time into one number, then parse it as yymmddHHMM ---
df_TEST['Datetime'] = pd.to_datetime(
    (df_TEST['Date'] * 10000 + df_TEST['Time']).astype(str),
    format='%y%m%d%H%M',
)

# --- Calendar parts derived from the parsed timestamp ---
df_TEST['mon'] = df_TEST['Datetime'].dt.month
df_TEST['dom'] = df_TEST['Datetime'].dt.day
df_TEST['dow'] = df_TEST['Datetime'].dt.dayofweek
df_TEST['q'] = df_TEST['Datetime'].dt.quarter
df_TEST['h'] = df_TEST['Datetime'].dt.hour
df_TEST['m'] = df_TEST['Datetime'].dt.minute

# --- Currency crossed with month / day-of-month ---
df_TEST['Cmon'] = df_TEST['Currency'] + df_TEST['mon'].astype(str)
df_TEST['Cdom'] = df_TEST['Currency'] + df_TEST['dom'].astype(str)

# --- Apply the saved `count_enc` and attach its output with a "_count" suffix ---
# (presumably a count encoder fitted by the training script; confirm there)
xcount_encoded_test = count_enc.transform(df_TEST[cat_features])
df_TEST = df_TEST.join(xcount_encoded_test.add_suffix("_count"))

df_TEST['Value'] = df_TEST['Value'].astype(int)

# --- Per-group mean of Value, computed on the test frame itself ---
df_TEST['Currency_mean'] = df_TEST.groupby('Currency')['Value'].transform('mean')
df_TEST['Bene_mean'] = df_TEST.groupby('Beneficiary')['Value'].transform('mean')
df_TEST['SC_count_mean'] = df_TEST.groupby('SC_count')['Value'].transform('mean')
df_TEST['BC_count_mean'] = df_TEST.groupby('BC_count')['Value'].transform('mean')
df_TEST['Date_mean'] = df_TEST.groupby('Date')['Value'].transform('mean')
df_TEST['h_mean'] = df_TEST.groupby('h')['Value'].transform('mean')

# --- Apply the saved `encoderT` and attach its output with a "_tar" suffix ---
# (presumably a target encoder fitted by the training script; confirm there)
df_test_transformed = encoderT.transform(df_TEST[encoded_columns])
df_TEST = df_TEST.join(df_test_transformed.add_suffix("_tar"))

# Replace every remaining missing value with 0.
df_TEST = df_TEST.fillna(0)

## 3.2 Create an empty dataframe with two columns. This dataframe is for submission of the predicted 'Flag'

In [9]:
# Start from an empty frame that has only the two submission columns;
# the bare expression on the last line displays it.
sub_DF = pd.DataFrame(columns=['Transaction Ref', 'Flag'])
sub_DF

Unnamed: 0,Transaction Ref,Flag


In [10]:
# Copy the transaction identifiers from the test data into the submission frame.
# This must run before 'Transaction Ref' is dropped from df_TEST.
sub_DF['Transaction Ref'] = df_TEST['Transaction Ref']
sub_DF

Unnamed: 0,Transaction Ref,Flag
0,5B3EDDAD9B56,
1,919B633D36,
2,B39EE91CBC,
3,A6AC7BB94BDED2B7,
4,3A83A5ECCB05D3C7,
...,...,...
735330,ADE9C10344CB,
735331,B2AD4BC1DCDB,
735332,DAC79A527A,
735333,2E8091370B8B,


In [11]:
# Remove the identifier, raw and helper columns listed below; every other
# column stays in the frame that is passed to the model.
df_TEST = df_TEST.drop(
    columns=[
        'Transaction Ref', 'Originator', 'Sender', 'Receiver', 'Beneficiary',
        'Date', 'Time', 'Currency',
        'SR', 'SB', 'RB', 'SC', 'BC',
        'Datetime', 'Cmon', 'Cdom',
    ]
)

## 3.3 Predict the 'Flag' on test data

In [13]:
# Run the loaded model on the preprocessed test frame.
y_predrf_final = clf.predict(df_TEST)
# Sum of the predictions; with True/False labels this is the number of rows predicted True.
print(y_predrf_final.sum())


2000


In [14]:
# Attach the predictions to the submission frame.
# The array is assigned positionally, so sub_DF must still be in df_TEST's row order.
sub_DF['Flag'] = y_predrf_final
sub_DF

Unnamed: 0,Transaction Ref,Flag
0,5B3EDDAD9B56,False
1,919B633D36,False
2,B39EE91CBC,False
3,A6AC7BB94BDED2B7,False
4,3A83A5ECCB05D3C7,False
...,...,...
735330,ADE9C10344CB,False
735331,B2AD4BC1DCDB,False
735332,DAC79A527A,False
735333,2E8091370B8B,False


In [15]:
# Sanity check: the total in the submission frame should equal the sum printed after prediction.
sub_DF['Flag'].sum()

2000

## 3.4 Save the final prediction to a csv

In [16]:
# Make sure the output folder exists first: to_csv does not create missing
# directories and would fail on a fresh checkout without ./assets.
os.makedirs('./assets', exist_ok=True)
# Write the submission without the index column.
sub_DF.to_csv('./assets/submissionF1_924.csv', index=False)

> Optional steps to save the final model and encoders

In [17]:
# Optional: save the model loaded earlier in this notebook again under a new file name.
joblib.dump(clf, 'random_forest_924_v773.joblib') 

['random_forest_924_v773.joblib']

In [18]:
# Optional: save the `encoderT` encoder again under a new file name.
joblib.dump(encoderT, 'encoderTv773.joblib')

['encoderTv773.joblib']

In [19]:
# Optional: save the `count_enc` encoder again under a new file name.
joblib.dump(count_enc, 'count_encv773.joblib')

['count_encv773.joblib']