# Sensitivity Analysis
### Author: Terence Kaplan

In [1]:
# Load necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the CSV file into a data frame
df = pd.read_csv('./HI-Small_Trans.csv')

In [3]:
# Convert variable "Timestamp" to date and time and arrange in ascending order
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y/%m/%d %H:%M')
df = df.sort_values('Timestamp')

In [4]:
# Rename variables "Account" to "From Account" and "Account.1" to "To Account"
df = df.rename(columns={"Account": "From Account", "Account.1": "To Account"})

In [5]:
# Inspect the data set
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5078345 entries, 316720 to 4962234
Data columns (total 11 columns):
 #   Column              Dtype         
---  ------              -----         
 0   Timestamp           datetime64[ns]
 1   From Bank           int64         
 2   From Account        object        
 3   To Bank             int64         
 4   To Account          object        
 5   Amount Received     float64       
 6   Receiving Currency  object        
 7   Amount Paid         float64       
 8   Payment Currency    object        
 9   Payment Format      object        
 10  Is Laundering       int64         
dtypes: datetime64[ns](1), float64(2), int64(3), object(5)
memory usage: 464.9+ MB


### Select Transactions From One Single Bank

In [6]:
# Concatenate the variables "From Bank" and "To Bank"
bank = pd.concat([df['From Bank'], df['To Bank']])

# Count transactions for each bank
bank_counts = bank.value_counts()
bank_counts.head()

70    452751
10    124176
12    121626
1      92326
15     91232
dtype: int64

In [7]:
# Filter for all transactions where either one or both variables "From Bank" and "To Bank"
# contain the unique identifier "70"
df_70 = df[(df['From Bank'] == 70) | (df['To Bank'] == 70)]

In [8]:
# Calculate number of transactions for each payment format

# For all observations
format_counts = df_70['Payment Format'].value_counts().sort_values(ascending=False)
format_percentage = ((format_counts / len(df_70)) * 100).round(1)

# For observations grouped by class

# Calculate number of transactions
format_grouped_counts = df_70.groupby(['Is Laundering', 'Payment Format']).size()

# Calculate total number of transactions for each class
format_total_counts = df_70.groupby('Is Laundering').size()

# Calculate percentages for each class
format_grouped_percentage = (format_grouped_counts / format_total_counts * 100).round(1)

# Merge all into one data frame
format_counts_all = pd.DataFrame({
    'Format': format_counts.index,
    'Counts total': format_counts.values,
    'Percentage total': format_percentage.values,
    'Counts legitimate': format_grouped_counts[0].reindex(format_counts.index, fill_value=0).values,
    'Percentage legitimate': format_grouped_percentage[0].reindex(format_counts.index, fill_value=0).values,
    'Counts fraudulent': format_grouped_counts[1].reindex(format_counts.index, fill_value=0).values,
    'Percentage fraudulent': format_grouped_percentage[1].reindex(format_counts.index, fill_value=0).values
}).set_index('Format')

format_counts_all

Unnamed: 0_level_0,Counts total,Percentage total,Counts legitimate,Percentage legitimate,Counts fraudulent,Percentage fraudulent
Format,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cheque,213698,47.2,213397,47.2,301,47.6
Credit Card,134680,29.7,134474,29.7,206,32.5
Cash,91272,20.2,91164,20.2,108,17.1
Bitcoin,13101,2.9,13083,2.9,18,2.8


In [9]:
# Filter for all transactions where either one or both variables "From Bank" and "To Bank"
# contain the unique identifier "10"
df = df[(df['From Bank'] == 10) | (df['To Bank'] == 10)]

In [10]:
# Calculate number of transactions for each payment format

# For all observations
format_counts = df['Payment Format'].value_counts().sort_values(ascending=False)
format_percentage = ((format_counts / len(df)) * 100).round(1)

# For observations grouped by class

# Calculate number of transactions
format_grouped_counts = df.groupby(['Is Laundering', 'Payment Format']).size()

# Calculate total number of transactions for each class
format_total_counts = df.groupby('Is Laundering').size()

# Calculate percentages for each class
format_grouped_percentage = (format_grouped_counts / format_total_counts * 100).round(1)

# Merge all into one data frame
format_counts_all = pd.DataFrame({
    'Format': format_counts.index,
    'Counts total': format_counts.values,
    'Percentage total': format_percentage.values,
    'Counts legitimate': format_grouped_counts[0].reindex(format_counts.index, fill_value=0).values,
    'Percentage legitimate': format_grouped_percentage[0].reindex(format_counts.index, fill_value=0).values,
    'Counts fraudulent': format_grouped_counts[1].reindex(format_counts.index, fill_value=0).values,
    'Percentage fraudulent': format_grouped_percentage[1].reindex(format_counts.index, fill_value=0).values
}).set_index('Format')

format_counts_all

Unnamed: 0_level_0,Counts total,Percentage total,Counts legitimate,Percentage legitimate,Counts fraudulent,Percentage fraudulent
Format,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cheque,47150,41.2,47148,41.2,2,2.0
Credit Card,32379,28.3,32375,28.3,4,4.0
ACH,14133,12.3,14039,12.3,94,93.1
Cash,12036,10.5,12035,10.5,1,1.0
Wire,5047,4.4,5047,4.4,0,0.0
Reinvestment,3606,3.2,3606,3.2,0,0.0
Bitcoin,103,0.1,103,0.1,0,0.0


In [11]:
# Number and ratio of non-fraud and fraud cases
occ = df['Is Laundering'].value_counts()
occ, occ / len(df.index)

(0    114353
 1       101
 Name: Is Laundering, dtype: int64,
 0    0.999118
 1    0.000882
 Name: Is Laundering, dtype: float64)

In [12]:
# Count number of unique accounts
df['From Account'].nunique(), df['To Account'].nunique()

(7332, 9874)

### Creation of all Features (identical to feature_creation.ipynb)
#### V1: Average of the Digits Sums & Average Number of Zeros

In [13]:
# Create function to calculate the sum of digits of a transaction amount
def digit_sum(amount):
    return sum(int(digit) for digit in str(amount) if digit.isdigit())

# Create function to count the number of zeros of a transaction amount
def count_zeros(amount):
    formatted_amount = "{:.6f}".format(amount)
    return formatted_amount.count('0')

# Compute the sum of digits and the number of zeros for each transaction
df['Digit Sum Amount Received'] = df['Amount Received'].apply(digit_sum)
df['Digit Sum Amount Paid'] = df['Amount Paid'].apply(digit_sum)
df['Count Zeros Amount Received'] = df['Amount Received'].apply(count_zeros)
df['Count Zeros Amount Paid'] = df['Amount Paid'].apply(count_zeros)

# Calculate the average digit sum and average number of zeros for paid transactions by each account
paid_digit_stats = df.groupby('From Account').agg({
    'Digit Sum Amount Paid': np.mean,
    'Count Zeros Amount Paid': np.mean
}).reset_index()
paid_digit_stats.rename(columns={
    'Digit Sum Amount Paid': 'Average Digit Sum Paid',
    'Count Zeros Amount Paid': 'Average Count Zeros Paid'
}, inplace=True)

# Calculate the average digit sum and average number of zeros for received transactions by each account
received_digit_stats = df.groupby('To Account').agg({
    'Digit Sum Amount Received': np.mean,
    'Count Zeros Amount Received': np.mean
}).reset_index()
received_digit_stats.rename(columns={
    'Digit Sum Amount Received': 'Average Digit Sum Received',
    'Count Zeros Amount Received': 'Average Count Zeros Received'
}, inplace=True)

# Merge both data sets into a single data frame
digit_stats = pd.merge(paid_digit_stats, received_digit_stats, how='outer', left_on='From Account', right_on='To Account')

# Combine the two account IDs in a new variable "Account ID"
digit_stats['Account ID'] = digit_stats['From Account'].combine_first(digit_stats['To Account'])

# Drop redundant columns
digit_stats = digit_stats.drop(columns=['From Account', 'To Account'])
digit_stats.head()

Unnamed: 0,Average Digit Sum Paid,Average Count Zeros Paid,Average Digit Sum Received,Average Count Zeros Received,Account ID
0,23.574468,4.456042,,,100428660
1,31.675676,4.557432,,,1004287C8
2,26.692308,4.575499,,,100428930
3,21.0,4.0,,,800042CB0
4,23.242938,4.446328,22.931818,4.318182,800042E70


In [14]:
# Merge with variable "From Account"
df_v1 = pd.merge(df, digit_stats, how='left', left_on='From Account', right_on='Account ID')
df_v1.rename(columns={
    'Average Digit Sum Paid': 'From Account Average Digit Sum Paid',
    'Average Count Zeros Paid': 'From Account Average Count Zeros Paid',
    'Average Digit Sum Received': 'From Account Average Digit Sum Received',
    'Average Count Zeros Received': 'From Account Average Count Zeros Received'
}, inplace=True)
df_v1.drop(columns=['Account ID'], inplace=True)

# Merge with variable "To Account"
df_v1 = pd.merge(df_v1, digit_stats, how='left', left_on='To Account', right_on='Account ID')
df_v1.rename(columns={
    'Average Digit Sum Paid': 'To Account Average Digit Sum Paid',
    'Average Count Zeros Paid': 'To Account Average Count Zeros Paid',
    'Average Digit Sum Received': 'To Account Average Digit Sum Received',
    'Average Count Zeros Received': 'To Account Average Count Zeros Received'
}, inplace=True)
df_v1.drop(columns=['Account ID'], inplace=True)
df_v1.head()

Unnamed: 0,Timestamp,From Bank,From Account,To Bank,To Account,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,...,Count Zeros Amount Received,Count Zeros Amount Paid,From Account Average Digit Sum Paid,From Account Average Count Zeros Paid,From Account Average Digit Sum Received,From Account Average Count Zeros Received,To Account Average Digit Sum Paid,To Account Average Count Zeros Paid,To Account Average Digit Sum Received,To Account Average Count Zeros Received
0,2022-09-01,10,8017C89F0,10,8017C89F0,1270437.9,US Dollar,1270437.9,US Dollar,Reinvestment,...,6,6,32.736842,4.947368,24.434783,4.608696,32.736842,4.947368,24.434783,4.608696
1,2022-09-01,10,801659620,6156,812677DF0,8414.25,US Dollar,8414.25,US Dollar,Cash,...,4,4,28.25,4.25,31.0,4.0,,,27.333333,4.333333
2,2022-09-01,10,80030B3E0,22205,8024588B0,8073.9,US Dollar,8073.9,US Dollar,Cheque,...,6,6,28.5,5.0,18.333333,4.333333,,,34.5,5.0
3,2022-09-01,4242,8119E7050,10,80010BBD0,1.76,US Dollar,1.76,US Dollar,Credit Card,...,4,4,15.333333,4.0,,,22.193548,4.677419,21.142857,4.714286
4,2022-09-01,5836,81314F3A0,10,8131C71C0,19948.25,US Dollar,19948.25,US Dollar,Credit Card,...,4,4,32.75,4.25,,,,,32.75,4.25


#### V2: Average Ratio of Zeros compared to Total Number of Digits

In [15]:
# Create function to calculate the ratio of zero digits to the total number of a transaction amount
def zero_digit_ratio(amount):
    amount_str = "{:.6f}".format(amount)
    return amount_str.count('0') / (len(amount_str) - int("." in amount_str))

# Compute the ratio of zero digits for each transaction
df['Zero Digit Ratio Amount Received'] = df['Amount Received'].apply(zero_digit_ratio)
df['Zero Digit Ratio Amount Paid'] = df['Amount Paid'].apply(zero_digit_ratio)

# Calculate the average zero digit ratio for paid transactions by each account
paid_zero_digit_ratio_stats = df.groupby('From Account')['Zero Digit Ratio Amount Paid'].mean().reset_index()
paid_zero_digit_ratio_stats.rename(columns={
    'Zero Digit Ratio Amount Paid': 'Average Zero Digit Ratio Paid'
}, inplace=True)

# Calculate the average zero digit ratio for received transactions by each account
received_zero_digit_ratio_stats = df.groupby('To Account')['Zero Digit Ratio Amount Received'].mean().reset_index()
received_zero_digit_ratio_stats.rename(columns={
    'Zero Digit Ratio Amount Received': 'Average Zero Digit Ratio Received'
}, inplace=True)

# Merge both data sets into a single data frame
zero_digit_ratio_stats = pd.merge(paid_zero_digit_ratio_stats, received_zero_digit_ratio_stats, how='outer', left_on='From Account', right_on='To Account')

# Combine the two account IDs in a new variable "Account ID"
zero_digit_ratio_stats['Account ID'] = zero_digit_ratio_stats['From Account'].combine_first(zero_digit_ratio_stats['To Account'])

# Drop redundant columns
zero_digit_ratio_stats = zero_digit_ratio_stats.drop(columns=['From Account', 'To Account'])
zero_digit_ratio_stats.head()

Unnamed: 0,Average Zero Digit Ratio Paid,Average Zero Digit Ratio Received,Account ID
0,0.466902,,100428660
1,0.406567,,1004287C8
2,0.4462,,100428930
3,0.472222,,800042CB0
4,0.467233,0.454263,800042E70


In [16]:
# Merge with variable "From Account"
df_v2 = pd.merge(df_v1, zero_digit_ratio_stats, how='left', left_on='From Account', right_on='Account ID')
df_v2.rename(columns={
    'Average Zero Digit Ratio Paid': 'From Account Average Zero Digit Ratio Paid',
    'Average Zero Digit Ratio Received': 'From Account Average Zero Digit Ratio Received'
}, inplace=True)
df_v2.drop(columns=['Account ID'], inplace=True)

# Merge with variable "To Account"
df_v2 = pd.merge(df_v2, zero_digit_ratio_stats, how='left', left_on='To Account', right_on='Account ID')
df_v2.rename(columns={
    'Average Zero Digit Ratio Paid': 'To Account Average Zero Digit Ratio Paid',
    'Average Zero Digit Ratio Received': 'To Account Average Zero Digit Ratio Received'
}, inplace=True)
df_v2.drop(columns=['Account ID'], inplace=True)
df_v2.head()

Unnamed: 0,Timestamp,From Bank,From Account,To Bank,To Account,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,...,From Account Average Digit Sum Received,From Account Average Count Zeros Received,To Account Average Digit Sum Paid,To Account Average Count Zeros Paid,To Account Average Digit Sum Received,To Account Average Count Zeros Received,From Account Average Zero Digit Ratio Paid,From Account Average Zero Digit Ratio Received,To Account Average Zero Digit Ratio Paid,To Account Average Zero Digit Ratio Received
0,2022-09-01,10,8017C89F0,10,8017C89F0,1270437.9,US Dollar,1270437.9,US Dollar,Reinvestment,...,24.434783,4.608696,32.736842,4.947368,24.434783,4.608696,0.410124,0.479004,0.410124,0.479004
1,2022-09-01,10,801659620,6156,812677DF0,8414.25,US Dollar,8414.25,US Dollar,Cash,...,31.0,4.0,,,27.333333,4.333333,0.42702,0.444444,,0.421212
2,2022-09-01,10,80030B3E0,22205,8024588B0,8073.9,US Dollar,8073.9,US Dollar,Cheque,...,18.333333,4.333333,,,34.5,5.0,0.510795,0.50463,,0.481818
3,2022-09-01,4242,8119E7050,10,80010BBD0,1.76,US Dollar,1.76,US Dollar,Credit Card,...,,,22.193548,4.677419,21.142857,4.714286,0.547619,,0.478961,0.518244
4,2022-09-01,5836,81314F3A0,10,8131C71C0,19948.25,US Dollar,19948.25,US Dollar,Credit Card,...,,,,,32.75,4.25,0.397727,,,0.397727


#### Convert all Amounts into US-Dollars (for comparability)

In [17]:
# Convert all currencies to US-Dollars (exchange rates from 09/2022, waehrungsrechner.org)
currency_multipliers = {
    'Australian Dollar': 0.67926,
    'Bitcoin': 20000,
    'Brazil Real': 0.19256,
    'Canadian Dollar': 0.76011,
    'Euro': 0.99622,
    'Mexican Peso': 0.05031,
    'Ruble': 0.01673,
    'Rupee': 0.01255,
    'Saudi Riyal': 0.26612,
    'Shekel': 0.29351,
    'Swiss Franc': 0.98166,
    'UK Pound': 1.15331,
    'Yen': 0.00713,
    'Yuan': 0.14446
}
for currency, multiplier in currency_multipliers.items():
    df_v2.loc[df_v2['Payment Currency'] == currency, 'Amount Paid'] *= multiplier
    df_v2.loc[df_v2['Receiving Currency'] == currency, 'Amount Received'] *= multiplier
    
df_v2.head()

Unnamed: 0,Timestamp,From Bank,From Account,To Bank,To Account,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,...,From Account Average Digit Sum Received,From Account Average Count Zeros Received,To Account Average Digit Sum Paid,To Account Average Count Zeros Paid,To Account Average Digit Sum Received,To Account Average Count Zeros Received,From Account Average Zero Digit Ratio Paid,From Account Average Zero Digit Ratio Received,To Account Average Zero Digit Ratio Paid,To Account Average Zero Digit Ratio Received
0,2022-09-01,10,8017C89F0,10,8017C89F0,1270437.9,US Dollar,1270437.9,US Dollar,Reinvestment,...,24.434783,4.608696,32.736842,4.947368,24.434783,4.608696,0.410124,0.479004,0.410124,0.479004
1,2022-09-01,10,801659620,6156,812677DF0,8414.25,US Dollar,8414.25,US Dollar,Cash,...,31.0,4.0,,,27.333333,4.333333,0.42702,0.444444,,0.421212
2,2022-09-01,10,80030B3E0,22205,8024588B0,8073.9,US Dollar,8073.9,US Dollar,Cheque,...,18.333333,4.333333,,,34.5,5.0,0.510795,0.50463,,0.481818
3,2022-09-01,4242,8119E7050,10,80010BBD0,1.76,US Dollar,1.76,US Dollar,Credit Card,...,,,22.193548,4.677419,21.142857,4.714286,0.547619,,0.478961,0.518244
4,2022-09-01,5836,81314F3A0,10,8131C71C0,19948.25,US Dollar,19948.25,US Dollar,Credit Card,...,,,,,32.75,4.25,0.397727,,,0.397727


In [18]:
# Calculate descriptive statistics of "Amount Paid"
df_v2['Amount Paid'].describe(), df_v2.groupby('Is Laundering')['Amount Paid'].describe()

(count    1.144540e+05
 mean     3.107733e+05
 std      9.868217e+06
 min      1.673000e-04
 25%      1.917621e+02
 50%      1.028363e+03
 75%      5.344040e+03
 max      1.721962e+09
 Name: Amount Paid, dtype: float64,
                   count           mean           std        min          25%  \
 Is Laundering                                                                  
 0              114353.0  311041.023849  9.872570e+06   0.000167   191.548964   
 1                 101.0    7654.530465  6.838494e+03  34.997780  2681.841084   
 
                        50%       75%           max  
 Is Laundering                                       
 0              1026.486382   5328.55  1.721962e+09  
 1              6047.450000  11964.97  4.483688e+04  )

#### V3: Dynamic Variables: Min, Mean, Median, Max, Sum & Count

In [19]:
# Create dynamic Min, Mean, Median, Max, Sum & Count for every sender account (From Account)
df_v3 = df_v2.copy()
df_v3['From Account Amount Paid Min'] = df_v3.groupby('From Account')['Amount Paid'].transform(lambda x: x.expanding().min())
df_v3['From Account Amount Paid Mean'] = df_v3.groupby('From Account')['Amount Paid'].transform(lambda x: x.expanding().mean())
df_v3['From Account Amount Paid Median'] = df_v3.groupby('From Account')['Amount Paid'].transform(lambda x: x.expanding().median())
df_v3['From Account Amount Paid Max'] = df_v3.groupby('From Account')['Amount Paid'].transform(lambda x: x.expanding().max())
df_v3['From Account Amount Paid Sum'] = df_v3.groupby('From Account')['Amount Paid'].transform(lambda x: x.expanding().sum())
df_v3['From Account Amount Paid Count'] = df_v3.groupby('From Account')['Amount Paid'].transform(lambda x: x.expanding().count())

# Create dynamic Min, Mean, Median, Max, Sum & Count for every receiver account (To Account)
df_v3['To Account Amount Received Min'] = df_v3.groupby('To Account')['Amount Received'].transform(lambda x: x.expanding().min())
df_v3['To Account Amount Received Mean'] = df_v3.groupby('To Account')['Amount Received'].transform(lambda x: x.expanding().mean())
df_v3['To Account Amount Received Median'] = df_v3.groupby('To Account')['Amount Received'].transform(lambda x: x.expanding().median())
df_v3['To Account Amount Received Max'] = df_v3.groupby('To Account')['Amount Received'].transform(lambda x: x.expanding().max())
df_v3['To Account Amount Received Sum'] = df_v3.groupby('To Account')['Amount Received'].transform(lambda x: x.expanding().sum())
df_v3['To Account Amount Received Count'] = df_v3.groupby('To Account')['Amount Received'].transform(lambda x: x.expanding().count())

df_v3.head()

Unnamed: 0,Timestamp,From Bank,From Account,To Bank,To Account,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,...,From Account Amount Paid Median,From Account Amount Paid Max,From Account Amount Paid Sum,From Account Amount Paid Count,To Account Amount Received Min,To Account Amount Received Mean,To Account Amount Received Median,To Account Amount Received Max,To Account Amount Received Sum,To Account Amount Received Count
0,2022-09-01,10,8017C89F0,10,8017C89F0,1270437.9,US Dollar,1270437.9,US Dollar,Reinvestment,...,1270437.9,1270437.9,1270437.9,1.0,1270437.9,1270437.9,1270437.9,1270437.9,1270437.9,1.0
1,2022-09-01,10,801659620,6156,812677DF0,8414.25,US Dollar,8414.25,US Dollar,Cash,...,8414.25,8414.25,8414.25,1.0,8414.25,8414.25,8414.25,8414.25,8414.25,1.0
2,2022-09-01,10,80030B3E0,22205,8024588B0,8073.9,US Dollar,8073.9,US Dollar,Cheque,...,8073.9,8073.9,8073.9,1.0,8073.9,8073.9,8073.9,8073.9,8073.9,1.0
3,2022-09-01,4242,8119E7050,10,80010BBD0,1.76,US Dollar,1.76,US Dollar,Credit Card,...,1.76,1.76,1.76,1.0,1.76,1.76,1.76,1.76,1.76,1.0
4,2022-09-01,5836,81314F3A0,10,8131C71C0,19948.25,US Dollar,19948.25,US Dollar,Credit Card,...,19948.25,19948.25,19948.25,1.0,19948.25,19948.25,19948.25,19948.25,19948.25,1.0


#### V4: Average Sum per Payment Format

In [20]:
# Calculate average amount paid per payment format grouped by "From Account"
average_paid = df_v3.pivot_table(values='Amount Paid', index='From Account', columns='Payment Format', aggfunc='mean')
average_paid.columns = [f'From Account Average Amount Paid in {col}' for col in average_paid.columns]

# Calculate average amount received per payment format grouped by "To Account"
average_received = df_v3.pivot_table(values='Amount Received', index='To Account', columns='Payment Format', aggfunc='mean')
average_received.columns = [f'To Account Average Amount Received in {col}' for col in average_received.columns]

# Merge with variables "From Account" and "To Account"
df_v4 = pd.merge(df_v3, average_paid, how='left', left_on='From Account', right_index=True)
df_v4 = pd.merge(df_v4, average_received, how='left', left_on='To Account', right_index=True)
df_v4.head()

Unnamed: 0,Timestamp,From Bank,From Account,To Bank,To Account,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,...,From Account Average Amount Paid in Credit Card,From Account Average Amount Paid in Reinvestment,From Account Average Amount Paid in Wire,To Account Average Amount Received in ACH,To Account Average Amount Received in Bitcoin,To Account Average Amount Received in Cash,To Account Average Amount Received in Cheque,To Account Average Amount Received in Credit Card,To Account Average Amount Received in Reinvestment,To Account Average Amount Received in Wire
0,2022-09-01,10,8017C89F0,10,8017C89F0,1270437.9,US Dollar,1270437.9,US Dollar,Reinvestment,...,20358.122308,635228.755,,,,,28717.375455,864.4,635228.755,
1,2022-09-01,10,801659620,6156,812677DF0,8414.25,US Dollar,8414.25,US Dollar,Cash,...,4412.9,957.55,,,,8414.25,16788.26,4412.9,,
2,2022-09-01,10,80030B3E0,22205,8024588B0,8073.9,US Dollar,8073.9,US Dollar,Cheque,...,,14.02,,18963.69,,,8073.9,,,
3,2022-09-01,4242,8119E7050,10,80010BBD0,1.76,US Dollar,1.76,US Dollar,Credit Card,...,11.313333,,,16799.03,,,16647.78,2442.336667,93579.75,
4,2022-09-01,5836,81314F3A0,10,8131C71C0,19948.25,US Dollar,19948.25,US Dollar,Credit Card,...,19948.25,,6870.74,,,32119.73,19299.23,19948.25,,6870.74


#### V5: Transaction Count

In [21]:
# Count number of transactions each account received
received_counts = df_v2.groupby('To Account').size().reset_index(name='Received Transaction Count')

# Count number of transactions each account paid
paid_counts = df_v2.groupby('From Account').size().reset_index(name='Paid Transaction Count')

# Merge both datasets into a single data frame
transaction_counts = pd.merge(paid_counts, received_counts, how='outer', left_on='From Account', right_on='To Account')

# Combine the two account IDs in a new variable "Account ID"
transaction_counts['Account ID'] = transaction_counts['From Account'].combine_first(transaction_counts['To Account'])

# Drop redundant columns
transaction_counts = transaction_counts.drop(columns=['From Account', 'To Account'])
transaction_counts.head()

Unnamed: 0,Paid Transaction Count,Received Transaction Count,Account ID
0,2491.0,,100428660
1,888.0,,1004287C8
2,351.0,,100428930
3,14.0,,800042CB0
4,177.0,44.0,800042E70


In [22]:
# Merge with variable "From Account"
df_v5 = pd.merge(df_v4, transaction_counts, how='left', left_on='From Account', right_on='Account ID')
df_v5.rename(columns={
    'Paid Transaction Count': 'From Account Paid Transaction Count',
    'Received Transaction Count': 'From Account Received Transaction Count'
}, inplace=True)
df_v5.drop(columns=['Account ID'], inplace=True)

# Merge with variable "To Account"
df_v5 = pd.merge(df_v5, transaction_counts, how='left', left_on='To Account', right_on='Account ID')
df_v5.rename(columns={
    'Paid Transaction Count': 'To Account Paid Transaction Count',
    'Received Transaction Count': 'To Account Received Transaction Count'
}, inplace=True)
df_v5.drop(columns=['Account ID'], inplace=True)
df_v5.head()

Unnamed: 0,Timestamp,From Bank,From Account,To Bank,To Account,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,...,To Account Average Amount Received in Bitcoin,To Account Average Amount Received in Cash,To Account Average Amount Received in Cheque,To Account Average Amount Received in Credit Card,To Account Average Amount Received in Reinvestment,To Account Average Amount Received in Wire,From Account Paid Transaction Count,From Account Received Transaction Count,To Account Paid Transaction Count,To Account Received Transaction Count
0,2022-09-01,10,8017C89F0,10,8017C89F0,1270437.9,US Dollar,1270437.9,US Dollar,Reinvestment,...,,,28717.375455,864.4,635228.755,,38.0,23.0,38.0,23.0
1,2022-09-01,10,801659620,6156,812677DF0,8414.25,US Dollar,8414.25,US Dollar,Cash,...,,8414.25,16788.26,4412.9,,,4.0,1.0,,3.0
2,2022-09-01,10,80030B3E0,22205,8024588B0,8073.9,US Dollar,8073.9,US Dollar,Cheque,...,,,8073.9,,,,4.0,3.0,,2.0
3,2022-09-01,4242,8119E7050,10,80010BBD0,1.76,US Dollar,1.76,US Dollar,Credit Card,...,,,16647.78,2442.336667,93579.75,,3.0,,31.0,7.0
4,2022-09-01,5836,81314F3A0,10,8131C71C0,19948.25,US Dollar,19948.25,US Dollar,Credit Card,...,,32119.73,19299.23,19948.25,,6870.74,4.0,,,4.0


#### V6: Standard Deviations & Average Amounts

In [23]:
# Calculate the average and standard deviation of the amount paid by each account
paid_stats = df_v2.groupby('From Account')['Amount Paid'].agg(['mean', 'std']).reset_index()
paid_stats.rename(columns={
    'mean': 'Average Amount Paid',
    'std': 'Std Dev Amount Paid'
}, inplace=True)

# Calculate the average and standard deviation of the amount received by each account
received_stats = df_v2.groupby('To Account')['Amount Received'].agg(['mean', 'std']).reset_index()
received_stats.rename(columns={
    'mean': 'Average Amount Received',
    'std': 'Std Dev Amount Received'
}, inplace=True)

# Merge both data sets into a single data frame
account_stats = pd.merge(paid_stats, received_stats, how='outer', left_on='From Account', right_on='To Account')

# Combine the two account IDs in a new variable "Account ID"
account_stats['Account ID'] = account_stats['From Account'].combine_first(account_stats['To Account'])

# Drop redundant columns
account_stats = account_stats.drop(columns=['From Account', 'To Account'])
account_stats.head()

Unnamed: 0,Average Amount Paid,Std Dev Amount Paid,Average Amount Received,Std Dev Amount Received,Account ID
0,184672.182461,1549057.0,,,100428660
1,10820.438516,56723.83,,,1004287C8
2,44797.808315,204605.3,,,100428930
3,168.15,75.92172,,,800042CB0
4,4852.662939,7541.861,5732.704324,10291.42589,800042E70


In [24]:
# Merge with variable "From Account"
df_v6 = pd.merge(df_v5, account_stats, how='left', left_on='From Account', right_on='Account ID')
df_v6.rename(columns={
    'Average Amount Paid': 'From Account Average Amount Paid',
    'Std Dev Amount Paid': 'From Account Std Dev Amount Paid',
    'Average Amount Received': 'From Account Average Amount Received',
    'Std Dev Amount Received': 'From Account Std Dev Amount Received'
}, inplace=True)
df_v6.drop(columns=['Account ID'], inplace=True)

# Merge with variable "To Account"
df_v6 = pd.merge(df_v6, account_stats, how='left', left_on='To Account', right_on='Account ID')
df_v6.rename(columns={
    'Average Amount Paid': 'To Account Average Amount Paid',
    'Std Dev Amount Paid': 'To Account Std Dev Amount Paid',
    'Average Amount Received': 'To Account Average Amount Received',
    'Std Dev Amount Received': 'To Account Std Dev Amount Received'
}, inplace=True)
df_v6.drop(columns=['Account ID'], inplace=True)
df_v6.head()

Unnamed: 0,Timestamp,From Bank,From Account,To Bank,To Account,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,...,To Account Paid Transaction Count,To Account Received Transaction Count,From Account Average Amount Paid,From Account Std Dev Amount Paid,From Account Average Amount Received,From Account Std Dev Amount Received,To Account Average Amount Paid,To Account Std Dev Amount Paid,To Account Average Amount Received,To Account Std Dev Amount Received
0,2022-09-01,10,8017C89F0,10,8017C89F0,1270437.9,US Dollar,1270437.9,US Dollar,Reinvestment,...,38.0,23.0,3465829.0,4414898.0,69347.506087,269133.935816,3465829.0,4414898.0,69347.506087,269133.935816
1,2022-09-01,10,801659620,6156,812677DF0,8414.25,US Dollar,8414.25,US Dollar,Cash,...,,3.0,7643.24,6815.653,957.55,,,,9871.803333,6315.119165
2,2022-09-01,10,80030B3E0,22205,8024588B0,8073.9,US Dollar,8073.9,US Dollar,Cheque,...,,2.0,19140.34,21684.2,192.933333,154.943492,,,13518.795,7700.244355
3,2022-09-01,4242,8119E7050,10,80010BBD0,1.76,US Dollar,1.76,US Dollar,Credit Card,...,31.0,7.0,11.31333,14.46061,,,924040.8,2207581.0,32561.902857,68570.013952
4,2022-09-01,5836,81314F3A0,10,8131C71C0,19948.25,US Dollar,19948.25,US Dollar,Credit Card,...,,4.0,19559.49,10311.53,,,,,19559.4875,10311.528743


#### V7: Indication of Transaction was within the Bank

In [25]:
# New dummy variable "Within Bank" indicating if transaction occured within bank or not
df_v7 = df_v6.copy()
df_v7['Within Bank'] = (df_v7['From Bank'].astype(str) == df_v7['To Bank'].astype(str)).astype(int)
df_v7.head()

Unnamed: 0,Timestamp,From Bank,From Account,To Bank,To Account,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,...,To Account Received Transaction Count,From Account Average Amount Paid,From Account Std Dev Amount Paid,From Account Average Amount Received,From Account Std Dev Amount Received,To Account Average Amount Paid,To Account Std Dev Amount Paid,To Account Average Amount Received,To Account Std Dev Amount Received,Within Bank
0,2022-09-01,10,8017C89F0,10,8017C89F0,1270437.9,US Dollar,1270437.9,US Dollar,Reinvestment,...,23.0,3465829.0,4414898.0,69347.506087,269133.935816,3465829.0,4414898.0,69347.506087,269133.935816,1
1,2022-09-01,10,801659620,6156,812677DF0,8414.25,US Dollar,8414.25,US Dollar,Cash,...,3.0,7643.24,6815.653,957.55,,,,9871.803333,6315.119165,0
2,2022-09-01,10,80030B3E0,22205,8024588B0,8073.9,US Dollar,8073.9,US Dollar,Cheque,...,2.0,19140.34,21684.2,192.933333,154.943492,,,13518.795,7700.244355,0
3,2022-09-01,4242,8119E7050,10,80010BBD0,1.76,US Dollar,1.76,US Dollar,Credit Card,...,7.0,11.31333,14.46061,,,924040.8,2207581.0,32561.902857,68570.013952,0
4,2022-09-01,5836,81314F3A0,10,8131C71C0,19948.25,US Dollar,19948.25,US Dollar,Credit Card,...,4.0,19559.49,10311.53,,,,,19559.4875,10311.528743,0


#### V8: Extraction of "day", "hour" and "minute" from "Timestamp"

In [26]:
# Extract time components from variable "Timestamp"
df_v8 = df_v7.copy()
df_v8['Day'] = df_v8['Timestamp'].dt.day
df_v8['Hour'] = df_v8['Timestamp'].dt.hour
df_v8['Minute'] = df_v8['Timestamp'].dt.minute
df_v8.head()

Unnamed: 0,Timestamp,From Bank,From Account,To Bank,To Account,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,...,From Account Average Amount Received,From Account Std Dev Amount Received,To Account Average Amount Paid,To Account Std Dev Amount Paid,To Account Average Amount Received,To Account Std Dev Amount Received,Within Bank,Day,Hour,Minute
0,2022-09-01,10,8017C89F0,10,8017C89F0,1270437.9,US Dollar,1270437.9,US Dollar,Reinvestment,...,69347.506087,269133.935816,3465829.0,4414898.0,69347.506087,269133.935816,1,1,0,0
1,2022-09-01,10,801659620,6156,812677DF0,8414.25,US Dollar,8414.25,US Dollar,Cash,...,957.55,,,,9871.803333,6315.119165,0,1,0,0
2,2022-09-01,10,80030B3E0,22205,8024588B0,8073.9,US Dollar,8073.9,US Dollar,Cheque,...,192.933333,154.943492,,,13518.795,7700.244355,0,1,0,0
3,2022-09-01,4242,8119E7050,10,80010BBD0,1.76,US Dollar,1.76,US Dollar,Credit Card,...,,,924040.8,2207581.0,32561.902857,68570.013952,0,1,0,0
4,2022-09-01,5836,81314F3A0,10,8131C71C0,19948.25,US Dollar,19948.25,US Dollar,Credit Card,...,,,,,19559.4875,10311.528743,0,1,0,0


### Selection and Preprocessing (identical to selection_preprocessing.ipynb)
#### Dealing with NaN-Values

In [27]:
df_all_var = df_v8.copy()

# Replace NaN's of all variables by zero except variables computing standard deviations

# Define list of all variable names
all_var = df_all_var.columns.tolist()

# Define list of all variables computing standard deviations
std_dev_var = [
    "From Account Std Dev Amount Paid", 
    "From Account Std Dev Amount Received", 
    "To Account Std Dev Amount Paid", 
    "To Account Std Dev Amount Received"
]

# Define list of all variables containing NaN's which will be replaced by zero
var_nan = [var for var in all_var if var not in std_dev_var]

# Replace NaN's by zero
df_all_var[var_nan] = df_all_var[var_nan].fillna(0)

In [28]:
# For all NaN's of "From Account Std Dev Amount Paid" and "To Account Std Dev Amount Received" with exactly one transaction

# Compute their means
mean_from_account_std_dev_amount_paid = df_all_var['From Account Std Dev Amount Paid'].mean() 
mean_to_account_std_dev_amount_received = df_all_var['To Account Std Dev Amount Received'].mean()

# Replace the NaN's by the corresponding variable mean
df_all_var['From Account Std Dev Amount Paid'].fillna(mean_from_account_std_dev_amount_paid, inplace=True)
df_all_var['To Account Std Dev Amount Received'].fillna(mean_to_account_std_dev_amount_received, inplace=True)

In [29]:
# Compute mean of variable "From Account Std Dev Amount Received"
mean_from_account_std_dev_amount_received = df_all_var['From Account Std Dev Amount Received'].mean()

# Replace NaN by the mean of its variable if account received exactly one transaction
df_all_var.loc[(df_all_var['From Account Std Dev Amount Received'].isna()) & 
               (df_all_var['From Account Received Transaction Count'] == 1), 
               'From Account Std Dev Amount Received'] = mean_from_account_std_dev_amount_received

# Replace NaN by zero if account received no transaction
df_all_var.loc[(df_all_var['From Account Std Dev Amount Received'].isna()) & 
               (df_all_var['From Account Received Transaction Count'] == 0), 
               'From Account Std Dev Amount Received'] = 0

In [30]:
# Compute mean of variable "To Account Std Dev Amount Paid" 
mean_to_account_std_dev_amount_paid = df_all_var['To Account Std Dev Amount Paid'].mean()

# Replace NaN by the mean of its variable if account sent exactly one transaction
df_all_var.loc[(df_all_var['To Account Std Dev Amount Paid'].isna()) & 
               (df_all_var['To Account Paid Transaction Count'] == 1), 
               'To Account Std Dev Amount Paid'] = mean_to_account_std_dev_amount_paid

# Replace NaN by zero if account sent no transaction
df_all_var.loc[(df_all_var['To Account Std Dev Amount Paid'].isna()) & 
               (df_all_var['To Account Paid Transaction Count'] == 0), 
               'To Account Std Dev Amount Paid'] = 0

#### Dropping of Irrelevant Variables

In [31]:
# As information is already given by other variables
df_all_var = df_all_var.drop(columns=['Timestamp', 'From Bank', 'To Bank'])

# As information should not be considered for the models
df_all_var = df_all_var.drop(columns=['From Account', 'To Account'])

In [32]:
# For how many transactions is "Amount Paid" not equal to "Amount Received"
unequal_payments = df_all_var[df_all_var['Amount Received'] != df_all_var['Amount Paid']]
len(unequal_payments)
unequal_payments.describe()

# Remove those transactions
df_all_var = df_all_var[df_all_var['Amount Received'] == df_all_var['Amount Paid']]

# For how many transactions is "Payment Currency" not equal to "Receiving Currency"
unequal_currency = df_all_var[df_all_var['Payment Currency'] != df_all_var['Receiving Currency']]
len(unequal_currency)
unequal_currency.describe()

# Remove those transactions
df_all_var = df_all_var[df_all_var['Payment Currency'] == df_all_var['Receiving Currency']]

# Drop the variables "Amount Received" and "Receiving Currency" as they became redundant
df_all_var = df_all_var.drop(columns=['Amount Received', 'Receiving Currency'])
df_all_var.head()

Unnamed: 0,Amount Paid,Payment Currency,Payment Format,Is Laundering,Digit Sum Amount Received,Digit Sum Amount Paid,Count Zeros Amount Received,Count Zeros Amount Paid,From Account Average Digit Sum Paid,From Account Average Count Zeros Paid,...,From Account Average Amount Received,From Account Std Dev Amount Received,To Account Average Amount Paid,To Account Std Dev Amount Paid,To Account Average Amount Received,To Account Std Dev Amount Received,Within Bank,Day,Hour,Minute
0,1270437.9,US Dollar,Reinvestment,0,33,33,6,6,32.736842,4.947368,...,69347.506087,269133.935816,3465829.0,4414898.0,69347.506087,269133.935816,1,1,0,0
1,8414.25,US Dollar,Cash,0,24,24,4,4,28.25,4.25,...,957.55,654652.954284,0.0,0.0,9871.803333,6315.119165,0,1,0,0
2,8073.9,US Dollar,Cheque,0,27,27,6,6,28.5,5.0,...,192.933333,154.943492,0.0,0.0,13518.795,7700.244355,0,1,0,0
3,1.76,US Dollar,Credit Card,0,14,14,4,4,15.333333,4.0,...,0.0,0.0,924040.8,2207581.0,32561.902857,68570.013952,0,1,0,0
4,19948.25,US Dollar,Credit Card,0,38,38,4,4,32.75,4.25,...,0.0,0.0,0.0,0.0,19559.4875,10311.528743,0,1,0,0


#### Transformation of Categorical Variables to Dummies

In [33]:
# Convert to categorical variables
variables_character = ["Payment Currency", "Payment Format"]

for variable in variables_character:
    df_all_var[variable] = df_all_var[variable].astype('category')
df_all_var.head()

Unnamed: 0,Amount Paid,Payment Currency,Payment Format,Is Laundering,Digit Sum Amount Received,Digit Sum Amount Paid,Count Zeros Amount Received,Count Zeros Amount Paid,From Account Average Digit Sum Paid,From Account Average Count Zeros Paid,...,From Account Average Amount Received,From Account Std Dev Amount Received,To Account Average Amount Paid,To Account Std Dev Amount Paid,To Account Average Amount Received,To Account Std Dev Amount Received,Within Bank,Day,Hour,Minute
0,1270437.9,US Dollar,Reinvestment,0,33,33,6,6,32.736842,4.947368,...,69347.506087,269133.935816,3465829.0,4414898.0,69347.506087,269133.935816,1,1,0,0
1,8414.25,US Dollar,Cash,0,24,24,4,4,28.25,4.25,...,957.55,654652.954284,0.0,0.0,9871.803333,6315.119165,0,1,0,0
2,8073.9,US Dollar,Cheque,0,27,27,6,6,28.5,5.0,...,192.933333,154.943492,0.0,0.0,13518.795,7700.244355,0,1,0,0
3,1.76,US Dollar,Credit Card,0,14,14,4,4,15.333333,4.0,...,0.0,0.0,924040.8,2207581.0,32561.902857,68570.013952,0,1,0,0
4,19948.25,US Dollar,Credit Card,0,38,38,4,4,32.75,4.25,...,0.0,0.0,0.0,0.0,19559.4875,10311.528743,0,1,0,0


In [34]:
# Transform categorical variables to Dummies, left one out to avoid dummy trap, delete original variables
cols_to_dummies = ['Payment Currency', 'Payment Format']
df_all_var = pd.concat(
    [df_all_var.drop(cols_to_dummies, axis=1), pd.get_dummies(df_all_var[cols_to_dummies], drop_first=True)], axis=1)
df_all_var.columns = df_all_var.columns.str.replace('_', ' ')
df_all_var.head()

Unnamed: 0,Amount Paid,Is Laundering,Digit Sum Amount Received,Digit Sum Amount Paid,Count Zeros Amount Received,Count Zeros Amount Paid,From Account Average Digit Sum Paid,From Account Average Count Zeros Paid,From Account Average Digit Sum Received,From Account Average Count Zeros Received,...,Payment Currency UK Pound,Payment Currency US Dollar,Payment Currency Yen,Payment Currency Yuan,Payment Format Bitcoin,Payment Format Cash,Payment Format Cheque,Payment Format Credit Card,Payment Format Reinvestment,Payment Format Wire
0,1270437.9,0,33,33,6,6,32.736842,4.947368,24.434783,4.608696,...,0,1,0,0,0,0,0,0,1,0
1,8414.25,0,24,24,4,4,28.25,4.25,31.0,4.0,...,0,1,0,0,0,1,0,0,0,0
2,8073.9,0,27,27,6,6,28.5,5.0,18.333333,4.333333,...,0,1,0,0,0,0,1,0,0,0
3,1.76,0,14,14,4,4,15.333333,4.0,0.0,0.0,...,0,1,0,0,0,0,0,1,0,0
4,19948.25,0,38,38,4,4,32.75,4.25,0.0,0.0,...,0,1,0,0,0,0,0,1,0,0


#### Divide Data Set into Features and Labels & Train-Test-Split

In [35]:
# Set random state used for all models
random_state = 0

In [36]:
# Create features X and labels y
def split_dataset(dataset):
    X = dataset.drop("Is Laundering", axis=1)
    y = dataset["Is Laundering"].values
    return X, y

X, y = split_dataset(df_all_var)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=random_state, stratify=y)

#### Resampling

In [37]:
# Oversampling of minority class to 1% of majority class
# ros = RandomOverSampler(sampling_strategy=0.01, random_state=random_state)
# X_oversampled, y_oversampled = ros.fit_resample(X_train, y_train)

# Undersampling of majority class to 100x the minority class
rus = RandomUnderSampler(sampling_strategy=0.01, random_state=random_state)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

#### Logarithmize and Standardize the Variables

In [38]:
# Create a transformation pipeline, first normalize and then standardize the values
trans_pipe = Pipeline([
    ('power_transform', PowerTransformer(standardize=False)),
    ('scaler', StandardScaler()),
])

# Fit and transform the training data
X_resampled_transformed = trans_pipe.fit_transform(X_resampled)

# Only transform the test data to avoid data leakage
X_test_transformed = trans_pipe.transform(X_test)

#### Random Forest on One Bank Data Set

In [39]:
# Define sets of parameter for randomized search
params_rf = {
    'n_estimators': [100, 200, 300, 400], 
    'max_features': [3, 5, 7, 9, 11, 13], 
    'max_depth': [5, 10, 15, 20, 25],
    'min_samples_split': range(1, 10),
    'min_samples_leaf': range(1, 10)
}

# Define model with remaining parameters
rf = RandomForestClassifier(criterion='gini', bootstrap=True, random_state=random_state)

# Set up a RandomizedSearchCV and fit it to train set
grid_rf = RandomizedSearchCV(estimator=rf, random_state=random_state,
                            param_distributions=params_rf,
                            n_iter=20, cv=5, scoring='f1', n_jobs=-1)

grid_rf.fit(X_resampled_transformed, y_resampled)

# Choose the best estimator
best_rf = grid_rf.best_estimator_ 

In [40]:
# Get best parameter according to tuning
grid_rf.best_params_

{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 7,
 'max_depth': 10}

In [41]:
# Predictions on test set
y_pred = best_rf.predict(X_test_transformed)

# Confusion matrix test set
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[45154,    15],
       [   27,    13]])

In [42]:
# Classification report test set
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     45169
           1       0.46      0.33      0.38        40

    accuracy                           1.00     45209
   macro avg       0.73      0.66      0.69     45209
weighted avg       1.00      1.00      1.00     45209

