## Data Wrangling

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the raw transaction CSV, kept in one named constant so it is easy to repoint.
# NOTE(review): this is an absolute path on a single machine; consider making it
# relative to the repository so the notebook runs elsewhere.
FINANCIALS_CSV = "/Users/sa12/Documents/Repositories/FinancialFraudDetection/FraudData/financials.csv"
# Transaction types kept for the analysis; rows of any other type are discarded.
KEPT_TYPES = ['CASH_OUT', 'TRANSFER']

# Read the file with its first column as the index and drop every row that has a missing value.
financials = pd.read_csv(FINANCIALS_CSV, index_col=0).dropna()

# Fail early with an explanatory message when the file does not have the raw schema.
# Without this check the filter below raises a bare "KeyError: 'type'".
if 'type' not in financials.columns:
    raise KeyError(
        f"'type' column not found in {FINANCIALS_CSV}; columns present: {list(financials.columns)}. "
        "The file may have been overwritten with processed data."
    )

# Keep only the transaction types listed in KEPT_TYPES.
financials = financials[financials['type'].isin(KEPT_TYPES)]

KeyError: 'type'

In [None]:
# Drop four columns ('type' has already been used for filtering in the loading cell)
# and give the remaining columns shorter names.
# NOTE: this cell rebinds `financials`, so running it a second time on the same kernel
# fails because the dropped columns no longer exist.
financials = (
    financials
    .drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud', 'type'])
    .rename(columns={
        'amount': 'Amount',
        'oldbalanceOrg': 'OldOrig',
        'newbalanceOrig': 'NewOrig',
        'oldbalanceDest': 'OldDest',
        'newbalanceDest': 'NewDest',
        'isFraud': 'Fraud',
    })
)
# Display the resulting frame.
financials

Unnamed: 0_level_0,Amount,OldOrig,NewOrig,OldDest,NewDest,Fraud
step,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,181.00,181.00,0.0,0.00,0.00,1
1,181.00,181.00,0.0,21182.00,0.00,1
1,229133.94,15325.00,0.0,5083.00,51513.44,0
1,215310.30,705.00,0.0,22425.00,0.00,0
1,311685.89,10835.00,0.0,6267.00,2719172.89,0
...,...,...,...,...,...,...
743,339682.13,339682.13,0.0,0.00,339682.13,1
743,6311409.28,6311409.28,0.0,0.00,0.00,1
743,6311409.28,6311409.28,0.0,68488.84,6379898.11,1
743,850002.52,850002.52,0.0,0.00,0.00,1


In [3]:
# Remove outlier rows: a row is dropped when its absolute z-score in one of the
# columns below is 3 or more. The 'Fraud' column is deliberately left out of the check.
# The columns are filtered one after another, so each z-score is computed on the rows
# that survived the previous columns; the column order therefore affects the result
# and is kept exactly as it was originally.
Z_THRESHOLD = 3
OUTLIER_COLUMNS = ['Amount', 'NewOrig', 'OldOrig', 'NewDest', 'OldDest']

clean_financials = financials
for column in OUTLIER_COLUMNS:
    within_threshold = np.abs(stats.zscore(clean_financials[column])) < Z_THRESHOLD
    clean_financials = clean_financials[within_threshold]

# Discard the old index and install a fresh 1-based index named 'Index'.
# Assigning the index directly, instead of adding an 'Index' column to the filtered
# slice and calling set_index, avoids pandas' SettingWithCopyWarning.
clean_financials = clean_financials.reset_index(drop=True)
clean_financials.index = pd.RangeIndex(1, len(clean_financials) + 1, name='Index')



We removed about 100,000 rows from the dataset that contained an outlier in at least one column.

In [None]:
# Columns that are standardised; the 'Fraud' column is not scaled.
numerical_columns = ['Amount','OldOrig','NewOrig','OldDest', 'NewDest']
# Create a standard scaler object.
scaler = StandardScaler()
# Fit the scaler on the numerical columns and transform them in one step.
scaled_values = scaler.fit_transform(clean_financials[numerical_columns])
# Wrap the scaled values in a new frame (fresh 0-based index) and put the 'Fraud'
# column back so fraud can be visualised across the scaled distributions.
# The scaled rows are in the same order as clean_financials, so 'Fraud' is attached by
# position; .to_numpy() prevents pandas from aligning on clean_financials' 1-based index.
scaled_financials = pd.DataFrame(scaled_values, columns=numerical_columns).assign(
    Fraud=clean_financials['Fraud'].to_numpy()
)
# Display the resulting frame.
scaled_financials

Unnamed: 0,Amount,OldOrig,NewOrig,OldDest,NewDest,Fraud
0,-0.497578,-0.188792,-0.106421,-0.640529,-0.697481,1
1,-0.497578,-0.188792,-0.106421,-0.630922,-0.697481,1
2,-0.112167,-0.128784,-0.106421,-0.638224,-0.676751,0
3,-0.135437,-0.186715,-0.106421,-0.630358,-0.697481,0
4,0.026797,-0.146575,-0.106421,-0.637687,0.396772,0
...,...,...,...,...,...,...
2739257,0.073925,1.156478,-0.106421,-0.640529,-0.560786,1
2739258,10.126497,24.819381,-0.106421,-0.640529,-0.697481,1
2739259,10.126497,24.819381,-0.106421,-0.609465,1.869926,1
2739260,0.932979,3.178617,-0.106421,-0.640529,-0.697481,1


In [None]:
# Export of the scaled data, currently disabled.
# NOTE(review): the target path below is the same file the loading cell reads.
# Running this line would replace the raw transactions with the scaled frame, which
# has no 'type' column. That is presumably how the "KeyError: 'type'" recorded on
# the loading cell arose (verify). If the export is re-enabled, write to a separate
# file, e.g. financials_scaled.csv, so the raw input stays intact.
#scaled_financials.to_csv('/Users/sa12/Documents/Repositories/FinancialFraudDetection/FraudData/financials.csv')