In [1]:
import pandas as pd

In [2]:
# Load the transaction log into a DataFrame and preview the first rows.
csv_path = "PS_20174392719_1491204439457_log.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:
# Show the dtype pandas inferred for each column.
df.dtypes


step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object

Dropping the unwanted columns

1. nameOrig: This column is a unique identifier that belongs to each customer. Since the identifier is different in every row of the dataset, the machine learning algorithm will not be able to discern any patterns from this feature.
2. nameDest: This column is also a unique identifier that belongs to each customer and as such provides no value to the machine learning algorithm.
3. isFlaggedFraud: This column flags a transaction as fraudulent if a person tries
to transfer more than 200,000 in a single transaction. Since we already have a
feature called isFraud that flags a transaction as fraud, this feature becomes
redundant.

We can drop these features from the dataset by using the following code:

In [4]:
# Remove the two identifier columns and the redundant isFlaggedFraud flag,
# then preview the result.
columns_to_drop = ['nameOrig', 'nameDest', 'isFlaggedFraud']
df = df.drop(columns=columns_to_drop)
df.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,1,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0
1,1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1
4,1,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0


In [5]:
# Count the rows per label to see how imbalanced the isFraud target is.
df["isFraud"].value_counts()

isFraud
0    6354407
1       8213
Name: count, dtype: int64

In [6]:
# Count the rows per transaction type. Bracket access is used because
# 'type' is also the name of a Python builtin.
df["type"].value_counts()

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

### Reducing the size of the data
The dataset that we are working with contains over 6 million rows of data. Most machine
learning algorithms will take a large amount of time to work with a dataset of this size. In
order to make our execution time quicker, we will reduce the size of the dataset to roughly
20,000 rows: all 8,213 fraudulent transactions plus 12,000 non-fraudulent ones (20,213 rows
in total). We can do this by using the following code:

In [7]:
# Normalise every column name to lowercase.
df.columns = [name.lower() for name in df.columns]



In [8]:
# Show the column names after lowercasing.
df.columns.tolist()

['step',
 'type',
 'amount',
 'oldbalanceorg',
 'newbalanceorig',
 'oldbalancedest',
 'newbalancedest',
 'isfraud']

In [9]:
# Split the data by label using the isfraud column.
fraud_label = df["isfraud"]

# Rows labelled as fraudulent (isfraud == 1)
df_fraud = df.loc[fraud_label == 1]

# Rows labelled as non-fraudulent (isfraud == 0)
df_nofraud = df.loc[fraud_label == 0]

In [10]:
# (rows, columns) of the fraudulent subset.
df_fraud.shape

(8213, 8)

In [11]:
# Keep only the first 12,000 non-fraudulent rows, in file order.
# NOTE(review): this is a positional slice, not a random sample, so the kept
# rows all come from the earliest part of the log. Something like
# df_nofraud.sample(n=12000, random_state=<seed>) would give a representative
# sample -- confirm whether the positional slice is intended.
df_nofraud = df_nofraud.iloc[:12000]

In [12]:
# (rows, columns) of the reduced non-fraudulent subset.
df_nofraud.shape

(12000, 8)

In [13]:
# Stack the fraudulent rows on top of the non-fraudulent rows.
# The original row labels are kept, so the index is not 0..n-1 here.
frames_to_stack = [df_fraud, df_nofraud]
df_sort = pd.concat(frames_to_stack, axis="index")

df_sort

Unnamed: 0,step,type,amount,oldbalanceorg,newbalanceorig,oldbalancedest,newbalancedest,isfraud
2,1,TRANSFER,181.00,181.0,0.00,0.0,0.0,1
3,1,CASH_OUT,181.00,181.0,0.00,21182.0,0.0,1
251,1,TRANSFER,2806.00,2806.0,0.00,0.0,0.0,1
252,1,CASH_OUT,2806.00,2806.0,0.00,26202.0,0.0,1
680,1,TRANSFER,20128.00,20128.0,0.00,0.0,0.0,1
...,...,...,...,...,...,...,...,...
12067,7,PAYMENT,9443.02,898.0,0.00,0.0,0.0,0
12068,7,PAYMENT,9105.47,358278.0,349172.53,0.0,0.0,0
12069,7,PAYMENT,2761.71,0.0,0.00,0.0,0.0,0
12070,7,PAYMENT,10204.13,0.0,0.00,0.0,0.0,0


In [14]:
# Preview the first 20 rows of the combined frame.
df_sort.iloc[:20]

Unnamed: 0,step,type,amount,oldbalanceorg,newbalanceorig,oldbalancedest,newbalancedest,isfraud
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1
251,1,TRANSFER,2806.0,2806.0,0.0,0.0,0.0,1
252,1,CASH_OUT,2806.0,2806.0,0.0,26202.0,0.0,1
680,1,TRANSFER,20128.0,20128.0,0.0,0.0,0.0,1
681,1,CASH_OUT,20128.0,20128.0,0.0,6268.0,12145.85,1
724,1,CASH_OUT,416001.33,0.0,0.0,102.0,9291619.62,1
969,1,TRANSFER,1277212.77,1277212.77,0.0,0.0,0.0,1
970,1,CASH_OUT,1277212.77,1277212.77,0.0,0.0,2444985.19,1
1115,1,TRANSFER,35063.63,35063.63,0.0,0.0,0.0,1


##### The data is not shuffled: the first 8213 rows are all fraudulent transactions.
##### We can shuffle it later when applying scikit-learn's train_test_split with a fixed "random_state".

In [15]:
from sklearn.preprocessing import OneHotEncoder

# Step 1: create the encoder for the categorical 'type' column.
# sparse_output=False must be passed explicitly: scikit-learn's default is
# sparse_output=True, which would make fit_transform return a SciPy sparse
# matrix instead of a dense NumPy array.
one_hot_encoder = OneHotEncoder(
    sparse_output=False,
)





In [16]:
# Step 2: learn the categories of 'type' and encode every row in one pass.
# The double brackets select a one-column DataFrame (2-D) rather than a
# Series, which is the input shape the encoder expects.
type_column = df_sort[['type']]
one_hot_encoded = one_hot_encoder.fit_transform(type_column)


In [17]:
# Print the encoded array (one row per transaction, one column per category).
header = "One-hot encoded values:\n"
print(header, one_hot_encoded)

One-hot encoded values:
 [[0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 ...
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]]


In [18]:
# get_feature_names_out(['type']) builds one column name per category found
# in 'type', prefixed with the feature name: type_CASH_IN, type_CASH_OUT, etc.
feature_names = one_hot_encoder.get_feature_names_out(['type'])

# Wrap the encoded array in a DataFrame labelled with those names.
encoded_df = pd.DataFrame(data=one_hot_encoded, columns=feature_names)



In [19]:
# Display the one-hot encoded frame.
encoded_df

Unnamed: 0,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...
20208,0.0,0.0,0.0,1.0,0.0
20209,0.0,0.0,0.0,1.0,0.0
20210,0.0,0.0,0.0,1.0,0.0
20211,0.0,0.0,0.0,1.0,0.0


In [20]:
# Join the original columns and the one-hot columns side by side.
# df_sort still carries row labels from the full dataset while encoded_df is
# labelled 0..n-1, so the index is reset first; otherwise concat would align
# on mismatched labels.
sort_reindexed = df_sort.reset_index(drop=True)
df_final = pd.concat([sort_reindexed, encoded_df], axis="columns")



In [21]:
# (rows, columns) after appending the one-hot columns.
df_final.shape

(20213, 13)

In [22]:
# Show the column names of the combined frame.
df_final.columns.tolist()

['step',
 'type',
 'amount',
 'oldbalanceorg',
 'newbalanceorig',
 'oldbalancedest',
 'newbalancedest',
 'isfraud',
 'type_CASH_IN',
 'type_CASH_OUT',
 'type_DEBIT',
 'type_PAYMENT',
 'type_TRANSFER']

In [23]:
# The one-hot columns now carry the same information, so remove the original
# 'type' column and preview the result.
df_final = df_final.drop(columns='type')
df_final.head()

Unnamed: 0,step,amount,oldbalanceorg,newbalanceorig,oldbalancedest,newbalancedest,isfraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,181.0,181.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,1.0
1,1,181.0,181.0,0.0,21182.0,0.0,1,0.0,1.0,0.0,0.0,0.0
2,1,2806.0,2806.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,1.0
3,1,2806.0,2806.0,0.0,26202.0,0.0,1,0.0,1.0,0.0,0.0,0.0
4,1,20128.0,20128.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,1.0


In [24]:
# (rows, columns) after dropping the 'type' column.
df_final.shape

(20213, 12)

In [25]:
# Per column: True if at least one value is missing (NaN), otherwise False.
df_final.isna().any()

step              False
amount            False
oldbalanceorg     False
newbalanceorig    False
oldbalancedest    False
newbalancedest    False
isfraud           False
type_CASH_IN      False
type_CASH_OUT     False
type_DEBIT        False
type_PAYMENT      False
type_TRANSFER     False
dtype: bool

Here we note that none of the columns has any missing values (NaN).

In [26]:
# Number of missing values in each column.
df_final.isna().sum()

step              0
amount            0
oldbalanceorg     0
newbalanceorig    0
oldbalancedest    0
newbalancedest    0
isfraud           0
type_CASH_IN      0
type_CASH_OUT     0
type_DEBIT        0
type_PAYMENT      0
type_TRANSFER     0
dtype: int64

In [27]:
# Replace any NaN with 0. The preceding checks found no missing values in
# this data, so this call leaves the frame unchanged.
df_final = df_final.fillna(value=0)

In [28]:
# Re-check the per-column count of missing values after fillna.
df_final.isna().sum()

step              0
amount            0
oldbalanceorg     0
newbalanceorig    0
oldbalancedest    0
newbalancedest    0
isfraud           0
type_CASH_IN      0
type_CASH_OUT     0
type_DEBIT        0
type_PAYMENT      0
type_TRANSFER     0
dtype: int64

In [30]:
# Persist the prepared dataset for later classification work.
# index=False keeps the row index out of the written file.
output_path = 'fraud_prediction.csv'
df_final.to_csv(output_path, index=False)