## Importing Libraries

In [25]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

## Loading Raw Dataset

In [7]:
# Read the raw dataset from disk, then report its dimensions and the number
# of null entries in each column.
raw_csv_path = '../data/raw/upi_fraud_dataset_raw.csv'
data = pd.read_csv(raw_csv_path)

null_counts = data.isnull().sum()
print("Dataset Shape:", data.shape)
print("\nMissing Values:\n", null_counts)

Dataset Shape: (10000, 19)

Missing Values:
 TransactionID           0
UserID                  0
Amount                  0
Timestamp               0
MerchantCategory        0
TransactionType         0
DeviceID                0
IPAddress               0
Latitude                0
Longitude               0
AvgTransactionAmount    0
TransactionFrequency    0
UnusualLocation         0
UnusualAmount           0
NewDevice               0
FailedAttempts          0
FraudFlag               0
PhoneNumber             0
BankName                0
dtype: int64


## Handling Missing Values

In [None]:
# No missing values: every column reports 0 nulls above, so no imputation is needed.

In [8]:
# Report how many rows are exact duplicates of an earlier row, then drop them
# (the first occurrence of each row is kept).
n_duplicates = int(data.duplicated().sum())
print("Duplicate Rows:", n_duplicates)

data = data.drop_duplicates()
print("New Shape After Removing Duplicates:", data.shape)

Duplicate Rows: 0
New Shape After Removing Duplicates: (10000, 19)


## Handling Data Types

In [17]:
# Inspect the dtype of every column before converting anything.
# NOTE(review): the saved output below already lists Hour/Day/Month/Weekday
# and has no Timestamp column, i.e. it was captured after the encoding and
# timestamp cells further down had run. A top-to-bottom run will print
# different dtypes here.
print(data.dtypes)

TransactionID             int64
UserID                    int64
Amount                  float64
MerchantCategory          int64
TransactionType           int64
DeviceID                  int64
IPAddress                 int32
Latitude                float64
Longitude               float64
AvgTransactionAmount    float64
TransactionFrequency      int32
UnusualLocation            bool
UnusualAmount              bool
NewDevice                  bool
FailedAttempts            int64
FraudFlag                  bool
PhoneNumber               int64
BankName                  int32
Hour                      int32
Day                       int32
Month                     int32
Weekday                   int32
dtype: object


### Objects to Numerical

In [15]:
# Integer-encode the categorical / identifier columns, fitting one
# LabelEncoder per column. Each fitted encoder is stored in `label_encoders`
# under its column name so the original values can be recovered later with
# `label_encoders[col].inverse_transform(...)`.
# LabelEncoder comes from the notebook's first import cell, so it is not
# re-imported here.
# NOTE(review): 'TransactionFrequency' looks numeric by name; if it is, label
# encoding replaces each value with its rank among the unique values rather
# than keeping the magnitude — confirm this is intended.
categorical_cols = ['UserID', 'MerchantCategory', 'TransactionType', 'DeviceID', 'IPAddress', 'BankName', 'TransactionFrequency']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

### Timestamp to DateTime

In [16]:
# Parse the timestamp column once, derive calendar features from it, and
# replace the original column with those features (appended at the end).
parsed_ts = pd.to_datetime(data['Timestamp'])

data = data.drop(columns=['Timestamp']).assign(
    Hour=parsed_ts.dt.hour,
    Day=parsed_ts.dt.day,
    Month=parsed_ts.dt.month,
    Weekday=parsed_ts.dt.weekday,  # Monday=0 ... Sunday=6
)

### Boolean to Numerical Types

In [18]:
# Cast the boolean flag columns (including the FraudFlag target) to integers.
bool_columns = ['UnusualLocation', 'UnusualAmount', 'NewDevice', 'FraudFlag']
data = data.astype({flag: int for flag in bool_columns})

In [21]:
# Re-check the dtypes after the conversions above: the four flag columns
# should now be integer and no object/bool columns should remain.
print(data.dtypes)

TransactionID             int64
UserID                    int64
Amount                  float64
MerchantCategory          int64
TransactionType           int64
DeviceID                  int64
IPAddress                 int32
Latitude                float64
Longitude               float64
AvgTransactionAmount    float64
TransactionFrequency      int32
UnusualLocation           int32
UnusualAmount             int32
NewDevice                 int32
FailedAttempts            int64
FraudFlag                 int32
PhoneNumber               int64
BankName                  int32
Hour                      int32
Day                       int32
Month                     int32
Weekday                   int32
dtype: object


## Removing Outliers

In [20]:
# IQR-based outlier filter on 'Amount': keep only rows whose amount lies
# within 1.5 * IQR below the first quartile and above the third quartile
# (bounds inclusive).
# Note: `data` is overwritten, so re-running this cell recomputes the
# quartiles on the already-filtered rows and may trim further.
Q1, Q3 = data['Amount'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

data = data[data['Amount'].between(lower_bound, upper_bound)]

## Feature Scaling

### For Standardization

In [26]:
# Create two independent copies of the cleaned frame so that each scaling
# variant starts from the same unscaled values; `data` itself is not modified.
# StandardScaler, MinMaxScaler and pandas come from the notebook's first
# import cell, so they are not re-imported here.
data_standardized = data.copy()
data_minmax = data.copy()

# Columns to rescale; the encoded categorical/ID columns and the binary flags
# are deliberately left out of this list.
num_cols = ['Amount', 'Latitude', 'Longitude', 'AvgTransactionAmount', 'FailedAttempts', 'Hour', 'Day', 'Month', 'Weekday']

# Standardize the selected columns in place on the copy. The fitted scaler is
# kept in `scaler_std` so the same transform can be reused or inverted.
# NOTE(review): the scaler is fit on the full dataset. train_test_split is
# imported but no split is visible here — if a split happens downstream, the
# test rows have already influenced these statistics.
scaler_std = StandardScaler()
data_standardized[num_cols] = scaler_std.fit_transform(data_standardized[num_cols])

### For MinMax Scaling

In [27]:
# Fit a min-max scaler on the selected numeric columns of the min-max copy,
# then overwrite those columns with their rescaled values. The fitted scaler
# stays available as `scaler_minmax`.
scaler_minmax = MinMaxScaler().fit(data_minmax[num_cols])
data_minmax[num_cols] = scaler_minmax.transform(data_minmax[num_cols])

## Handling Class Imbalance

### Main Dataset

In [31]:
# Share of each FraudFlag class in the unscaled dataset
# (normalize=True reports proportions instead of raw counts).
print("Class Distribution:\n", data['FraudFlag'].value_counts(normalize=True))

Class Distribution:
 FraudFlag
0    0.769244
1    0.230756
Name: proportion, dtype: float64


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority FraudFlag class of the unscaled dataset with SMOTE
# until minority/majority reaches the requested ratio.
# NOTE(review): X contains every column except FraudFlag, including the
# label-encoded categoricals and ID-like columns (TransactionID, PhoneNumber).
# SMOTE builds synthetic rows by interpolating between neighbours, so those
# columns can receive values that match no real category — consider SMOTENC
# or dropping identifiers first.
# NOTE(review): resampling happens before any train/test split is visible;
# confirm synthetic rows cannot end up in evaluation data.
X = data.drop(columns=['FraudFlag'])
y = data['FraudFlag']

smote = SMOTE(sampling_strategy=0.3, random_state=42)

# With a float sampling_strategy, imblearn raises ValueError when the request
# would add zero (or fewer) minority samples. That is exactly what happens if
# this cell is re-run after `data` has already been resampled, so check first.
class_counts = y.value_counts()
n_synthetic = int(class_counts.max() * smote.sampling_strategy - class_counts.min())

if n_synthetic > 0:
    X_resampled, y_resampled = smote.fit_resample(X, y)

    data = pd.DataFrame(X_resampled, columns=X.columns)
    data['FraudFlag'] = y_resampled
else:
    # Nothing to generate: leave `data` untouched but keep the names defined.
    X_resampled, y_resampled = X, y
    print("Class ratio already meets the SMOTE target; skipping resampling.")

In [32]:
# Class proportions of the unscaled dataset after the SMOTE step.
# NOTE(review): the saved output below is identical to the pre-SMOTE
# distribution printed earlier, and the SMOTE cell above has no execution
# count — re-run top to bottom to confirm the resampling actually took effect.
print("New Class Distribution:\n", data['FraudFlag'].value_counts(normalize=True))

New Class Distribution:
 FraudFlag
0    0.769244
1    0.230756
Name: proportion, dtype: float64


### Standardized Dataset

In [35]:
from imblearn.over_sampling import SMOTE

# Oversample the minority FraudFlag class of the standardized dataset.
# This `smote` object is also the one used by the MinMax cell that follows.
smote = SMOTE(sampling_strategy=0.3, random_state=42)

X_std = data_standardized.drop(columns=['FraudFlag'])
y_std = data_standardized['FraudFlag']

# With a float sampling_strategy, imblearn raises ValueError when the request
# would add zero (or fewer) minority samples — which is what happens if this
# cell is re-run on the already-resampled frame. Check before resampling.
std_counts = y_std.value_counts()
n_synthetic_std = int(std_counts.max() * smote.sampling_strategy - std_counts.min())

if n_synthetic_std > 0:
    X_resampled_std, y_resampled_std = smote.fit_resample(X_std, y_std)

    data_standardized = pd.DataFrame(X_resampled_std, columns=X_std.columns)
    data_standardized['FraudFlag'] = y_resampled_std
else:
    # Nothing to generate: leave the frame untouched but keep the names defined.
    X_resampled_std, y_resampled_std = X_std, y_std
    print("Standardized dataset already meets the SMOTE target ratio; skipping resampling.")

### MinMax Dataset

In [36]:
# Oversample the minority FraudFlag class of the min-max scaled dataset,
# reusing the `smote` object configured in the previous cell.
X_minmax = data_minmax.drop(columns=['FraudFlag'])
y_minmax = data_minmax['FraudFlag']

# With a float sampling_strategy, imblearn raises ValueError when the request
# would add zero (or fewer) minority samples — which is what happens if this
# cell is re-run on the already-resampled frame. Check before resampling.
minmax_counts = y_minmax.value_counts()
n_synthetic_minmax = int(minmax_counts.max() * smote.sampling_strategy - minmax_counts.min())

if n_synthetic_minmax > 0:
    X_resampled_minmax, y_resampled_minmax = smote.fit_resample(X_minmax, y_minmax)
    data_minmax = pd.DataFrame(X_resampled_minmax, columns=X_minmax.columns)
    data_minmax['FraudFlag'] = y_resampled_minmax
else:
    # Nothing to generate: leave the frame untouched but keep the names defined.
    X_resampled_minmax, y_resampled_minmax = X_minmax, y_minmax
    print("MinMax dataset already meets the SMOTE target ratio; skipping resampling.")

## Saving Datasets

In [38]:
# Write the three processed variants (unscaled, standardized, min-max scaled)
# to CSV without the index column.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (e.g. on a fresh clone where it is not checked in).
Path('../data/processed').mkdir(parents=True, exist_ok=True)

data.to_csv('../data/processed/upi_fraud_dataset_clean.csv', index=False)
data_standardized.to_csv('../data/processed/upi_fraud_dataset_standardized.csv', index=False)
data_minmax.to_csv('../data/processed/upi_fraud_dataset_minmax.csv', index=False)

print("Processed datasets saved successfully!")

Processed datasets saved successfully!
