In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Loading Raw Dataset

In [8]:
# Read the raw UPI fraud CSV into the working frame `data`.
raw_csv_path = '../data/raw/upi_fraud_dataset_raw.csv'
data = pd.read_csv(raw_csv_path)

# Quick overview: frame dimensions and the per-column count of nulls.
null_counts = data.isnull().sum()
print("Dataset Shape:", data.shape)
print("\nMissing Values:\n", null_counts)

# Last expression of the cell: show the first rows as rich output.
data.head()

## Handling Missing Values

In [None]:
# There are no missing values in the dataset.

## Removal of Duplicates

In [9]:
# Count fully duplicated rows before removing them.
n_duplicate_rows = data.duplicated().sum()
print("Duplicate Rows:", n_duplicate_rows)

# Keep the first occurrence of each row and report the resulting shape.
data = data.drop_duplicates()
print("New Shape After Removing Duplicates:", data.shape)

## Encoding Categorical Features

In [10]:
# Label-encode the categorical columns, keeping one fitted encoder per column.
# A single shared LabelEncoder is re-fitted on every loop iteration, so only
# the mapping of the last column would survive. Storing each encoder in
# `label_encoders` lets the integer codes be mapped back to the original
# labels (inverse_transform) and lets new data be encoded consistently.
label_encoders = {}
for col in ['TransactionType', 'MerchantCategory', 'DeviceID', 'BankName']:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder
# `encoder` is still bound to the encoder of the last column, as before.
data.head()

## Dropping Unnecessary Columns

In [11]:
# Remove the IP address and phone number columns from the working frame.
data = data.drop(['IPAddress', 'PhoneNumber'], axis=1)

## Convert Boolean columns into 0 or 1

In [12]:
# Flag columns to cast to int (per the section heading these are Boolean,
# so True/False become 1/0).
bool_cols = ['UnusualLocation', 'UnusualAmount', 'NewDevice', 'FraudFlag']
flags_as_int = data[bool_cols].astype(int)
data[bool_cols] = flags_as_int

## Encoding TransactionFrequency to numerical values

In [13]:
# Pull the first run of digits out of each TransactionFrequency string and
# store it as a float; entries without any digits become NaN.
frequency_digits = data['TransactionFrequency'].str.extract('(\\d+)')
data['TransactionFrequency'] = frequency_digits.astype(float)

## Label Encoding For Categorical Columns

In [14]:
# 'DeviceID' and 'BankName' are also label-encoded by the earlier
# "Encoding Categorical Features" cell. Re-fitting a LabelEncoder on the
# resulting integer codes leaves the values unchanged but produces an encoder
# whose classes_ are the codes rather than the original labels. Columns that
# are already numeric are therefore skipped; only columns still holding raw
# labels are encoded here.
for col in ['DeviceID', 'BankName']:
    if pd.api.types.is_numeric_dtype(data[col]):
        continue  # already encoded upstream; nothing to do
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])

## Removing Outliers using IQR

In [17]:
# Drop rows that fall outside the 1.5 * IQR fences on any continuous column.
#
# The 0/1 flag columns and the label-encoded categorical columns are numeric
# at this point, but they must not take part in the IQR test:
#   * for a 0/1 flag whose minority value covers fewer than 25% of the rows,
#     Q1 == Q3, so IQR == 0 and every minority row is flagged as an outlier --
#     applied to 'FraudFlag' this can remove every fraud row;
#   * label-encoded codes are arbitrary integers, so their magnitude carries
#     no meaning for an outlier test.
non_continuous_cols = [
    'UnusualLocation', 'UnusualAmount', 'NewDevice', 'FraudFlag',   # 0/1 flags
    'TransactionType', 'MerchantCategory', 'DeviceID', 'BankName',  # label-encoded
]
numeric_data = data.select_dtypes(include=[np.number])
iqr_cols = [col for col in numeric_data.columns if col not in non_continuous_cols]
numeric_data = numeric_data[iqr_cols]

Q1 = numeric_data.quantile(0.25)
Q3 = numeric_data.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is an outlier if at least one of its continuous values is outside
# the fences. Comparisons involving NaN are False, so NaN never flags a row.
outliers = ((numeric_data < lower_bound) | (numeric_data > upper_bound)).any(axis=1)

# .copy() makes the filtered frame independent of the original, so later
# column assignments do not operate on a slice (SettingWithCopyWarning).
data = data[~outliers].copy()

## Feature Scaling using StandardScaler

In [18]:
# Columns to standardise with StandardScaler.
numeric_cols = ['Amount', 'Latitude', 'Longitude', 'AvgTransactionAmount', 'TransactionFrequency', 'FailedAttempts']

# Fit the scaler on those columns, then overwrite them with the scaled values.
scaler = StandardScaler()
scaler.fit(data[numeric_cols])
data[numeric_cols] = scaler.transform(data[numeric_cols])

## Saving Dataset

In [None]:
# Write the processed frame to disk without the index column.
cleaned_csv_path = "../data/processed/upi_fraud_dataset_cleaned.csv"
data.to_csv(cleaned_csv_path, index=False)