## Importing Libraries

In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load Dataset

In [77]:
# Load the raw UPI transaction dataset; the path is relative to the notebook's directory.
data = pd.read_csv('../data/raw/upi_fraud_dataset_raw.csv')

# DataFrame.info() writes its summary straight to stdout and returns None, so
# wrapping it in print() would append a stray "None" line after the summary.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   TransactionID         10000 non-null  int64  
 1   UserID                10000 non-null  object 
 2   Amount                10000 non-null  float64
 3   Timestamp             10000 non-null  object 
 4   MerchantCategory      10000 non-null  object 
 5   TransactionType       10000 non-null  object 
 6   DeviceID              10000 non-null  object 
 7   IPAddress             10000 non-null  object 
 8   Latitude              10000 non-null  float64
 9   Longitude             10000 non-null  float64
 10  AvgTransactionAmount  10000 non-null  float64
 11  TransactionFrequency  10000 non-null  object 
 12  UnusualLocation       10000 non-null  bool   
 13  UnusualAmount         10000 non-null  bool   
 14  NewDevice             10000 non-null  bool   
 15  FailedAttempts      

## Check for Missing Values

In [78]:
# Count the null entries in each column and print the per-column totals.
missing_per_column = data.isnull().sum()
print(missing_per_column)

## Summary statistics for numerical features

In [79]:
# Descriptive statistics as produced by DataFrame.describe() with its default settings.
summary_stats = data.describe()
print(summary_stats)

## Distribution of the target variable (FraudFlag)

In [80]:
# Skip the whole cell when the dataset does not carry the target column.
if 'FraudFlag' in data.columns:
    fraud_counts = data['FraudFlag'].value_counts()
    print("\nDistribution of FraudFlag:")
    print(fraud_counts)

    # Bar chart of how many rows fall into each FraudFlag value.
    plt.figure(figsize=(4, 4))
    ax = sns.countplot(data=data, x='FraudFlag')
    ax.set_title('Distribution of FraudFlag (0 = Legitimate, 1 = Fraud)')
    plt.show()

## Correlation matrix for numerical features

In [81]:
# Columns whose dtype is a NumPy numeric type; bool and object columns are not selected.
# NOTE(review): the int64 TransactionID column is selected here too, so it
# appears in the matrix — confirm whether an identifier should be included.
numerical_features = data.select_dtypes(include=[np.number]).columns

# A correlation matrix needs at least two numeric columns to be informative.
if len(numerical_features) >= 2:
    corr_matrix = data[numerical_features].corr()

    plt.figure(figsize=(4, 3))
    ax = sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
    ax.set_title('Correlation Matrix')
    plt.show()

## Distribution of transaction amounts

In [82]:
# Histogram of the Amount column (50 bins) with a kernel density estimate overlaid.
plt.figure(figsize=(7, 4))
ax = sns.histplot(data["Amount"], bins=50, kde=True, color="blue")
ax.set_title("Distribution of Transaction Amounts")
ax.set_xlabel("Transaction Amount")
ax.set_ylabel("Frequency")
plt.show()

## Outlier Detection in Transaction Amounts

In [83]:
# Horizontal box plot of the Amount column.
transaction_amounts = data["Amount"]

plt.figure(figsize=(7, 4))
ax = sns.boxplot(x=transaction_amounts, color="red")
ax.set_title("Outlier Detection in Transaction Amounts")
plt.show()

## Count of transactions by TransactionType

In [84]:
# Number of rows per TransactionType value.
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=data, x="TransactionType")
ax.set_title("Transaction Type Distribution")
ax.set_xlabel("Transaction Type")
ax.set_ylabel("Count")
plt.show()

## Count of transactions by MerchantCategory

In [85]:
# Order the bars by how often each category occurs, as returned by value_counts().
category_order = data["MerchantCategory"].value_counts().index

# Horizontal bars; hue duplicates the y variable only so the palette can be
# applied per category, and the resulting legend is suppressed.
plt.figure(figsize=(12, 5))
ax = sns.countplot(
    data=data,
    y="MerchantCategory",
    hue="MerchantCategory",
    order=category_order,
    palette="viridis",
    legend=False,
)
ax.set_title("Transactions by Merchant Category")
ax.set_xlabel("Count")
ax.set_ylabel("Merchant Category")
plt.show()


## Distribution of transactions over time

In [86]:
# Skip the time-based plots when the dataset has no Timestamp column.
if 'Timestamp' in data.columns:
    # Parse the timestamps once, then derive the time features from the parsed
    # series. These three columns are written back onto `data` itself.
    parsed_timestamps = pd.to_datetime(data['Timestamp'])
    data['Timestamp'] = parsed_timestamps
    data['Hour'] = parsed_timestamps.dt.hour
    data['DayOfWeek'] = parsed_timestamps.dt.day_name()

    # Transactions by hour
    plt.figure(figsize=(6, 4))
    hour_ax = sns.countplot(data=data, x='Hour', hue='Hour', palette='Set3', legend=False)
    hour_ax.set_title('Transactions by Hour of the Day')
    hour_ax.set_xlabel('Hour')
    hour_ax.set_ylabel('Count')
    plt.show()

    # Transactions by day of the week, shown Monday through Sunday rather than
    # in order of appearance in the data.
    weekday_order = [
        'Monday', 'Tuesday', 'Wednesday', 'Thursday',
        'Friday', 'Saturday', 'Sunday',
    ]
    plt.figure(figsize=(7, 4))
    day_ax = sns.countplot(
        data=data,
        x='DayOfWeek',
        hue='DayOfWeek',
        order=weekday_order,
        palette='Set1',
        legend=False,
    )
    day_ax.set_title('Transactions by Day of the Week')
    day_ax.set_xlabel('Day of the Week')
    day_ax.set_ylabel('Count')
    plt.show()
