In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import chi2_contingency

In [26]:
# Read the feature-engineered transactions CSV into the working DataFrame.
# The location is kept in `data_path` so it is visible and easy to change.
data_path = (
    '/content/drive/MyDrive/Fraud Detection/bank_transactions_featured.csv'
)
df = pd.read_csv(filepath_or_buffer=data_path)

In [27]:
# Make sure the folder that receives the EDA outputs exists.
# exist_ok=True keeps this cell safe to re-run when the folder is already there.
eda_output_dir = '/content/drive/MyDrive/Fraud Detection'
os.makedirs(eda_output_dir, exist_ok=True)

# Dataset Overview

In [29]:
# Textual overview of the loaded DataFrame: shape, column names, dtypes,
# null counts, and the percentage split of the two engineered flag columns.
print("✅ Dataset Shape:", df.shape)
print("\n✅ Columns Overview:")
print(df.columns.tolist())

print("\n✅ Data Types and Nulls:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()
print("\n✅ Missing Values per Column:")
print(df.isnull().sum())

# value_counts(normalize=True) yields fractions; multiply by 100 for percent.
print("\n✅ Large Transaction Distribution (%):")
print(df['is_large_transaction'].value_counts(normalize=True) * 100)

print("\n✅ Unusual Location Distribution (%):")
print(df['is_unusual_location'].value_counts(normalize=True) * 100)

✅ Dataset Shape: (2372, 26)

✅ Columns Overview:
['TransactionID', 'AccountID', 'TransactionAmount', 'TransactionDate', 'TransactionType', 'Location', 'DeviceID', 'IP Address', 'MerchantID', 'Channel', 'CustomerAge', 'CustomerOccupation', 'TransactionDuration', 'LoginAttempts', 'AccountBalance', 'PreviousTransactionDate', 'is_large_transaction', 'log_transaction_amount', 'transaction_hour', 'transaction_day_of_week', 'odd_hour_transaction', 'user_transaction_count', 'user_avg_transaction_amount', 'deviation_from_user_avg', 'user_primary_location', 'is_unusual_location']

✅ Data Types and Nulls:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2372 entries, 0 to 2371
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   TransactionID                2372 non-null   object 
 1   AccountID                    2372 non-null   object 
 2   TransactionAmount            2372 non-null   float64
 

# Feature Distributions

In [30]:
# Count plot of the is_large_transaction flag, written to a PNG file.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='is_large_transaction', ax=ax)
ax.set_title('Large Transaction Distribution')
fig.savefig(
    '/content/drive/MyDrive/Fraud Detection/large_transaction_distribution.png'
)
# Release the figure once it has been written to disk.
plt.close(fig)

In [31]:
# Count plot of the is_unusual_location flag, written to a PNG file.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='is_unusual_location', ax=ax)
ax.set_title('Unusual Location Distribution')
fig.savefig(
    '/content/drive/MyDrive/Fraud Detection/unusual_location_distribution.png'
)
# Release the figure once it has been written to disk.
plt.close(fig)

# Transaction Amount Analysis

In [33]:
# Histogram (50 bins, with KDE overlay) of TransactionAmount, saved as a PNG.
amounts = df['TransactionAmount']
# The x-axis is cut at the 99th percentile to trim extreme outliers from view;
# the bins themselves are still computed over the full series.
upper_limit = amounts.quantile(0.99)

fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(amounts, bins=50, kde=True, ax=ax)
ax.set_title('Transaction Amount Distribution')
ax.set_xlim(0, upper_limit)
fig.savefig('/content/drive/MyDrive/Fraud Detection/amount_distribution.png')
plt.close(fig)

# Time Features Analysis

In [34]:
# Transactions per hour of day; the plot is skipped when the column is absent.
if 'transaction_hour' in df.columns:
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.countplot(data=df, x='transaction_hour', ax=ax)
    ax.set_title('Transactions by Hour of Day')
    fig.savefig(
        '/content/drive/MyDrive/Fraud Detection/transactions_by_hour.png'
    )
    plt.close(fig)

In [35]:
# Transactions per day of week; the plot is skipped when the column is absent.
if 'transaction_day_of_week' in df.columns:
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.countplot(data=df, x='transaction_day_of_week', ax=ax)
    ax.set_title('Transactions by Day of Week')
    fig.savefig(
        '/content/drive/MyDrive/Fraud Detection/transactions_by_day_of_week.png'
    )
    plt.close(fig)

# Cross-Feature Analysis

In [36]:
# Counts of is_large_transaction, split by is_unusual_location via hue,
# saved as a PNG.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(
    data=df,
    x='is_large_transaction',
    hue='is_unusual_location',
    ax=ax,
)
ax.set_title('Large Transaction vs Unusual Location')
fig.savefig('/content/drive/MyDrive/Fraud Detection/large_vs_unusual.png')
plt.close(fig)

# Final status line for the notebook run.
print("\n✅ Corrected EDA Completed. Plots saved under /content/drive/MyDrive/Fraud Detection/ ✅")


✅ Corrected EDA Completed. Plots saved under /content/drive/MyDrive/Fraud Detection/ ✅
