# Fraud Transaction Analysis
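Analysis of fraud patterns in transaction data, with key visualizations. The notebook loads and merges the raw transaction extracts, cleans the merged data, and validates the result.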

 ## Load and Prepare Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os


# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Plot defaults: base style first, then palette, then figure size / font size
# (the rcParams overrides must come after the style is applied).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
})

def load_and_prepare_data(file_path):
    """Load a transaction CSV and prepare it for analysis.

    Reads the file, converts ``Amount`` to float, parses the two date
    columns, derives a boolean ``Is_Fraud`` column, strips leading zeros
    from the ID columns and prints a short overview of the data.

    Args:
        file_path: Path (or any other source accepted by
            ``pandas.read_csv``) of the raw transaction CSV. It must contain
            the columns ``Amount``, ``Post Date``, ``Account Open Date``,
            ``Fraud Adjustment Indicator``, ``Member ID`` and ``Account ID``.

    Returns:
        pandas.DataFrame: The prepared frame, including ``Is_Fraud``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        KeyError: If one of the required columns is missing.
        ValueError: If an ``Amount`` value is still non-numeric after ``$``
            and ``,`` have been removed.
    """
    df = pd.read_csv(file_path)

    # Strip currency symbols / thousands separators, then convert to float.
    # Raw string: '\$' in a normal literal is an invalid escape sequence.
    df['Amount'] = df['Amount'].replace(r'[\$,]', '', regex=True).astype(float)

    # Unparseable dates become NaT instead of raising
    df['Post Date'] = pd.to_datetime(df['Post Date'], errors='coerce')
    df['Account Open Date'] = pd.to_datetime(df['Account Open Date'], errors='coerce')

    # A row is fraud when the indicator is present and not just whitespace.
    # astype(str) keeps the .str accessor usable when the column was read as
    # a non-string dtype (e.g. float64 when every value is empty); the
    # notna() mask discards the resulting 'nan' strings.
    indicator = df['Fraud Adjustment Indicator']
    df['Is_Fraud'] = indicator.notna() & (indicator.astype(str).str.strip() != '')

    # Normalise IDs to strings without leading zeros
    df['Member ID'] = df['Member ID'].astype(str).str.lstrip('0')
    df['Account ID'] = df['Account ID'].astype(str).str.lstrip('0')

    # Basic statistics (guard the rate against an empty file)
    total_transactions = len(df)
    fraud_cases = int(df['Is_Fraud'].sum())
    fraud_rate = fraud_cases / total_transactions * 100 if total_transactions else 0.0

    print(f"Dataset Overview:")
    print(f"- Total transactions: {total_transactions:,}")
    print(f"- Fraud cases: {fraud_cases:,}")
    print(f"- Fraud rate: {fraud_rate:.4f}%")

    # min()/max() are NaT when no post date could be parsed, and NaT cannot
    # be formatted with strftime.
    first_post = df['Post Date'].min()
    last_post = df['Post Date'].max()
    if pd.isna(first_post) or pd.isna(last_post):
        print("- Date range: n/a (no valid post dates)")
    else:
        print(f"- Date range: {first_post.strftime('%Y-%m-%d')} to {last_post.strftime('%Y-%m-%d')}")

    print(f"- Unique accounts: {df['Account ID'].nunique():,}")
    print(f"- Unique members: {df['Member ID'].nunique():,}")

    return df

# Load and prepare both raw transaction extracts
file_path_1 = '../../data/raw/transaction_data.csv'
file_path_2 = '../../data/raw/TransactionData09232025.csv'
df1 = load_and_prepare_data(file_path_1)
df2 = load_and_prepare_data(file_path_2)

# Stack the two extracts and drop rows that are exact duplicates
df_combined = pd.concat([df1, df2], ignore_index=True)
df_combined.drop_duplicates(inplace=True)

# NOTE(review): this is an absolute path at the filesystem root, unlike the
# relative '../../data/...' paths used for the inputs -- confirm it is
# intended. It is left unchanged here because the cleaning cell reads the
# merged file from the same location.
output_path = "/processed/transaction_data_merged.csv"

# to_csv does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_combined.to_csv(output_path, index=False)

# Display basic info
print(f"\nDataset shape: {df_combined.shape}")
display(df_combined.head())

FileNotFoundError: [Errno 2] No such file or directory: '../../data/raw/transaction_data.csv'

In [None]:
import pandas as pd
import numpy as np

merged_file = "/processed/transaction_data_merged.csv"
cleaned_file = "../../data/processed/transaction_data_cleaned.csv"

# Load the merged transactions
df = pd.read_csv(merged_file)

# Per-column count and percentage of missing values, before any cleaning
missing_tbl = df.isna().sum().to_frame(name='missing_count')
missing_tbl['missing_pct'] = missing_tbl['missing_count'].div(len(df)).mul(100).round(2)

# Show only columns that actually have gaps, worst first (at most 30)
has_missing = missing_tbl['missing_count'] > 0
top_missing = missing_tbl.loc[has_missing].sort_values('missing_count', ascending=False).head(30)

print("\nMissing overview BEFORE cleaning:")
print(top_missing)


# Amount: remember which rows were missing, then impute with the column mean
df['Amount_missing'] = df['Amount'].isna()
amount_mean = df['Amount'].mean(skipna=True)
df['Amount'] = df['Amount'].fillna(amount_mean)

# Descriptive columns: label missing values explicitly instead of leaving NaN
for col in ['Type', 'Product ID', 'Action Type', 'Transaction Description']:
    if col in df.columns:
        df[col] = df[col].fillna("Unknown")


# Member Age == 0 is replaced below (presumably 0 is a placeholder for an
# unknown age -- confirm against the data source).
if 'Member Age' in df.columns:
    age_is_zero = df['Member Age'] == 0
    age_zero_count = age_is_zero.sum()
    print(f"Age==0 numbers: {age_zero_count}")

    # Replace zeros with the MEDIAN of the positive ages.
    # Series.mask is used instead of a .loc assignment so that a fractional
    # median is not written into an integer column in place (a silent upcast
    # that pandas has deprecated); mask returns a suitably upcast column.
    age_median = df.loc[df['Member Age'] > 0, 'Member Age'].median()
    df['Member Age'] = df['Member Age'].mask(age_is_zero, age_median)

# Outliers: keep non-fraud amounts within [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
Q1 = df['Amount'].quantile(0.25)
Q3 = df['Amount'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

def filter_outliers(row):
    """Return True when a single transaction row should be kept.

    Fraud rows are always kept; any other row is kept only when its
    ``Amount`` lies within ``[lower_bound, upper_bound]`` (inclusive).
    Kept for row-wise use; the filtering below applies the same rule to
    whole columns at once.
    """
    if row['Is_Fraud']:  # fraud rows are always kept
        return True
    # regular rows -> keep only if the amount is inside the bounds
    return (lower_bound <= row['Amount'] <= upper_bound)

# Vectorized equivalent of df.apply(filter_outliers, axis=1): the same rule,
# without a Python-level call per row, and it still yields a boolean Series
# when df is empty. Series.between is inclusive on both ends by default.
keep_mask = df['Is_Fraud'].astype(bool) | df['Amount'].between(lower_bound, upper_bound)
df = df[keep_mask]


# to_csv does not create missing directories, so make sure the target exists
import os

os.makedirs(os.path.dirname(cleaned_file), exist_ok=True)
df.to_csv(cleaned_file, index=False)
print(f"Cleaned 01_data_cleaning saved to: {cleaned_file}")
print(f"Final dataset shape: {df.shape}")
display(df.head())


In [None]:
print("\n=== Data Cleaning Validation ===")

# No column should have unexpected gaps after imputation
print("Missing values after cleaning:")
print(df.isna().sum())

if 'Member Age' in df.columns:
    print(f"Remaining Age==0 count: {(df['Member Age'] == 0).sum()}")

# Re-read the merged file and check that no fraud row was dropped.
# This relies on df still carrying the row labels it had when it was read
# from merged_file, so labels missing from df.index are the removed rows.
df_raw = pd.read_csv(merged_file)
fraud_removed = df_raw[df_raw['Is_Fraud'] & ~df_raw.index.isin(df.index)]
print(f"Fraud rows removed during cleaning: {len(fraud_removed)}")

import matplotlib.pyplot as plt
import numpy as np

# Use one set of bin edges for both histograms. With bins=50 passed to each
# call separately, every series gets its own 50 bins over its own range, so
# the bar heights of "Raw" and "Cleaned" would not be comparable.
shared_bins = np.histogram_bin_edges(df_raw['Amount'].dropna(), bins=50)

plt.figure(figsize=(10,5))
df_raw['Amount'].hist(alpha=0.5, bins=shared_bins, label='Raw')
df['Amount'].hist(alpha=0.5, bins=shared_bins, label='Cleaned')
plt.xlabel("Amount")
plt.ylabel("Frequency")
plt.title("Amount Distribution Before vs After Cleaning")
plt.legend()
plt.show()


In [13]:
import matplotlib.pyplot as plt
import pandas as pd

raw = pd.read_csv(merged_file)
total_rows = len(raw)

# Missing-value count per column, keeping only columns that have gaps,
# ordered from most to least missing
na_per_column = raw.isna().sum()
missing_counts = na_per_column.loc[na_per_column > 0].sort_values(ascending=False)

# Same figures expressed as a percentage of all rows
missing_pct = missing_counts / total_rows * 100

# Horizontal bar chart of the missing percentage per column. The series is
# reversed before plotting, so its first entry ends up as the top bar.
bar_labels = missing_pct.index[::-1]
bar_values = missing_pct.values[::-1]

plt.figure(figsize=(10, max(4, 0.35*len(missing_pct))))
plt.barh(bar_labels, bar_values)

# Write each value just to the right of the end of its bar
for bar_pos, pct in enumerate(bar_values):
    plt.text(pct + 0.2, bar_pos, f'{pct:.2f}%', va='center')

plt.xlabel('Missing percentage (%)')
plt.title('Missing values per column (all)')
plt.tight_layout()
plt.show()

# Pie chart of the missing-value breakdown, leaving out
# 'Fraud Adjustment Indicator' (the "dominant column" named in the title)
pie_series = missing_pct.drop('Fraud Adjustment Indicator', errors='ignore')

if pie_series.empty:
    print("No columns to plot in the pie chart after excluding the dominant column.")
else:
    # Fold every column with less than 1% missing into a single 'Other' slice
    small_mask = pie_series < 1
    other_sum = pie_series[small_mask].sum()
    pie_series = pie_series[~small_mask]
    if other_sum > 0:
        pie_series.loc['Other (<1%)'] = other_sum

    def _slice_text(p):
        """Label a slice with its share of the pie; hide labels below 1%."""
        if p >= 1:
            return f'{p:.2f}%'
        return ''

    plt.figure(figsize=(8,8))
    wedges, _, autotexts = plt.pie(
        pie_series.values,
        labels=None,
        startangle=90,
        autopct=_slice_text,
    )

    # Column names go in a legend beside the pie rather than on the slices
    legend_labels = [f'{k}' for k in pie_series.index]
    plt.legend(
        wedges,
        legend_labels,
        title='Columns',
        loc='center left',
        bbox_to_anchor=(1.0, 0.5),
    )
    plt.title('Missing value breakdown (excluding dominant column)')
    plt.tight_layout()
    plt.show()





FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/transaction_data_merged.csv'