In [3]:
# Fraud Detection - Data Cleaning and Preparation

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings

# Silence all warnings so the cell output stays readable.
# NOTE(review): this is a blanket filter; it also hides deprecation notices.
warnings.filterwarnings("ignore")

# Setup plotting
plt.style.use("seaborn-v0_8")
sns.set_palette("husl")
plt.rcParams["figure.figsize"] = (12, 8)

print("üîß Fraud Detection - Data Cleaning")
print("=" * 50)

# Load data (paths are relative to the kernel's working directory)
print("üìÅ Loading data...")
users = pd.read_csv("../../data/raw/fraud_users.csv")
transactions = pd.read_csv("../../data/raw/fraud_transactions.csv")

print(f"Users data: {users.shape}")
print(f"Transactions data: {transactions.shape}")

# Data Overview
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appended a stray "None" line.
print("\nüìä Data Overview:")
print("\nUsers Data Info:")
users.info()
print("\nTransactions Data Info:")
transactions.info()

# Check for missing values (per-column count of null cells)
print("\nüîç Missing Values:")
print("Users:")
print(users.isnull().sum())
print("\nTransactions:")
print(transactions.isnull().sum())

# Check basic statistics
print("\nüìà Basic Statistics:")
print("Transactions amount statistics:")
print(transactions["amount"].describe())

# Mean of the is_fraud column, shown as a fraction and as a percentage.
# Computed once instead of once per format field.
fraud_rate = transactions["is_fraud"].mean()
print(f"\nFraud rate: {fraud_rate:.4f} ({fraud_rate:.2%})")


# Data Cleaning Functions
def clean_users_data(users_df):
    """Return a cleaned copy of the users table.

    Steps, in order: parse ``signup_date`` to datetime, drop every row that
    contains any missing value, then keep the first row per ``user_id``.

    Args:
        users_df: DataFrame with at least ``user_id`` and ``signup_date``
            columns. It is not modified.

    Returns:
        A new DataFrame with ``signup_date`` parsed, no missing values and a
        unique ``user_id`` per row.

    Raises:
        KeyError: if ``signup_date`` or ``user_id`` is not a column.
        Any exception raised by ``pd.to_datetime`` for unparseable dates
        propagates unchanged.
    """
    print("\nüßπ Cleaning users data...")

    # Work on a copy: assigning the parsed dates onto the argument itself would
    # silently change the caller's raw frame and make re-running the cell
    # operate on already-modified data.
    users_clean = users_df.copy()

    # Convert date columns
    users_clean["signup_date"] = pd.to_datetime(users_clean["signup_date"])

    # Handle any missing values (a row is dropped if ANY column is null)
    users_clean = users_clean.dropna()

    # Remove duplicates (first occurrence of each user_id is kept)
    users_clean = users_clean.drop_duplicates(subset=["user_id"])

    print(f"Users after cleaning: {users_clean.shape}")
    return users_clean


def clean_transactions_data(transactions_df):
    """Return a cleaned copy of the transactions table with time features.

    Steps, in order: parse ``transaction_date`` to datetime, derive ``hour``,
    ``day_of_week`` (day name), ``month`` and ``is_weekend`` (1 for
    Saturday/Sunday, else 0), drop every row that contains any missing value,
    keep the first row per ``transaction_id``, and keep only rows whose
    ``amount`` is strictly greater than zero.

    Args:
        transactions_df: DataFrame with at least ``transaction_id``,
            ``transaction_date`` and ``amount`` columns. It is not modified.

    Returns:
        A new DataFrame holding the cleaned rows plus the four derived columns.

    Raises:
        KeyError: if a required column is missing.
        Any exception raised by ``pd.to_datetime`` for unparseable dates
        propagates unchanged.
    """
    print("\nüßπ Cleaning transactions data...")

    # Work on a copy: writing the parsed date and the derived columns onto the
    # argument itself would silently alter the caller's raw frame.
    transactions_clean = transactions_df.copy()

    # Convert date column
    transactions_clean["transaction_date"] = pd.to_datetime(
        transactions_clean["transaction_date"]
    )
    timestamps = transactions_clean["transaction_date"]

    # Extract additional time features
    transactions_clean["hour"] = timestamps.dt.hour
    transactions_clean["day_of_week"] = timestamps.dt.day_name()
    transactions_clean["month"] = timestamps.dt.month
    # dayofweek is Monday=0 ... Sunday=6, so 5 and 6 are Saturday and Sunday
    transactions_clean["is_weekend"] = timestamps.dt.dayofweek.isin([5, 6]).astype(int)

    # Handle missing values (a row is dropped if ANY column is null)
    transactions_clean = transactions_clean.dropna()

    # Remove duplicates (first occurrence of each transaction_id is kept)
    transactions_clean = transactions_clean.drop_duplicates(subset=["transaction_id"])

    # Keep only strictly positive amounts: this drops zero-amount rows as well
    # as negative ones (data quality issue)
    transactions_clean = transactions_clean[transactions_clean["amount"] > 0]

    print(f"Transactions after cleaning: {transactions_clean.shape}")
    return transactions_clean


# Apply cleaning
users_clean = clean_users_data(users)
transactions_clean = clean_transactions_data(transactions)

# Merge datasets for analysis
print("\nüîó Merging datasets...")
# Left join: every cleaned transaction is kept; a transaction whose user is not
# in users_clean gets NaN in the user columns. validate= makes pandas raise if
# users_clean ever holds more than one row per user_id, which would otherwise
# silently duplicate transactions.
fraud_data = transactions_clean.merge(
    users_clean, on="user_id", how="left", validate="many_to_one"
)

# Calculate user behavior features
print("\nüéØ Creating user behavior features...")

# User transaction statistics.
# Named aggregation ties each output column to its source column and function,
# instead of relying on a positional rename of the flattened result.
user_stats = (
    transactions_clean.groupby("user_id")
    .agg(
        avg_amount=("amount", "mean"),
        std_amount=("amount", "std"),  # NaN for users with a single transaction
        max_amount=("amount", "max"),
        transaction_count=("amount", "count"),
        first_transaction=("transaction_date", "min"),
        last_transaction=("transaction_date", "max"),
    )
    .reset_index()
)

# Fixed "as of" date used to measure how long each user has been transacting.
# NOTE(review): first transactions on or after this date give zero or negative
# day counts, which the clip below turns into 1 -- confirm the date fits the data.
REFERENCE_DATE = pd.Timestamp("2024-01-01")

# Calculate days since first transaction
user_stats["days_since_first_transaction"] = (
    REFERENCE_DATE - user_stats["first_transaction"]
).dt.days
# Transactions per day; the divisor is clipped to at least 1 to avoid dividing
# by zero or by a negative number of days.
user_stats["transaction_frequency"] = user_stats["transaction_count"] / user_stats[
    "days_since_first_transaction"
].clip(lower=1)

# Merge user stats back (user_stats has exactly one row per user_id)
fraud_data = fraud_data.merge(
    user_stats, on="user_id", how="left", validate="many_to_one"
)

# Create fraud-specific features
fraud_data["amount_to_avg_ratio"] = fraud_data["amount"] / fraud_data["avg_amount"]
# High value = above the 95th percentile of amounts in the merged dataset
fraud_data["is_high_value"] = (
    fraud_data["amount"] > fraud_data["amount"].quantile(0.95)
).astype(int)
# between() is inclusive on both ends: hours 0 through 5
fraud_data["is_late_night"] = fraud_data["hour"].between(0, 5).astype(int)
# NOTE(review): if both merged inputs carry a "location" column, pandas
# suffixes them (location_x / location_y) and this falls through to the
# all-zero default -- confirm against the transactions schema.
if "location" in fraud_data.columns:
    fraud_data["is_international"] = (fraud_data["location"] == "International").astype(int)
else:
    fraud_data["is_international"] = 0

print(f"Final merged dataset: {fraud_data.shape}")

# Save cleaned data (the target directory must already exist)
print("\nüíæ Saving cleaned data...")
users_clean.to_csv("../../data/processed/fraud_users_clean.csv", index=False)
transactions_clean.to_csv("../../data/processed/fraud_transactions_clean.csv", index=False)
fraud_data.to_csv("../../data/processed/fraud_data_clean.csv", index=False)

print("‚úÖ Data cleaning completed!")
print(f"Final dataset shape: {fraud_data.shape}")
print(
    f"Fraud cases: {fraud_data['is_fraud'].sum()} ({fraud_data['is_fraud'].mean():.2%})"
)

# Display sample of cleaned data (only the columns that exist after merging)
print("\nüìã Sample of cleaned data:")
sample_cols = [col for col in ["user_id", "amount", "category", "location", "is_fraud"] if col in fraud_data.columns]
print(fraud_data[sample_cols].head(10))

üîß Fraud Detection - Data Cleaning
üìÅ Loading data...
Users data: (5000, 6)
Transactions data: (50000, 10)

üìä Data Overview:

Users Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           5000 non-null   object 
 1   signup_date       5000 non-null   object 
 2   location          5000 non-null   object 
 3   device_type       5000 non-null   object 
 4   account_age_days  5000 non-null   int64  
 5   trust_score       5000 non-null   float64
dtypes: float64(1), int64(1), object(4)
memory usage: 234.5+ KB
None

Transactions Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   transaction_id    50000 non-null  object 
 1   user_id           5