In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Exploratory look at the UPI transactions dataset: print a textual overview,
# then draw a 2x3 grid of fraud-related plots and save it as a PNG report.

# Input CSV and output image, both relative to the current working directory.
DATA_PATH = '../data/upi_transactions.csv'
REPORT_PATH = Path('../reports/data_exploration.png')

# Load data
df = pd.read_csv(DATA_PATH)

# Basic exploration
print("Dataset Overview:")
print(df.head())
print(f"\nShape: {df.shape}")
print(f"\nData Types:\n{df.dtypes}")
print(f"\nMissing Values:\n{df.isnull().sum()}")

# Fraud distribution
# NOTE(review): the mean is only a fraud *rate* if `is_fraud` is 0/1 or
# boolean -- confirm against the dataset schema.
fraud_rate = df['is_fraud'].mean()
print(f"\nFraud Rate: {fraud_rate:.2%}")

# Visualizations
# Use explicit Axes objects instead of pyplot's implicit "current axes" so
# each panel's target is unambiguous and the panels can be reordered safely.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
(ax_pie, ax_amount, ax_category), (ax_location, ax_hour, ax_box) = axes

# Fraud distribution
df['is_fraud'].value_counts().plot(kind='pie', autopct='%1.1f%%', ax=ax_pie)
ax_pie.set_title('Fraud Distribution')
ax_pie.set_ylabel('')  # drop the default y-label pandas puts on pie charts

# Amount distribution
sns.histplot(data=df, x='amount', hue='is_fraud', bins=50, alpha=0.7,
             ax=ax_amount)
ax_amount.set_title('Transaction Amount Distribution')
ax_amount.set_xlabel('Amount (INR)')

# Merchant category distribution (categories ordered by descending fraud rate)
category_fraud = (
    df.groupby('merchant_category')['is_fraud']
    .mean()
    .sort_values(ascending=False)
)
category_fraud.plot(kind='bar', ax=ax_category)
ax_category.set_title('Fraud Rate by Merchant Category')
plt.setp(ax_category.get_xticklabels(), rotation=45, ha='right')
ax_category.set_ylabel('Fraud Rate')

# Location distribution (locations ordered by descending fraud rate)
location_fraud = (
    df.groupby('location')['is_fraud']
    .mean()
    .sort_values(ascending=False)
)
location_fraud.plot(kind='bar', ax=ax_location)
ax_location.set_title('Fraud Rate by Location')
plt.setp(ax_location.get_xticklabels(), rotation=45, ha='right')
ax_location.set_ylabel('Fraud Rate')

# Hourly pattern
# Parses `timestamp` in place and adds an `hour` column to df; both changes
# persist on df after this block runs.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour
hourly_fraud = df.groupby('hour')['is_fraud'].mean()
hourly_fraud.plot(kind='line', marker='o', ax=ax_hour)
ax_hour.set_title('Fraud Rate by Hour')
ax_hour.set_xlabel('Hour of Day')
ax_hour.set_ylabel('Fraud Rate')
ax_hour.grid(True)

# Amount vs Fraud
sns.boxplot(data=df, x='is_fraud', y='amount', ax=ax_box)
ax_box.set_title('Amount Distribution by Fraud Status')
# NOTE(review): labels assume the two boxes are drawn at positions 0 and 1 in
# the order legitimate, fraud (i.e. `is_fraud` sorts as 0/False then 1/True).
ax_box.set_xticks([0, 1])
ax_box.set_xticklabels(['Legitimate', 'Fraud'])

fig.tight_layout()
# savefig does not create missing directories; make sure the report folder
# exists so the script cannot fail at the very last step.
REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)
# Save before show(): showing can release the figure on some backends.
fig.savefig(REPORT_PATH, dpi=300, bbox_inches='tight')
plt.show()

print("Exploration complete. Plots saved to reports/data_exploration.png")