# Fraud Detection System - Data Exploration

This notebook explores the transaction data to understand patterns and characteristics of fraudulent transactions.

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: apply Seaborn's default theme first, then switch the
# active colour cycle to the 'husl' palette on top of it.
sns.set_theme()
sns.set_palette('husl')

# DataFrame rendering: no cap on displayed columns, at most 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

OSError: 'seaborn' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)

In [None]:
# Read the sample transactions into `df`, the DataFrame every later cell works on.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv('../data/sample_transactions.csv')

# Structural overview: the (rows, columns) tuple first, then the per-column
# summary that DataFrame.info() writes straight to stdout.
dataset_shape = df.shape
print("Dataset Shape:", dataset_shape)
print("\nDataset Info:")
df.info()

In [None]:
# Preview the top of the table (head's default of five rows, spelled out)
df.head(n=5)

In [None]:
# Count missing (null) entries per column and report only the columns
# that actually contain gaps.
missing_values = df.isnull().sum()
columns_with_gaps = missing_values[missing_values > 0]

print("Missing Values per Column:")
if columns_with_gaps.empty:
    # Without this branch a complete dataset printed a bare
    # "Series([], dtype: int64)" under the header, which reads like a
    # failure rather than a clean result.
    print("No missing values found.")
else:
    print(columns_with_gaps)

In [None]:
# Descriptive statistics via DataFrame.describe() with its default settings;
# the result is the cell's last expression so it renders as a rich table.
numeric_summary = df.describe()
numeric_summary

In [None]:
# Class balance: relative frequency of each `is_fraud` value, as a percentage
fraud_distribution = df['is_fraud'].value_counts(normalize=True).mul(100)
print("Fraud Distribution (%):")
print(fraud_distribution)

# The same split as absolute counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='is_fraud', ax=ax)
ax.set(title='Distribution of Fraud Cases', xlabel='Is Fraud', ylabel='Count')
plt.show()

In [None]:
# Transaction counts per payment method, with one bar per `is_fraud` value
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='payment_method', hue='is_fraud', ax=ax)
ax.set(title='Payment Methods vs Fraud', xlabel='Payment Method', ylabel='Count')
# Tilt the category labels so longer method names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Compare the distribution of `amount` between the `is_fraud` groups
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='is_fraud', y='amount', ax=ax)
ax.set(title='Transaction Amounts vs Fraud', xlabel='Is Fraud', ylabel='Amount')
plt.show()

In [None]:
# Transaction counts per `device_id` value, split by `is_fraud`
# NOTE(review): the title and axis label say "Device Type" but the plotted
# column is `device_id` — confirm which is intended. If this column holds
# per-device identifiers the chart gets one bar group per unique ID.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='device_id', hue='is_fraud', ax=ax)
ax.set(title='Device Types vs Fraud', xlabel='Device Type', ylabel='Count')
# Tilt the category labels to reduce overlap
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Derive a `city` column from the second comma-separated field of the
# shipping address, trimmed of surrounding whitespace.
# NOTE(review): assumes addresses look like "<street>, <city>, ..." — confirm
# against the data; rows without a second field end up with a missing city.
address_parts = df['shipping_address'].str.split(',')
df['city'] = address_parts.str.get(1).str.strip()

# Transaction counts per city, split by `is_fraud`
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='city', hue='is_fraud', ax=ax)
ax.set(title='Cities vs Fraud', xlabel='City', ylabel='Count')
# Tilt the category labels to reduce overlap
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Time-based analysis: transaction counts by hour of day, split by `is_fraud`
# Parse the timestamps and keep the derived hour on `df` for reuse.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour

# Plot from a narrow frame limited to rows that have an hour. A missing
# timestamp turns `hour` into a float column (NaN), so cast back to int to
# keep the categories aligned with the integer `order` passed below.
hourly = df.loc[df['hour'].notna(), ['hour', 'is_fraud']].astype({'hour': int})

fig, ax = plt.subplots(figsize=(12, 6))
# countplot treats x as categorical and draws only the values present in the
# data. Fixing the order to 0-23 keeps hours with no transactions on the axis
# as empty slots, so the x axis is a true time-of-day scale and gaps are visible.
sns.countplot(data=hourly, x='hour', hue='is_fraud', order=list(range(24)), ax=ax)
ax.set(title='Hour of Day vs Fraud', xlabel='Hour of Day', ylabel='Count')
plt.show()