# Fraud Detection System - Data Preprocessing

This notebook handles the preprocessing steps for the transaction data, including:
1. Handling missing values
2. Encoding categorical variables
3. Scaling numerical features
4. Handling class imbalance
5. Feature engineering
6. Train-test split

In [None]:
# Import required libraries
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Notebook display preferences: never truncate columns, cap printed rows at 100
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# Read the raw transactions into the working DataFrame used by every later cell
transactions_csv = '../data/sample_transactions.csv'
df = pd.read_csv(transactions_csv)

# Structural overview: dimensions first, then the per-column summary
dataset_shape = df.shape
print("Dataset Shape:", dataset_shape)
print("\nDataset Info:")
df.info()

## 1. Handle Missing Values

This section only reports and visualizes where values are missing; it does not modify `df`.

In [None]:
# Count nulls per column and report only the columns that actually contain some
missing_values = df.isnull().sum()
has_missing = missing_values > 0
print("Missing Values per Column:")
print(missing_values[has_missing])

# Heatmap of the null mask (rows x columns); row labels are hidden because
# there is one per transaction
null_mask = df.isnull()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(null_mask, yticklabels=False, cbar=False, cmap='viridis', ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()

## 2. Feature Engineering

In [None]:
# Parse the timestamp column once so the .dt accessor can be reused below
timestamps = pd.to_datetime(df['timestamp'])
df['timestamp'] = timestamps

# Calendar/time-of-day features derived from the parsed timestamp
df['hour'] = timestamps.dt.hour
df['day_of_week'] = timestamps.dt.dayofweek
df['month'] = timestamps.dt.month

# Location features: split the address on commas once, then take the second
# token as the city and the last token as the country.
# NOTE(review): assumes addresses look like "street, city, ..., country" — confirm
# against the data; addresses with fewer than two tokens yield NaN for the city.
address_parts = df['shipping_address'].str.split(',')
df['city'] = address_parts.str[1].str.strip()
df['country'] = address_parts.str[-1].str.strip()

# Per-user transaction count, only when the dataset carries a user identifier.
# NOTE(review): the count is taken over the full dataset, i.e. before any
# train/test split — confirm that is acceptable for evaluation.
if 'user_id' in df:
    user_transaction_counts = df['user_id'].value_counts()
    df['user_transaction_frequency'] = df['user_id'].map(user_transaction_counts)

# Preview the engineered columns
new_feature_cols = ['hour', 'day_of_week', 'month', 'city', 'country']
print("New Features Created:")
print(df[new_feature_cols].head())

## 3. Prepare Features for Modeling

In [None]:
# Identify categorical and numerical feature columns.
#
# The target is dropped up front so it can never end up in either list,
# whatever dtype it was read as (errors='ignore' keeps this a no-op when the
# column is absent).
feature_frame = df.drop(columns=['is_fraud'], errors='ignore')

categorical_cols = feature_frame.select_dtypes(include=['object']).columns.tolist()

# 'number' selects every numeric width. Listing only int64/float64 would
# silently lose hour/day_of_week/month, because the .dt accessors return int32
# on pandas >= 2.0, and any column missing from these lists is dropped by the
# ColumnTransformer built from them.
numerical_cols = feature_frame.select_dtypes(include='number').columns.tolist()

# NOTE(review): every object column is treated as categorical, including the raw
# 'shipping_address' and any identifier-like columns — confirm that one-hot
# encoding those high-cardinality columns is intended.

print("Categorical columns:", categorical_cols)
print("\nNumerical columns:", numerical_cols)

In [None]:
# Create preprocessing pipeline.
#
# Missing values are imputed inside the pipeline so the fill statistics are
# learned from whatever data the preprocessor is fitted on, and so that no NaN
# reaches estimators/resamplers that reject them.

# Numeric features: fill gaps with the column median, then standardize
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: treat "missing" as its own category, then one-hot
# encode; categories unseen at fit time are encoded as all zeros
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Columns not listed in either group (e.g. the datetime 'timestamp') are
# dropped by ColumnTransformer's default remainder='drop'
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols)
    ])

# Prepare features and target
X = df.drop('is_fraud', axis=1)
y = df['is_fraud']

# Stratified 80/20 split keeps the fraud rate the same in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Sanity check: the split must partition the rows without loss
assert len(X_train) + len(X_test) == len(X)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

## 4. Handle Class Imbalance

In [None]:
# Check class distribution
print("Class distribution in training set:")
print(y_train.value_counts(normalize=True))

# SMOTE synthesizes minority samples by interpolating between neighbours in
# feature space, so it needs a purely numeric matrix; the raw X_train still
# holds strings and datetimes and would make fit_resample raise.
# Fit the preprocessor on the training split only (no information from the
# test split) and resample the transformed features. This also leaves
# `preprocessor` fitted, so X_test must later be passed through
# preprocessor.transform(...) before being compared with the resampled data.
X_train_processed = preprocessor.fit_transform(X_train)

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

# Sanity check: features and labels must stay aligned after resampling
assert X_train_resampled.shape[0] == len(y_train_resampled)

print("\nClass distribution after SMOTE:")
print(pd.Series(y_train_resampled).value_counts(normalize=True))

## 5. Save Preprocessed Data

In [None]:
# Save preprocessed data
preprocessor_path = Path('../models/preprocessor.joblib')
preprocessed_data_path = Path('../data/preprocessed_data.joblib')

# joblib.dump does not create missing directories, so make sure both output
# folders exist (a fresh checkout may not have them yet)
for output_path in (preprocessor_path, preprocessed_data_path):
    output_path.parent.mkdir(parents=True, exist_ok=True)

# Save the preprocessor
# NOTE(review): confirm the preprocessor has been fitted on the training split
# before this point; an unfitted transformer cannot transform data after loading.
joblib.dump(preprocessor, preprocessor_path)

# Save the preprocessed data: the original split plus the resampled training set
preprocessed_data = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'X_train_resampled': X_train_resampled,
    'y_train_resampled': y_train_resampled
}

joblib.dump(preprocessed_data, preprocessed_data_path)

print("Preprocessed data saved successfully!")