# Asian Market Quant Project

## Part 2: Data Engineering Pipeline

**Author:** Wong Wai Hin  
**Date:** July 20, 2025

This notebook demonstrates the data engineering pipeline for the Asian Market Quant project, focusing on:

1. Data ingestion and cleaning
2. Currency normalization to USD
3. Corporate actions and futures roll handling
4. Calculation of returns
5. Data dictionary generation


In [1]:
# Import libraries and modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Add project root to path for imports
sys.path.append('..')

# Import our data engineering module
from src.data_engineering import (
    load_raw_data,
    clean_and_standardize,
    normalize_to_usd,
    adjust_for_corporate_actions,
    handle_futures_rolls,
    calculate_returns,
    create_data_dictionary,
    run_data_pipeline
)

# Import asset class mapping
from src.asset_class_mapping import ASSET_MAPPING, create_ticker_to_asset_class_map

# Set plotting style.
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only know the original 'seaborn' name and raise OSError for
# the new one, so fall back instead of failing the whole setup cell.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette("colorblind")

print("Libraries and modules imported successfully")

Libraries and modules imported successfully


## 1. Find and Load Raw Data

First, we need to locate our raw data file and load it using our data engineering module.


In [2]:
# Look for Excel files in data/raw directory
raw_dir = '../data/raw'

# A missing directory is treated like an empty one, so the cell falls through
# to the manual prompt below instead of aborting with FileNotFoundError.
try:
    raw_dir_entries = os.listdir(raw_dir)
except FileNotFoundError:
    raw_dir_entries = []

# Skip Excel's '~$<name>.xlsx' owner/lock files: they appear while a workbook
# is open in Excel, match the extension filter, and are not readable workbooks.
excel_files = [
    f for f in raw_dir_entries
    if f.endswith(('.xlsx', '.xls')) and not f.startswith('~$')
]

print(f"Found {len(excel_files)} Excel files in {raw_dir}:")
for i, file in enumerate(excel_files):
    print(f"{i+1}. {file}")

# Choose the first Excel file if available.
# NOTE: os.listdir() returns entries in arbitrary order, so which workbook is
# "first" is not guaranteed to be stable when several are present; check the
# "Using ..." line in the output before running the rest of the notebook.
if excel_files:
    input_file = os.path.join(raw_dir, excel_files[0])
    print(f"\nUsing {input_file} as input")
else:
    input_file = input("Enter path to Excel file: ")

Found 2 Excel files in ../data/raw:
1. original_data.xlsx
2. MSCI_Comps.xlsx

Using ../data/raw/original_data.xlsx as input


In [None]:
# Load raw data
raw_data = load_raw_data(input_file)

# Fail here with an actionable message rather than with an opaque
# AttributeError / IndexError on the summary lines below.
# NOTE(review): the cleaning cell checks `raw_data is None`, which suggests
# load_raw_data may return None on failure - confirm against
# src.data_engineering.
if raw_data is None or len(raw_data.index) == 0:
    raise ValueError(f"No data could be loaded from {input_file!r}")

# Display basic information
print("\nRaw Data Information:")
print(f"Shape: {raw_data.shape}")
print(f"Date Range: {raw_data.index[0]} to {raw_data.index[-1]}")
print(f"Number of Assets: {raw_data.shape[1]}")

# Show first few rows (kept as the last expression so the notebook renders it)
print("\nSample Data:")
raw_data.head()

## 2. Data Cleaning and Standardization

Next, we clean and standardize the data by:

- Ensuring dates are in datetime format
- Resampling to business day frequency
- Forward-filling missing values (holidays)


In [1]:
# This cell depends on the frame produced by the data loading cell.
if 'raw_data' in locals() and raw_data is not None:
    # Run the cleaning / standardization step of the pipeline
    cleaned_data = clean_and_standardize(raw_data)

    if cleaned_data is None:
        print("Error: Data cleaning failed. Please check the input data format.")
    else:
        # Per-column NaN counts; their sum is the overall number of gaps
        nan_counts = cleaned_data.isna().sum()
        missing_values = nan_counts.sum()
        total_values = cleaned_data.size
        completeness = (1 - missing_values / total_values) * 100

        # Overall data quality summary
        print("Data Quality Statistics:")
        print(f"Total Values: {total_values:,}")
        print(f"Missing Values: {missing_values:,}")
        print(f"Data Completeness: {completeness:.2f}%")

        # Break the remaining gaps down by column, if there are any
        columns_with_nans = nan_counts[nan_counts > 0]

        if columns_with_nans.empty:
            print("\nNo missing values found after cleaning!")
        else:
            print("\nColumns with missing values:")
            n_rows = len(cleaned_data)
            for col, count in columns_with_nans.items():
                percent = (count / n_rows) * 100
                print(f"- {col}: {count} missing values ({percent:.2f}%)")
else:
    print("Error: Raw data not found. Please run the data loading cell above first.")

Error: Raw data not found. Please run the data loading cell above first.


## 3. Currency Normalization

Now we normalize all asset prices to USD, which is crucial for cross-asset comparison and risk budgeting.


In [4]:
# Normalize to USD
usd_data = normalize_to_usd(cleaned_data)

# Resolve the configured FX tickers once instead of repeating the nested
# ASSET_MAPPING lookup for every column of the frame.
configured_fx_tickers = ASSET_MAPPING.get('fx_crosses', {}).get('tickers', [])
fx_tickers = [ticker for ticker in cleaned_data.columns
              if ticker in configured_fx_tickers]

print(f"FX tickers used for normalization: {fx_tickers}")

# For demonstration, plot one asset before and after normalization.
# A Japanese asset is used as the example, since it needs JPY->USD conversion.
example_asset = 'NKY Index'
example_fx = 'USDJPY Curncy'

# The example needs the asset and its FX cross in the input frame, and the
# asset in the normalized frame as well (otherwise the second panel would
# raise KeyError after the figure has already been created).
can_plot_example = (
    example_asset in cleaned_data.columns
    and example_fx in cleaned_data.columns
    and example_asset in usd_data.columns
)

if can_plot_example:
    plt.figure(figsize=(12, 6))

    # Left panel: original value
    plt.subplot(1, 2, 1)
    cleaned_data[example_asset].plot()
    plt.title('Nikkei 225 (Original JPY)', fontweight='bold')
    plt.ylabel('Price (JPY)')
    plt.grid(True, alpha=0.3)

    # Right panel: USD-normalized value
    plt.subplot(1, 2, 2)
    usd_data[example_asset].plot()
    plt.title('Nikkei 225 (USD Normalized)', fontweight='bold')
    plt.ylabel('Price (USD Equivalent)')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("USD Normalization applied successfully")
else:
    print("Could not find suitable example for normalization visualization")

NameError: name 'cleaned_data' is not defined

## 4. Corporate Actions and Futures Roll Handling

Now we adjust for corporate actions (e.g., dividends) and handle futures contract rolls.


In [None]:
# Adjust for corporate actions
adjusted_data = adjust_for_corporate_actions(usd_data)

# Handle futures rolls
processed_data = handle_futures_rolls(adjusted_data)

# For demonstration, compare a futures contract before and after roll handling.
# The "before" series comes from adjusted_data, the direct input of
# handle_futures_rolls, so the two panels differ only by the roll step.
# Plotting usd_data here would also fold the corporate-action adjustment into
# the comparison.
futures_contracts = ['CO1 Comdty', 'S 1 Comdty']

# Use the first candidate that is present on both sides of the comparison.
contract = next(
    (candidate for candidate in futures_contracts
     if candidate in adjusted_data.columns
     and candidate in processed_data.columns),
    None,
)

if contract is not None:
    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    adjusted_data[contract].plot()
    plt.title(f'{contract} (Before Roll Handling)', fontweight='bold')
    plt.ylabel('Price (USD)')
    plt.grid(True, alpha=0.3)

    plt.subplot(1, 2, 2)
    processed_data[contract].plot()
    plt.title(f'{contract} (After Roll Handling)', fontweight='bold')
    plt.ylabel('Price (USD)')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 5. Returns Calculation

Calculate both daily and monthly returns from our processed price data.


In [None]:
# Return series at both frequencies, derived from the processed prices
daily_returns = calculate_returns(processed_data, 'D')
monthly_returns = calculate_returns(processed_data, 'M')

# Summary table: the ten assets with the highest daily-return standard deviation
print("Daily Returns Statistics:")
summary_columns = ['mean', 'std', 'min', 'max']
daily_summary = daily_returns.describe().T[summary_columns]
most_volatile = daily_summary.sort_values('std', ascending=False).head(10)
print(most_volatile)

# Histogram and time series of daily returns for one sample asset
sample_asset = 'SPX Index'
if sample_asset in daily_returns.columns:
    plt.figure(figsize=(12, 5))

    # Left panel: distribution
    plt.subplot(1, 2, 1)
    daily_returns[sample_asset].hist(bins=50)
    plt.title('S&P 500 Daily Returns Distribution', fontweight='bold')
    plt.xlabel('Daily Return')
    plt.grid(True, alpha=0.3)

    # Right panel: returns through time
    plt.subplot(1, 2, 2)
    daily_returns[sample_asset].plot()
    plt.title('S&P 500 Daily Returns Time Series', fontweight='bold')
    plt.ylabel('Daily Return')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 6. Data Dictionary Creation

Create a comprehensive data dictionary with metadata about each asset.


In [None]:
# Create the data dictionary from the processed price data.
# The second argument is passed as None so nothing is saved at this point
# (presumably it is the output path - confirm against
# src.data_engineering.create_data_dictionary).
data_dict = create_data_dictionary(processed_data, None)  # Don't save yet, just create

# Display data dictionary (`display` is provided by the IPython/Jupyter session)
display(data_dict)

## 7. Run Complete Pipeline and Save Results

Finally, run the complete pipeline and save all outputs.


In [None]:
# Directory handed to the pipeline as its output location
output_dir = '../data/processed'

# Execute the whole pipeline in one call
results = run_data_pipeline(input_file, output_dir)

# Walk the output directory and report every file with its size
print("\nFiles saved to disk:")
for root, _subdirs, files in os.walk(output_dir):
    for full_path in (os.path.join(root, name) for name in files):
        size_kb = os.path.getsize(full_path) / 1024
        print(f"  - {os.path.relpath(full_path, '..')}: {size_kb:.1f} KB")

## 8. Sample Analysis with Processed Data

Let's perform a quick analysis with our processed data to verify everything is working correctly.


In [None]:
# Load saved processed data.
# NOTE: unpickling can execute arbitrary code; only load pickle files written
# by this project's own pipeline run, never files from an untrusted source.
daily_prices = pd.read_pickle(os.path.join(output_dir, 'daily_prices.pkl'))
daily_returns = pd.read_pickle(os.path.join(output_dir, 'daily_returns.pkl'))

# Calculate correlation matrix for daily returns
correlation = daily_returns.corr()

# Visualize correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(correlation, annot=False, cmap='coolwarm', center=0, linewidths=.5)
plt.title('Correlation Matrix of Daily Returns', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Group the return columns by asset class (display name -> list of tickers).
# Tickers missing from the map end up under 'Unknown'.
ticker_map = create_ticker_to_asset_class_map()
asset_class_returns = {}

for col in daily_returns.columns:
    asset_info = ticker_map.get(col, {})
    asset_class = asset_info.get('asset_class', 'unknown').replace('_', ' ').title()
    asset_class_returns.setdefault(asset_class, []).append(col)

# Plot average returns by asset class
plt.figure(figsize=(12, 6))
asset_class_means = []
asset_class_stds = []
asset_class_names = []

for asset_class, tickers in asset_class_returns.items():
    if asset_class == 'Unknown' or not tickers:
        continue  # Skip unknown and empty classes

    # Equal-weighted average of the class members' daily returns, then
    # annualized with a 252-day year (mean scales by 252, std by sqrt(252)).
    class_returns = daily_returns[tickers].mean(axis=1)
    mean_return = class_returns.mean() * 252
    std_return = class_returns.std() * np.sqrt(252)

    asset_class_names.append(asset_class)
    asset_class_means.append(mean_return)
    asset_class_stds.append(std_return)

# Create a bar chart with error bars
plt.bar(asset_class_names, asset_class_means, yerr=asset_class_stds,
        capsize=10, color='skyblue', alpha=0.7)
plt.title('Annualized Returns by Asset Class', fontsize=16, fontweight='bold')
plt.ylabel('Annualized Return')
plt.grid(True, alpha=0.3, axis='y')
plt.xticks(rotation=45)

# Value labels: above the bar for non-negative returns (unchanged placement),
# below the bar for negative ones. A fixed upward offset would put the label
# inside a negative bar.
for i, v in enumerate(asset_class_means):
    if v >= 0:
        label_y, label_va = v + 0.01, 'baseline'
    else:
        label_y, label_va = v - 0.01, 'top'
    plt.text(i, label_y, f"{v:.2%}", ha='center', va=label_va, fontweight='bold')

plt.tight_layout()
plt.show()

## Next Steps

Now that our data engineering pipeline is complete, we can proceed to:

1. Perform exploratory analysis to identify correlations and regime changes
2. Design signal prototypes for each asset class
3. Apply hierarchical risk parity within our risk budget framework
