# Data Understanding and EDA

This notebook performs initial exploratory data analysis on the financial dataset with multiple stock indices and economic indicators.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add src directory to path to import custom modules
sys.path.append(os.path.join('..', 'src'))

from data_loading import load_financial_dataset
from data_cleaning import clean_financial_data
from visualization import plot_stock_comparison, plot_time_series, plot_correlation_heatmap

# Load the financial dataset and print a first overview of its contents
df = load_financial_dataset()
print('Data shape:', df.shape)
print('\nFirst few rows:')
print(df.head())

print('\nData info:')
# DataFrame.info() writes its summary directly to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None" line).
df.info()

print('\nBasic statistics:')
print(df.describe())

print('\nStock indices in dataset:', df['Stock Index'].unique())
# NOTE(review): if 'Date' is stored as strings, min/max compare
# lexicographically rather than chronologically — confirm the dtype.
print('\nDate range:', df['Date'].min(), 'to', df['Date'].max())

In [None]:
# Count missing values per column in the raw data
print('Missing values per column:')
print(df.isnull().sum())

# Run the project's cleaning routine on the raw frame
df_clean = clean_financial_data(df)
print('\nShape after cleaning:', df_clean.shape)

# Re-count missing values on the cleaned frame. Every column is reported
# (no head() truncation) so the result is directly comparable with the
# pre-cleaning counts and no column with leftover NaNs is hidden.
print('\nMissing values after cleaning:')
print(df_clean.isnull().sum())

In [None]:
# Report how many rows each stock index contributes to the cleaned data
print('Stock indices and their data point counts:')
rows_per_index = df_clean['Stock Index'].value_counts()
print(rows_per_index)

# Compare stock prices on the first 1000 rows only, to keep plotting fast.
# NOTE(review): head(1000) takes rows in frame order; if rows are grouped by
# index, the subset may not cover every index — confirm.
plot_subset = df_clean.head(1000)
plot_stock_comparison(plot_subset)

In [None]:
# Visualize economic indicators over time, restricted to the columns that
# are actually present in the cleaned frame
economic_cols = ['GDP Growth (%)', 'Inflation Rate (%)', 'Unemployment Rate (%)', 'Interest Rate (%)']
available_cols = [col for col in economic_cols if col in df_clean.columns]

if available_cols:
    plot_time_series(df_clean, available_cols)

# Plot correlation heatmap for key financial metrics, again limited to the
# columns that exist in the cleaned frame
key_cols = ['Open Price', 'Close Price', 'Daily High', 'Daily Low', 'Trading Volume', 'GDP Growth (%)', 'Inflation Rate (%)']
available_key_cols = [col for col in key_cols if col in df_clean.columns]

# Guard the heatmap the same way as the time-series plot: with no matching
# columns there is nothing to correlate, so skip the call instead of handing
# the helper an empty column list.
if available_key_cols:
    plot_correlation_heatmap(df_clean, cols=available_key_cols)