# Exploratory Data Analysis (EDA) for Processed Financial Data

This notebook explores the processed financial data stored in `data/processed/processed_data.csv` (loaded below via the relative path `../data/processed/processed_data.csv`). We will:
- Load and inspect the dataset
- Display summary statistics and data information
- Visualize the distribution of company values
- Analyze records over time (if dates are available)
- Display a correlation matrix (if applicable)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Enable inline plotting
%matplotlib inline

# Load the processed data
df = pd.read_csv('../data/processed/processed_data.csv')

# Display the first few rows
print('First few rows of the dataset:')
display(df.head())

# Display summary statistics
print('Summary statistics:')
display(df.describe())

# Display data information
print('Dataset info:')
df.info()

In [None]:
# Histogram (20 bins) of the 'value' column with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['value'], bins=20, kde=True, ax=ax)
ax.set_xlabel('Company Value')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Company Values')
plt.show()

In [None]:
# Only runs when the dataset has a 'date' column: parse it as datetime and
# chart how many records fall on each date
if 'date' in df.columns:
    # errors='coerce' turns unparseable entries into NaT instead of raising
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    print(f"Unique dates in dataset: {df['date'].nunique()}")

    # Count rows per date, order the counts by date, and draw them as a line
    fig, ax = plt.subplots(figsize=(12, 6))
    df['date'].value_counts().sort_index().plot(kind='line', ax=ax)
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of Records')
    ax.set_title('Records per Date')
    plt.show()

In [None]:
# Display a correlation matrix for the numeric columns, if at least two exist.
#
# Only numeric columns are correlated: on pandas >= 2.0, `df.corr()` no longer
# drops non-numeric columns by default and raises when the frame contains
# strings or dates.
numeric_df = df.select_dtypes(include='number')

if numeric_df.shape[1] >= 2:
    plt.figure(figsize=(8, 6))
    sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.show()
else:
    # A correlation matrix needs at least two numeric columns to be meaningful
    print('Fewer than two numeric columns found; skipping correlation matrix.')