# Data Exploration

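This notebook performs an initial exploration of the processed dataset: it previews the first rows, computes summary statistics, checks for missing values, plots the distribution of each numerical feature, and draws a correlation heatmap.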

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style. `sns.set` is only a legacy alias; `set_theme` is
# the interface seaborn documents as preferred.
sns.set_theme(style='whitegrid')

# Load the dataset
data = pd.read_csv('../data/processed/your_processed_data.csv')  # Update with your processed data path

# A notebook cell only renders the value of its *last* expression, so each
# intermediate result below is passed to `display` (made available by the
# IPython kernel) to guarantee it is actually shown.

# Display the first few rows of the dataset
display(data.head())

# Summary statistics
display(data.describe())

# Check for missing values: per-column null counts, keeping only the columns
# that contain at least one missing entry.
missing_values = data.isnull().sum()
display(missing_values[missing_values > 0])

# Visualize distributions of numerical features
# Select every numeric dtype rather than a hard-coded pair of names, so
# columns stored as int32, float32, unsigned ints, etc. are not silently
# skipped.
numerical_features = data.select_dtypes(include='number').columns
for feature in numerical_features:
    # One standalone figure per feature, drawn on an explicit Axes.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data[feature], bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    plt.show()
    # Release the figure once shown so a wide frame does not pile up open
    # figures in memory.
    plt.close(fig)

# Correlation heatmap
# Correlation is only defined for numeric data. Since pandas 2.0 a bare
# `DataFrame.corr()` raises on string/object columns instead of silently
# dropping them, so restrict the frame to numeric and boolean columns first.
correlation_matrix = data.select_dtypes(include=['number', 'bool']).corr()

if correlation_matrix.empty:
    # No numeric columns: skip plotting rather than handing seaborn an empty
    # matrix.
    print('No numeric columns available for a correlation heatmap.')
else:
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True, ax=ax)
    ax.set_title('Correlation Heatmap')
    plt.show()
    plt.close(fig)