# Data Exploration

This notebook explores the dataset to understand its structure and content.

In [None]:
# data_exploration.ipynb

# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_loading import load_data

# Read the raw CSV through the project's loader
RAW_DATA_PATH = '../data/raw/dataset.csv'
df = load_data(RAW_DATA_PATH)

# Preview the leading rows (5 is head()'s default)
df.head(n=5)

In [None]:
# Get basic information about the dataset: df.info() prints a summary of the
# DataFrame (columns, non-null counts, dtypes, memory usage) to stdout
df.info()

In [None]:
# Summary statistics via describe() with default arguments
numeric_summary = df.describe()

# Leave the table as the cell's last expression so the notebook renders it
numeric_summary

In [None]:
# Count null entries in every column (isna is the same operation as isnull)
missing_values = df.isna().sum()

# Emit the heading and the per-column counts on separate lines
print("Missing values per column:", missing_values, sep="\n")

In [None]:
# Visualize the distribution of numerical features.
#
# Select columns by the generic 'number' kind instead of listing
# 'int64'/'float64' explicitly: the explicit list silently drops int32,
# float32, unsigned and other numeric columns (e.g. after downcasting, or
# on platforms whose default integer is 32-bit). Timedelta columns are
# excluded explicitly because NumPy classifies them under np.number, and
# the original selection never included them either.
numerical_cols = df.select_dtypes(
    include=['number'],
    exclude=['timedelta'],
).columns

# One histogram (with a KDE overlay) per numeric column, each in its own figure
for col in numerical_cols:
    plt.figure(figsize=(8, 4))
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.show()

In [None]:
# Pairwise correlations of the numeric columns selected earlier
corr_matrix = df[numerical_cols].corr()

# Draw the annotated heatmap on an explicitly created 12x10 Axes
_, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Grid of pairwise plots over the numeric columns selected earlier
numeric_frame = df[numerical_cols]
pair_grid = sns.pairplot(numeric_frame)
plt.show()

In [None]:
# Analyze categorical features.
#
# Besides plain 'object' (string) columns, also pick up columns stored with
# pandas' dedicated 'category' dtype; selecting only 'object' would skip
# them even though they are categorical by definition.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# One horizontal count plot per categorical column (levels on the y-axis)
for col in categorical_cols:
    plt.figure(figsize=(8, 4))
    sns.countplot(y=col, data=df)
    plt.title(f'Count of {col}')
    plt.show()

In [None]:
# Save a copy of the explored DataFrame (optional).
# NOTE: this writes `df` as it stands at this point in the notebook, not the
# plots or summary tables produced in the cells above.
from pathlib import Path

output_path = Path('../data/processed/explored_data.csv')

# to_csv does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)