# 🧭 Explore Raw Data

This notebook provides a general overview of the raw metadata dataset using basic pandas functions and visualizations built with Matplotlib and Seaborn.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the seaborn "whitegrid" theme to every figure drawn below.
# set_theme() is the preferred interface; sns.set() is only a legacy alias for it.
sns.set_theme(style="whitegrid")


In [None]:
# Location of the raw metadata table, relative to this notebook
RAW_METADATA_PATH = '../data/raw/metadata.csv'

# Read the CSV into the DataFrame used by every cell below
df = pd.read_csv(RAW_METADATA_PATH)

# Print the column names, dtypes, non-null counts and memory footprint
df.info()

# Preview the leading rows (rendered as this cell's output)
df.head(n=5)


In [None]:
# Descriptive statistics per column; with describe()'s defaults a frame of
# mixed dtypes is summarised over its numeric columns only
summary_stats = df.describe()
summary_stats


In [None]:
# Number of missing (NaN / None) entries in each column
df.isna().sum()


## 📊 Variable Distributions

In [None]:
# Plot histograms for numerical columns
numeric_cols = df.select_dtypes(include='number').columns

if len(numeric_cols) > 0:
    # Ceiling division: just enough rows of 3 panels to hold every column.
    # (`len // 3 + 1` added a whole blank row whenever the count was a
    # multiple of 3, shrinking the real panels inside the fixed-size figure.)
    n_hist_rows = -(-len(numeric_cols) // 3)

    df[numeric_cols].hist(bins=15, figsize=(15, 10), layout=(n_hist_rows, 3))
    plt.tight_layout()
    plt.show()
else:
    # DataFrame.hist raises ValueError when there is nothing numeric to plot,
    # so report it instead of breaking a Run-All on a non-numeric dataset.
    print("No numeric columns to plot.")


In [None]:
# Draw one horizontal boxplot per numeric column to expose spread and outliers
for col in numeric_cols:
    # Explicit figure/axes pair instead of relying on the implicit current axes
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.show()


In [None]:
# Plot distribution of categorical variables (if any)
categorical_cols = df.select_dtypes(include='object').columns

# Upper bound on bars per chart. Free-text / identifier-like columns can hold
# thousands of distinct values, which makes the bar chart unreadable and slow.
MAX_CATEGORIES = 20

for col in categorical_cols:
    # value_counts() sorts by descending frequency and leaves out missing values
    counts = df[col].value_counts()
    if counts.empty:
        # Nothing to count in this column; an empty bar plot would raise
        continue

    title = f'Distribution of {col}'
    if len(counts) > MAX_CATEGORIES:
        # Keep only the most frequent values and say so in the title
        title += f' (top {MAX_CATEGORIES} of {len(counts)})'
        counts = counts.head(MAX_CATEGORIES)

    plt.figure(figsize=(8, 4))
    counts.plot(kind='bar')
    plt.title(title)
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
