# 01 - Data Loading & Exploration

**Purpose:** Load the Spotify dataset, understand its structure, and prepare it for analysis.

- Data acquisition
- Structure and schema overview
- Data quality checks
- Save cleaned dataset for subsequent notebooks

In [None]:
import sys
from pathlib import Path

# Resolve the project layout relative to the notebook's working directory:
# the project root is taken to be the parent of the current directory.
project_root = Path.cwd().parent
data_path = project_root / "data"

# Put the data directory on the import path so modules stored there can be
# imported. The membership check keeps the cell idempotent: re-running it
# no longer appends the same entry to sys.path again.
if str(data_path) not in sys.path:
    sys.path.append(str(data_path))

In [None]:
# Project-local helper; presumably resolved through the data/ directory that
# the setup cell appended to sys.path — TODO confirm where spotify_data lives.
from spotify_data import download_spotify_dataset

# Presumably fetches the raw dataset and returns the directory it was written
# to — verify against spotify_data.
# NOTE(review): data_dir is not referenced by the later cells visible in this
# notebook; the load step reads from a hard-coded relative path instead.
data_dir = download_spotify_dataset()

## Load Raw Data

In [None]:
import pandas as pd

# Location of the raw CSV, relative to the notebook's working directory.
csv_path = Path("../data/raw/spotify_analysis_dataset.csv")

# Read the whole file into memory and report its dimensions.
df = pd.read_csv(csv_path)
row_count, column_count = df.shape
print(f"Dataset shape: {row_count:,} rows, {column_count} columns")

# Preview the first rows as the cell's rich output.
df.head()

## Dataset Structure

In [None]:
# Print a concise summary of the frame: per-column dtype and non-null count.
df.info()

In [None]:
# List every column with its dtype, one "name: dtype" line per column.
print("Column names and types:")
print("-" * 40)
for column_name, column_dtype in df.dtypes.items():
    print(f"{column_name}: {column_dtype}")

## Data Quality Checks

In [None]:
# Count nulls per column, both as an absolute number and as a percentage of
# all rows (rounded to two decimals).
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100).round(2)

# Keep only the columns that actually contain at least one null.
missing_df = pd.DataFrame(
    {'missing_count': missing, 'missing_pct': missing_pct}
).loc[lambda report: report['missing_count'] > 0]

if missing_df.empty:
    print("No missing values found.")
else:
    print("Columns with missing values:")
    print(missing_df)

In [None]:
# Rows that repeat an already-seen track_id count as duplicates.
key_columns = ['track_id']
duplicates = df.duplicated(subset=key_columns).sum()
print(f"Duplicate track_ids: {duplicates:,}")

# The count is never negative, so truthiness is the same test as "> 0".
if duplicates:
    print("Removing duplicates...")
    # Keeps the first occurrence of each track_id (pandas' default).
    df = df.drop_duplicates(subset=key_columns)
    print(f"New shape: {df.shape[0]:,} rows")

## Dataset Summary

In [None]:
# Headline figures: distinct tracks, artists and albums, plus the span of
# release dates.
print("Dataset Overview")
print("=" * 40)

distinct_counts = (
    ("Total tracks", 'track_id'),
    ("Unique artists", 'artist'),
    ("Unique albums", 'album'),
)
for label, column in distinct_counts:
    print(f"{label}: {df[column].nunique():,}")

# min()/max() are taken on the column as loaded, before any datetime parsing.
release_dates = df['release_date']
print(f"Date range: {release_dates.min()} to {release_dates.max()}")

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max); with default
# arguments pandas reports these for the numeric columns of a mixed-type frame.
df.describe()

## Prepare Data for Analysis

Add derived columns that will be used in subsequent notebooks.

In [None]:
# Parse release dates and derive the release year from them.
# NOTE(review): assumes every release_date value is parseable by
# pd.to_datetime with a single inferred format — confirm against the raw data.
df['release_date'] = pd.to_datetime(df['release_date'])
df['release_year'] = df['release_date'].dt.year

# Track length in minutes (60,000 ms per minute).
df['duration_min'] = df['duration_ms'] / 60000

# Bucket popularity into three tiers: [0, 33], (33, 66], (66, 100].
# pd.cut builds right-closed intervals, so without include_lowest=True the
# first bin is (0, 33] and a popularity of exactly 0 gets NaN instead of
# 'Low', silently dropping those tracks from the tier counts below.
df['popularity_tier'] = pd.cut(
    df['popularity'],
    bins=[0, 33, 66, 100],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True,
)

print("Added columns: release_year, duration_min, popularity_tier")
print("\nPopularity tier distribution:")
print(df['popularity_tier'].value_counts())

## Save Processed Data

In [None]:
# Output location for the cleaned dataset, relative to the notebook's
# working directory; create the directory if it does not exist yet.
processed_path = Path("../data/processed")
output_file = processed_path / "spotify_cleaned.csv"
processed_path.mkdir(parents=True, exist_ok=True)

# index=False leaves the DataFrame's row index out of the file.
df.to_csv(output_file, index=False)

saved_rows, saved_columns = df.shape
print(f"Saved processed data to: {output_file}")
print(f"Shape: {saved_rows:,} rows, {saved_columns} columns")

---
**Next:** [02_feature_analysis.ipynb](02_feature_analysis.ipynb) - Deep dive into audio features and popularity correlations