# Week 2 Day 1–2: Audio Feature Extraction

**Goals:**
1. Test MFCC feature extraction
2. Test Chroma feature extraction
3. Test Spectral feature extraction
4. Test Rhythm feature extraction
5. Visualize different types of features

**Dates:** November 6–7, 2025  
**Week 2 Day 1–2 Tasks**

---

## Day 1: MFCC Feature Extraction


### 1. Import required libraries


In [None]:
import sys
sys.path.append('..')

from src.features.traditional import extract_mfcc
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Optional: prefer fonts that ship CJK glyphs so non-ASCII plot labels render.
plt.rcParams.update(
    {
        'font.sans-serif': ['Microsoft YaHei', 'SimHei', 'Arial Unicode MS'],
        # Render the minus sign as an ASCII hyphen instead of U+2212.
        'axes.unicode_minus': False,
    }
)


### 2. Set test audio file


In [None]:
# Choose the clip used by all single-file experiments below.
# NOTE: DEAM song IDs are not contiguous; 1.mp3 does not exist.
# Available song IDs include: [2, 3, 4, 5, 7, 8, 10, 12, 13, 17, 18, 19, 20, ...]
test_song_id = 7  # must be an ID that actually exists on disk
test_audio = Path('../data/DEAM/DEAM_audio/MEMD_audio') / f'{test_song_id}.mp3'

# Report early whether the clip is present, so a bad ID is obvious here.
if not test_audio.exists():
    print(f"Test file not found: {test_audio}")
    print("Available song IDs include: 2, 3, 4, 5, 7, 8, 10, 12, 13, 17, ...")
    print("Please set test_song_id to an existing value.")
else:
    print(f"Test file found: {test_audio}")
    print(f"Song ID: {test_song_id}")


### 3. Extract MFCC features


In [None]:
# Run the project-level MFCC extractor on the test clip, requesting 20 coefficients.
mfcc_features = extract_mfcc(str(test_audio), n_mfcc=20)

# Print the shape of each aggregated statistic returned by the extractor.
print("\nMFCC feature shapes:")
for label, key in (
    ("MFCC mean", 'mfcc_mean'),
    ("MFCC std", 'mfcc_std'),
    ("MFCC delta mean", 'mfcc_delta_mean'),
):
    print(f"  - {label}: {mfcc_features[key].shape}")

print("\nFirst 5 MFCC mean coefficients:")
print(mfcc_features['mfcc_mean'][:5])


### 4. Visualize MFCC features


In [None]:
# Decode the test clip; librosa resamples it to the requested 22,050 Hz.
y, sr = librosa.load(str(test_audio), sr=22050)
duration_seconds = len(y) / sr
print(f"Audio duration: {duration_seconds:.2f} seconds")
print(f"Sample rate: {sr} Hz")

# Frame-level MFCC matrix, computed here only to draw the heatmap below.
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20, hop_length=512)

# Heatmap of the MFCC matrix over time. hop_length is repeated so the time axis
# is scaled with the same frame step that was used for the extraction above.
fig_mfcc, ax_mfcc = plt.subplots(figsize=(14, 6))
mfcc_mesh = librosa.display.specshow(
    mfcc,
    sr=sr,
    hop_length=512,
    x_axis='time',
    cmap='coolwarm',
    ax=ax_mfcc,
)
fig_mfcc.colorbar(mfcc_mesh, ax=ax_mfcc, label='MFCC value')
ax_mfcc.set_title(f'MFCC Feature Visualization (Song ID: {test_song_id})', fontsize=14)
ax_mfcc.set_xlabel('Time (seconds)', fontsize=12)
ax_mfcc.set_ylabel('MFCC coefficients', fontsize=12)
fig_mfcc.tight_layout()
plt.show()


### 5. Day 1 sanity checks


In [None]:
print("Day 1 sanity checks:\n")

# Each entry is (description, passed). The first and last entries are fixed to True:
# they only record that the notebook reached this cell.
checks = [
    ("Notebook created successfully", True),
    ("extract_mfcc() returns 3 feature entries", len(mfcc_features) == 3),
    *[
        (f"MFCC {label} is 20-dimensional", len(mfcc_features[key]) == 20)
        for label, key in (
            ("mean", 'mfcc_mean'),
            ("std", 'mfcc_std'),
            ("delta mean", 'mfcc_delta_mean'),
        )
    ],
    ("MFCC heatmap rendered without error", True),
]

# Numbered OK/FAIL report, one line per check.
for number, (description, passed) in enumerate(checks, start=1):
    print(f"[{'OK ' if passed else 'FAIL'}] {number}. {description}")

all_passed = all(passed for _, passed in checks)
print(
    "\nAll Day 1 checks passed."
    if all_passed
    else "\nSome Day 1 checks failed. Please review."
)


---

## Day 2: Other Feature Types

### 1. Chroma feature extraction


In [None]:
from src.features.traditional import extract_chroma

# Run the project-level chroma extractor on the test clip.
chroma_features = extract_chroma(str(test_audio))

# List every returned entry together with its array shape.
print("\nChroma feature shapes:")
for name in chroma_features:
    print(f"  - {name}: {chroma_features[name].shape}")


### 2. Visualize Chroma features


In [None]:
# Frame-level chromagram of the clip loaded earlier (reuses y and sr), for plotting only.
chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=512)

# Heatmap with time on the x-axis and pitch class on the y-axis.
fig_chroma, ax_chroma = plt.subplots(figsize=(14, 5))
chroma_mesh = librosa.display.specshow(
    chroma_stft,
    sr=sr,
    hop_length=512,
    x_axis='time',
    y_axis='chroma',
    cmap='viridis',
    ax=ax_chroma,
)
fig_chroma.colorbar(chroma_mesh, ax=ax_chroma, label='Intensity')
ax_chroma.set_title('Chroma STFT Feature Visualization', fontsize=14)
ax_chroma.set_xlabel('Time (seconds)')
ax_chroma.set_ylabel('Pitch class')
fig_chroma.tight_layout()
plt.show()


### 3. Spectral feature extraction


In [None]:
from src.features.traditional import extract_spectral

# Run the project-level spectral extractor on the test clip.
spectral_features = extract_spectral(str(test_audio))

# Arrays are summarised by shape; anything else is printed as a 4-decimal number.
print("\nSpectral features:")
for name, item in spectral_features.items():
    rendered = item.shape if isinstance(item, np.ndarray) else f"{item:.4f}"
    print(f"  - {name}: {rendered}")


### 4. Rhythm feature extraction


In [None]:
from src.features.traditional import extract_rhythm

# Run the project-level rhythm extractor on the test clip.
rhythm_features = extract_rhythm(str(test_audio))

print("\nRhythm features:")
for key, value in rhythm_features.items():
    print(f"  - {key}: {value}")

# Recent librosa releases return the tempo estimate as a 1-element array rather
# than a Python float, and numpy arrays reject the ".1f" format spec. Reducing the
# value to a plain float makes this line work for either representation.
# NOTE(review): whether extract_rhythm already unwraps the array is not visible here.
tempo_bpm = float(np.asarray(rhythm_features['tempo']).ravel()[0])
print(f"\nEstimated tempo: {tempo_bpm:.1f} BPM")


### 5. Combined feature extraction test


In [None]:
from src.features.traditional import extract_all_features

# Run the combined extractor, which returns one flat mapping of feature name -> value.
all_features = extract_all_features(str(test_audio))

print(f"\nExtracted {len(all_features)} aggregated features.")
print("\nAll features:")
print("=" * 60)

# Scalars are right-aligned with 4 decimals; any other value is printed as-is.
scalar_types = (int, float, np.floating)
for index, name in enumerate(all_features, start=1):
    item = all_features[name]
    rendered = f"{item:10.4f}" if isinstance(item, scalar_types) else f"{item}"
    print(f"{index:3d}. {name:30s}: {rendered}")


### 6. Day 1–2 summary


In [None]:
# Recap of the single-file experiments from Day 1 and Day 2.
# The en dash and multiplication sign were mis-encoded in the printed text
# and are restored here.
print("=" * 60)
print("Week 2 Day 1–2 Summary")
print("=" * 60)

print("\nCompleted tasks:")
print("  1. MFCC feature extraction and visualization")
print("  2. Chroma feature extraction and visualization")
print("  3. Spectral feature extraction")
print("  4. Rhythm feature extraction")
print("  5. Combined feature extraction test")

# The per-group sizes are hand-written approximations; only the total is computed.
print("\nFeature dimensionality (approximate):")
print("  - MFCC: 60 dimensions (20 × 3)")
print("  - Chroma: 48 dimensions (12 × 4)")
print("  - Spectral: ~22 dimensions")
print("  - Rhythm: 3 dimensions")
print(f"  - Total: {len(all_features)} aggregated dimensions")


---

## 🚀 Day 3: Batch Feature Extraction

**Goals:**
- Extract features for all 1,802 songs
- Save features to CSV for model training
- Verify data integrity

**Estimated time:** 30–60 minutes


In [None]:
# Import required libraries (can be skipped if previously imported)
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from src.features.traditional import extract_all_features


### 1. Setup paths and load annotations


In [None]:
# Project-relative locations of the DEAM audio, the song-level annotations,
# and the directory that receives the extracted feature tables.
BASE_DIR = Path('..')
AUDIO_DIR = BASE_DIR / "data" / "DEAM" / "DEAM_audio" / "MEMD_audio"
ANNOTATION_DIR = (
    BASE_DIR
    / "data" / "DEAM"
    / "DEAM_Annotations"
    / "annotations"
    / "annotations averaged per song"
    / "song_level"
)
OUTPUT_DIR = BASE_DIR / "data" / "processed"

# The static annotations are split over two CSV files; stack them into one table.
df1 = pd.read_csv(ANNOTATION_DIR / "static_annotations_averaged_songs_1_2000.csv")
df2 = pd.read_csv(ANNOTATION_DIR / "static_annotations_averaged_songs_2000_2058.csv")
df_annotations = pd.concat([df1, df2], ignore_index=True)
# Strip stray whitespace from the header names before looking columns up by name.
df_annotations.columns = df_annotations.columns.str.strip()
df_annotations = df_annotations.set_index('song_id')

print(f"Loaded {len(df_annotations)} annotations.\n")

# The two file names both mention song 2000. If an ID really occurs twice,
# df_annotations.loc[song_id, column] returns a Series instead of a scalar and
# would corrupt the label columns built later, so make that visible here.
duplicate_ids = int(df_annotations.index.duplicated().sum())
if duplicate_ids:
    print(f"Warning: {duplicate_ids} duplicated song_id value(s) in the annotations.\n")

print("Path check:")
print(f"  • Audio directory exists: {AUDIO_DIR.exists()}")
print(f"  • Audio file count: {sum(1 for _ in AUDIO_DIR.glob('*.mp3'))}")
print(f"  • Output directory (will be created if missing): {OUTPUT_DIR}")


### 2. Test mode – extract features for 10 songs

Run on a small subset first to ensure everything works as expected.


In [None]:
print("=" * 70)
print("Test mode: extract features for 10 songs")
print("=" * 70)

# Prepare test songs
print("\nPreparing test songs...")
# Order the clips by numeric song ID. Sorting the paths directly is lexicographic
# ("10.mp3" sorts before "2.mp3"), and capping the scan at a fixed number of files
# can leave fewer than 10 songs when several of them have no annotation row.
# Files whose stem is not a plain number are skipped instead of crashing int().
numbered_clips = sorted(
    (p for p in AUDIO_DIR.glob("*.mp3") if p.stem.isdecimal()),
    key=lambda p: int(p.stem),
)
test_songs = []
for audio_file in numbered_clips:
    song_id = int(audio_file.stem)
    if song_id not in df_annotations.index:
        continue
    test_songs.append(
        {
            'song_id': song_id,
            'audio_path': audio_file,
            'valence': df_annotations.loc[song_id, 'valence_mean'],
            'arousal': df_annotations.loc[song_id, 'arousal_mean'],
        }
    )
    if len(test_songs) >= 10:
        break

print(f"Prepared {len(test_songs)} test songs.")
print(f"Song IDs: {[s['song_id'] for s in test_songs]}")

# Batch extraction: a failure on one song is recorded and reported, not fatal.
results_test = []
failed_test = []

print("\nExtracting features for test songs...\n")
for song_info in tqdm(test_songs, desc="Test extraction", unit="song"):
    try:
        features = extract_all_features(str(song_info['audio_path']))
        features['song_id'] = song_info['song_id']
        features['valence'] = song_info['valence']
        features['arousal'] = song_info['arousal']
        results_test.append(features)
    except Exception as e:
        failed_test.append({'song_id': song_info['song_id'], 'error': str(e)})
        print(f"\nFailed: Song {song_info['song_id']} - {e}")

# Convert to DataFrame with the ID and label columns first. Only columns that
# exist are selected, so an empty result (every song failed) no longer raises.
df_test = pd.DataFrame(results_test)
label_cols = ['song_id', 'valence', 'arousal']
cols = [c for c in label_cols if c in df_test.columns] + [
    c for c in df_test.columns if c not in label_cols
]
df_test = df_test[cols]

# Save test result
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
test_output = OUTPUT_DIR / "deam_features_test.csv"
df_test.to_csv(test_output, index=False)

print("\n" + "=" * 70)
print("Test extraction finished.")
print("=" * 70)
print(f"Successful: {len(results_test)}/{len(test_songs)}")
print(f"Saved to: {test_output}")
print(f"Shape: {df_test.shape}")

print("\nPreview of the first 10 songs:")
if df_test.empty:
    print("No features were extracted; nothing to preview.")
else:
    display(
        df_test[
            ['song_id', 'valence', 'arousal', 'mfcc_mean_0', 'mfcc_mean_1', 'tempo']
        ].head(10)
    )


In [None]:
# Inspect the overall structure of the test feature table.
print(f"Test DataFrame shape: {df_test.shape}")

print(f"\nAll column names ({len(df_test.columns)} columns):")
print(df_test.columns.tolist())

# Everything except the ID and the two emotion labels counts as a feature column.
feature_cols = [
    c for c in df_test.columns if c not in ['song_id', 'valence', 'arousal']
]
print(f"\nNumber of feature columns: {len(feature_cols)}")

# Groups are identified by substring of the column name. The rhythm group is an
# explicit list of names, and it also counts the zero-crossing-rate columns.
rhythm_names = ['tempo', 'beat_count', 'beat_strength', 'zcr_mean', 'zcr_std']
print("\nFeature groups:")
print(f"  • MFCC: {sum('mfcc' in c for c in feature_cols)} dimensions")
print(f"  • Chroma: {sum('chroma' in c for c in feature_cols)} dimensions")
print(f"  • Spectral: {sum('spectral' in c for c in feature_cols)} dimensions")
print(f"  • Rhythm: {sum(c in rhythm_names for c in feature_cols)} dimensions")

print("\nFull feature vector for the first song:")
display(df_test.iloc[0])


### 3. Full batch extraction – all 1,802 songs

⚠️ **Important notes:**
- This step may take **30–60 minutes**
- Make sure your machine does not go to sleep
- Run only after the test mode has succeeded


In [None]:
import time

print("=" * 70)
print("Full batch extraction – all songs")
print("=" * 70)

# Create the output directory before the long-running loop. It was previously
# created only by the test-mode cell, so running this cell on its own could fail
# at the final to_csv() and lose the whole extraction run.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Prepare all songs in numeric song-ID order so the CSV row order is reproducible
# (glob order is filesystem-dependent). Non-numeric file names are skipped.
print("\nPreparing song list...")
numbered_clips = sorted(
    (p for p in AUDIO_DIR.glob("*.mp3") if p.stem.isdecimal()),
    key=lambda p: int(p.stem),
)
all_songs = []
for audio_file in numbered_clips:
    song_id = int(audio_file.stem)
    if song_id in df_annotations.index:
        all_songs.append(
            {
                'song_id': song_id,
                'audio_path': audio_file,
                'valence': df_annotations.loc[song_id, 'valence_mean'],
                'arousal': df_annotations.loc[song_id, 'arousal_mean'],
            }
        )

print(f"Found {len(all_songs)} valid songs.\n")

# Batch extraction: failures are collected and summarised after the loop.
print("Extracting features for all songs...\n")
start_time = time.time()

all_results = []
all_failed = []

for song_info in tqdm(all_songs, desc="Batch extraction", unit="song"):
    try:
        features = extract_all_features(str(song_info['audio_path']))
        features['song_id'] = song_info['song_id']
        features['valence'] = song_info['valence']
        features['arousal'] = song_info['arousal']
        all_results.append(features)
    except Exception as e:
        all_failed.append({'song_id': song_info['song_id'], 'error': str(e)})

elapsed = time.time() - start_time

# Save full result with the ID and label columns first. Only existing columns are
# selected, so an empty result does not raise a KeyError.
print("\nSaving final results...")
df_all = pd.DataFrame(all_results)
label_cols = ['song_id', 'valence', 'arousal']
cols = [c for c in label_cols if c in df_all.columns] + [
    c for c in df_all.columns if c not in label_cols
]
df_all = df_all[cols]

final_output = OUTPUT_DIR / "deam_features_all.csv"
df_all.to_csv(final_output, index=False)

# Guard both ratios against an empty song list or a zero elapsed time.
success_rate = len(all_results) / len(all_songs) * 100 if all_songs else 0.0
throughput = len(all_results) / (elapsed / 60) if elapsed > 0 else 0.0

# Summary statistics
print("\n" + "=" * 70)
print("Batch extraction summary")
print("=" * 70)
print(f"Total songs:      {len(all_songs)}")
print(f"Successfully processed: {len(all_results)}")
print(f"Failed:           {len(all_failed)}")
print(f"Success rate:     {success_rate:.1f}%")
print(f"\nElapsed time:     {elapsed / 60:.1f} minutes")
print(f"Throughput:       {throughput:.1f} songs/minute")
print(f"\nOutput file:      {final_output}")
print(f"Data shape:       {df_all.shape}")
print(f"File size:        {final_output.stat().st_size / 1024 / 1024:.2f} MB")

if all_failed:
    print(f"\nExamples of failed songs ({len(all_failed)} in total):")
    for fail in all_failed[:5]:
        print(f"  - Song {fail['song_id']}: {fail['error']}")
    if len(all_failed) > 5:
        print(f"  ... and {len(all_failed) - 5} more.")


### 4. Validate extracted features


In [None]:
# Re-read the CSV written by the batch cell so the checks reflect what is on disk.
output_file = OUTPUT_DIR / "deam_features_all.csv"

if output_file.exists():
    df_loaded = pd.read_csv(output_file)

    print("=" * 70)
    print("Feature file validation")
    print("=" * 70)

    print("\nFile information:")
    print(f"  Path:  {output_file}")
    print(f"  Size:  {output_file.stat().st_size / 1024 / 1024:.2f} MB")

    print("\nData shape:")
    print(f"  Rows (songs):       {len(df_loaded)}")
    print(f"  Columns (total):    {len(df_loaded.columns)}")
    print(f"  Feature dimensions: {len(df_loaded.columns) - 3} (excluding song_id, valence, arousal)")

    print("\nData quality:")
    print(f"  Total missing values:   {df_loaded.isnull().sum().sum()}")
    print(f"  Duplicate song_ids:     {df_loaded['song_id'].duplicated().sum()}")
    print(
        f"  Valence range:          [{df_loaded['valence'].min():.2f}, {df_loaded['valence'].max():.2f}]"
    )
    print(
        f"  Arousal range:          [{df_loaded['arousal'].min():.2f}, {df_loaded['arousal'].max():.2f}]"
    )

    # Groups are identified by substring of the column name. The rhythm group is
    # an explicit list of names, and it also counts the zero-crossing-rate columns.
    print("\nFeature group statistics:")
    feature_cols = [
        c for c in df_loaded.columns if c not in ['song_id', 'valence', 'arousal']
    ]
    rhythm_names = ['tempo', 'beat_count', 'beat_strength', 'zcr_mean', 'zcr_std']
    print(f"  • MFCC:    {sum('mfcc' in c for c in feature_cols)} dimensions")
    print(f"  • Chroma:  {sum('chroma' in c for c in feature_cols)} dimensions")
    print(f"  • Spectral:{sum('spectral' in c for c in feature_cols)} dimensions")
    print(f"  • Rhythm:  {sum(c in rhythm_names for c in feature_cols)} dimensions")

    # Mean and standard deviation of the two emotion labels; the Greek letters
    # were mis-encoded in the printed text and are restored here.
    print("\nLabel statistics:")
    print(
        f"  Valence: μ={df_loaded['valence'].mean():.2f}, σ={df_loaded['valence'].std():.2f}"
    )
    print(
        f"  Arousal: μ={df_loaded['arousal'].mean():.2f}, σ={df_loaded['arousal'].std():.2f}"
    )

    print("\nPreview of first 5 songs:")
    display(
        df_loaded[
            ['song_id', 'valence', 'arousal', 'mfcc_mean_0', 'tempo']
        ].head()
    )
else:
    print("Feature file not found. Please run the batch extraction cell first.")


### 5. Visualize feature distributions


In [None]:
# Six-panel overview of the extracted dataset: one label scatter, three feature
# histograms, and two feature-versus-label scatters.
if 'df_loaded' in locals():
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))

    # 1. Valence–arousal distribution, coloured by arousal. The title reports the
    #    number of rows actually loaded instead of a hard-coded song count.
    ax = axes[0, 0]
    scatter = ax.scatter(
        df_loaded['valence'],
        df_loaded['arousal'],
        c=df_loaded['arousal'],
        cmap='RdYlBu_r',
        alpha=0.5,
        s=20,
        edgecolors='none',
    )
    ax.set_xlabel('Valence', fontsize=11)
    ax.set_ylabel('Arousal', fontsize=11)
    ax.set_title(
        f'Emotion Distribution (All {len(df_loaded):,} Songs)',
        fontsize=12,
        fontweight='bold',
    )
    ax.grid(alpha=0.3)
    # Dashed guides at 5 on both axes split the plane into four quadrants.
    ax.axhline(5, color='gray', linestyle='--', alpha=0.3)
    ax.axvline(5, color='gray', linestyle='--', alpha=0.3)
    plt.colorbar(scatter, ax=ax, label='Arousal')

    # 2-4. Histograms with a dashed red line at the mean:
    #      (axes, column, bar colour, x label, title, unit suffix for the legend).
    histogram_panels = [
        (axes[0, 1], 'mfcc_mean_0', 'steelblue', 'MFCC[0] (Energy)',
         'Energy Distribution', ''),
        (axes[0, 2], 'mfcc_mean_1', 'coral', 'MFCC[1] (Brightness)',
         'Timbre Brightness Distribution', ''),
        (axes[1, 0], 'tempo', 'lightgreen', 'Tempo (BPM)',
         'Tempo Distribution', ' BPM'),
    ]
    for ax, column, color, xlabel, title, unit in histogram_panels:
        values = df_loaded[column]
        mean_value = values.mean()
        ax.hist(values, bins=50, color=color, alpha=0.7, edgecolor='black')
        ax.axvline(
            mean_value,
            color='red',
            linestyle='--',
            label=f"Mean: {mean_value:.1f}{unit}",
        )
        ax.set_xlabel(xlabel, fontsize=11)
        ax.set_ylabel('Frequency', fontsize=11)
        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.legend()
        ax.grid(alpha=0.3, axis='y')

    # 5-6. Feature-versus-label scatters:
    #      (axes, x column, y column, colour, x label, y label, title).
    scatter_panels = [
        (axes[1, 1], 'spectral_centroid_mean', 'valence', 'purple',
         'Spectral Centroid (Hz)', 'Valence', 'Brightness vs Valence'),
        (axes[1, 2], 'beat_strength', 'arousal', 'orange',
         'Beat Strength', 'Arousal', 'Beat Strength vs Arousal'),
    ]
    for ax, x_col, y_col, color, xlabel, ylabel, title in scatter_panels:
        ax.scatter(
            df_loaded[x_col],
            df_loaded[y_col],
            alpha=0.3,
            s=15,
            c=color,
            edgecolors='none',
        )
        ax.set_xlabel(xlabel, fontsize=11)
        ax.set_ylabel(ylabel, fontsize=11)
        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    print("Please load df_loaded first (run the validation cell).")


### 6. Feature correlation analysis


In [None]:
# Correlation heatmap for a hand-picked feature subset plus the two emotion labels,
# followed by the features most strongly correlated with each label.
if 'df_loaded' in locals():
    # Select key features for correlation analysis
    key_features = [
        'mfcc_mean_0',
        'mfcc_mean_1',
        'mfcc_std_0',
        'spectral_centroid_mean',
        'spectral_rolloff_mean',
        'spectral_bandwidth_mean',
        'tempo',
        'beat_strength',
        'zcr_mean',
        'valence',
        'arousal',
    ]

    # Pairwise correlation matrix (DataFrame.corr() defaults to Pearson).
    corr_matrix = df_loaded[key_features].corr()

    # Plot correlation heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        corr_matrix,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )
    plt.title('Feature Correlation Matrix', fontsize=14, fontweight='bold', pad=15)
    plt.tight_layout()
    plt.show()

    print("=" * 60)
    print("Feature correlations with emotion labels")
    print("=" * 60)

    # Rank by absolute correlation so a strong negative relationship is listed too.
    # Sorting by the signed value would only ever surface positive correlations.
    # The printed value keeps its sign. Note that the other emotion label is part
    # of key_features and can therefore appear in each ranking.
    print("\nTop correlations with Valence:")
    valence_corr = corr_matrix['valence'].drop('valence')
    valence_corr = valence_corr.reindex(
        valence_corr.abs().sort_values(ascending=False).index
    )
    for i, (feat, corr) in enumerate(valence_corr.head(5).items(), 1):
        print(f"  {i}. {feat:30s}: {corr:+.3f}")

    print("\nTop correlations with Arousal:")
    arousal_corr = corr_matrix['arousal'].drop('arousal')
    arousal_corr = arousal_corr.reindex(
        arousal_corr.abs().sort_values(ascending=False).index
    )
    for i, (feat, corr) in enumerate(arousal_corr.head(5).items(), 1):
        print(f"  {i}. {feat:30s}: {corr:+.3f}")

    print("\nInterpretation:")
    print("  • Positive correlation (> 0): higher feature value → higher emotion value.")
    print("  • Negative correlation (< 0): higher feature value → lower emotion value.")
    print("  • |correlation| > 0.3 is usually considered a moderate correlation.")
else:
    print("Please load df_loaded first (run the validation cell).")


---

## 📋 Feature Analysis Summary and Modeling Insights

### 🔍 Key Findings

#### 1️⃣ Feature distributions

- All features show reasonable distributions with no extreme outliers.
- Emotion labels (valence and arousal) are concentrated around the mid-range, which matches real-world music.
- No obvious feature extraction errors were observed.

#### 2️⃣ Multicollinearity

Highly redundant feature groups:

| Feature group                                      | Correlation | Note                    |
|----------------------------------------------------|------------:|-------------------------|
| `spectral_centroid` ↔ `spectral_rolloff`           |       0.98  | Almost fully redundant  |
| `spectral_centroid` ↔ `spectral_bandwidth`         |       0.89  | Highly correlated       |
| `mfcc_mean_1` ↔ spectral features                  |      -0.90  | Strong negative link    |
| `mfcc_mean_0` ↔ spectral features                  |  0.68–0.70  | Moderate correlation    |

Impact:

- Linear models may suffer from unstable coefficients and reduced interpretability.
- Feature importance may be spread across redundant features.
- Tree-based models (Random Forest, XGBoost) are less affected.

---

#### 3️⃣ Relationship between features and emotion

**Top correlations with Valence (pleasantness):**

| Rank | Feature name                | Corr. | Type         |
|------|-----------------------------|------:|--------------|
| 1    | `mfcc_mean_0`               | +0.59 | Energy       |
| 2    | `arousal`                   | +0.57 | Label (cross)|
| 3    | `spectral_rolloff_mean`     | +0.55 | Spectral     |
| 4    | `spectral_centroid_mean`    | +0.53 | Spectral     |
| 5    | `spectral_bandwidth_mean`   | +0.51 | Spectral     |

**Top correlations with Arousal (activation):**

| Rank | Feature name                | Corr. | Type         |
|------|-----------------------------|------:|--------------|
| 1    | `mfcc_mean_0`               | +0.58 | Energy       |
| 2    | `spectral_rolloff_mean`     | +0.57 | Spectral     |
| 3    | `valence`                   | +0.57 | Label (cross)|
| 4    | `spectral_centroid_mean`    | +0.55 | Spectral     |
| 5    | `spectral_bandwidth_mean`   | +0.51 | Spectral     |

**Key insights:**

1. `mfcc_mean_0` is a strong global feature:
   - Highest correlation with both valence and arousal.
   - Represents overall energy / loudness.
   - Should be kept in all models.

2. Spectral features (`spectral_rolloff`, `spectral_centroid`, `spectral_bandwidth`):
   - Very similar ranking and values.
   - Confirmed redundancy → we can keep just one representative feature.

3. Valence and arousal are moderately correlated (~ +0.57):
   - High-valence songs are often also high-arousal.
   - Low-valence songs are often low-arousal.

4. Weakly correlated but potentially useful features:
   - `tempo` has very weak linear correlation (~0.09) but may have nonlinear effects.
   - `beat_strength` has weak-moderate correlation (~0.22) and provides additional rhythmic information.

---

**Correlation magnitude interpretation:**

| \|corr\| range | Interpretation      | Example feature             |
|----------------|---------------------|-----------------------------|
| 0.50–0.70      | Moderate–strong     | `mfcc_mean_0`, spectral     |
| 0.30–0.50      | Moderate            | `zcr_mean` (~0.40)          |
| 0.10–0.30      | Weak                | `beat_strength` (~0.22)     |
| < 0.10         | Very weak / none    | `tempo` (~0.09)             |

---

### 🎯 Feature selection strategies

#### Strategy A: Manual feature selection (for linear models)

```python
# Keep representative, less-redundant features
selected_features = [
    # Energy (most important)
    'mfcc_mean_0', 'mfcc_std_0',

    # One representative spectral feature
    'spectral_centroid_mean',

    # Harmonic features (Chroma)
    'chroma_stft_mean_*',          # 12 pitch class features

    # Spectral contrast
    'spectral_contrast_mean_*',    # 7 bands

    # Rhythm features (independent information)
    'tempo', 'beat_strength',

    # Other independent features
    'zcr_mean',
]
```

Pros:
- Reduces multicollinearity.
- Improves linear model stability.
- Speeds up training.

Cons:
- May lose some information.

---

#### Strategy B: PCA-based dimensionality reduction

```python
from sklearn.decomposition import PCA

# Apply PCA to highly correlated groups
pca_spectral = PCA(n_components=1)   # spectral trio → 1 component
pca_mfcc = PCA(n_components=10)      # 20 MFCCs → 10 components
```

Pros:
- Keeps most variance.
- Automatically handles collinearity.

Cons:
- Reduces feature interpretability.

---

#### Strategy C: Model-based feature importance (tree models)

```python
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train, y_train)

importances = rf.feature_importances_
top_indices = np.argsort(importances)[::-1][:50]
```

Pros:
- Data-driven.
- Captures nonlinear relationships.

Cons:
- Requires training a model first.

---

#### Strategy D: Keep all features (for robust nonlinear models)

Tree-based models such as:

- Random Forest
- XGBoost
- LightGBM

can often cope with:

- Multicollinearity (by choosing splits)
- Implicit feature selection (via importance)

---

### 💡 Modeling plan for Week 2 Day 4–5

1. Baseline model (Linear Regression):
   - Use Strategy A (manual selection).
   - Remove highly collinear spectral features.
   - Keep `spectral_centroid_mean` as representative.

2. Advanced model (Random Forest):
   - Use Strategy D (all features).
   - Let the model learn feature importance.

3. Comparative experiments:
   - Compare performance across feature sets.
   - Evaluate the impact of multicollinearity.
   - Analyze feature importance rankings.

---

### ✅ Data quality summary

```
Extracted features successfully.
- 1,802 songs × ~133 feature dimensions
- No missing values, no obvious outliers
- Reasonable distributions for all features
- Clear correlation with emotion labels
- Ready for model training
```

---


### 7. Day 3 summary


In [None]:
# Closing recap for Days 1–3. The dashes and bullets were mis-encoded in the
# printed text and are restored here.
print("=" * 70)
print("Week 2 Day 1–3 Summary")
print("=" * 70)

print("\nCompleted tasks:")
print("  Day 1: MFCC feature extraction and visualization")
print("  Day 2: Chroma / Spectral / Rhythm feature extraction")
print("  Day 3: Batch feature extraction for 1,802 songs")
print("  Day 3: Saved features to CSV")
print("  Day 3: Validated data integrity")
print("  Day 3: Visualized feature distributions")
print("  Day 3: Performed feature correlation analysis")

# Dataset figures are shown only when the validation cell has loaded df_loaded.
if 'df_loaded' in locals():
    print("\nFinal dataset:")
    print(f"  • Number of songs:   {len(df_loaded)}")
    print(f"  • Feature dimensions:{len(df_loaded.columns) - 3}")
    print("  • Output file:       data/processed/deam_features_all.csv")
    print(
        "  • File size:         "
        f"{(OUTPUT_DIR / 'deam_features_all.csv').stat().st_size / 1024 / 1024:.2f} MB"
    )
else:
    print("\nFull dataset not found. Test data has been saved.")

print("\nNext steps (Week 2 Day 4–5):")
print("  • Data preprocessing and train/validation split")
print("  • Train baseline regression model (Linear Regression)")
print("  • Train Random Forest model")
print("  • Evaluate and compare models")
print("  • Visualize prediction results")
print("  • Analyze feature importance")

# List the files produced by this notebook. The full dataset is mentioned only
# if it is actually present on disk.
print("\nData files:")
print(f"  Test subset:    {OUTPUT_DIR / 'deam_features_test.csv'}")
if (OUTPUT_DIR / 'deam_features_all.csv').exists():
    print(f"  Full dataset:   {OUTPUT_DIR / 'deam_features_all.csv'}")
else:
    print("  Full dataset:   not generated yet.")
