In [1]:
import ast
import json
import pickle
import warnings
from collections import defaultdict

import numpy as np
import pandas as pd

In [2]:
# Read the cleaned track table and report its shape and column names.
df = pd.read_parquet("data/spotify_clean.parquet")
print(f"Loaded dataset: {df.shape}")
print(f"Columns: {list(df.columns)}")

# ID-to-index mapping stored next to the dataset
with open("data/id_to_index.json", "r") as mapping_file:
    id_to_index = json.loads(mapping_file.read())

# Row count; used as the length of every bitmap built from this frame.
n_samples = df.shape[0]

Loaded dataset: (169776, 19)
Columns: ['valence', 'year', 'acousticness', 'artists', 'danceability', 'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date', 'speechiness', 'tempo']


In [3]:
# 1. Categorical Metadata Indexes
# ============================================
print("\n=== Building Categorical Indexes ===")


def _parse_artists(raw):
    """Return the artist names held in one cell of the ``artists`` column.

    Accepted cell shapes:
      * a real container (list / tuple / set / ndarray) of names;
      * a string holding a Python list literal such as "['a', 'b']" - parsed
        with ``ast.literal_eval`` so names that contain commas or quotes
        stay intact;
      * any other string - handled with the legacy strip-and-split parsing;
      * a missing value - yields an empty list;
      * any other scalar - converted with ``str``.

    Empty names are dropped and surrounding whitespace is removed.
    """
    # Containers are tested first: pd.isna() on an array returns an array,
    # whose truth value is ambiguous.
    if isinstance(raw, (list, tuple, set, np.ndarray)):
        names = [str(name) for name in raw]
    elif pd.isna(raw):
        return []
    elif isinstance(raw, str):
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, (list, tuple, set)):
            names = [str(name) for name in parsed]
        else:
            # Not a list literal: fall back to removing brackets/quotes and
            # splitting on commas.
            stripped = raw.strip("[]'\"")
            names = [part.strip().strip("'\"") for part in stripped.split(',')]
    else:
        names = [str(raw)]
    return [name.strip() for name in names if name and name.strip()]


def _group_row_positions(values, cast):
    """Map ``cast(value)`` to the list of 0-based row positions holding it.

    Positions are appended in row order, so each list is ascending.
    """
    groups = defaultdict(list)
    for position, value in enumerate(values):
        groups[cast(value)].append(position)
    return dict(groups)


categorical_indexes = {}

# Artist index: artist name -> ascending row positions
if 'artists' in df.columns:
    artist_to_indices = defaultdict(set)
    for idx, raw_artists in enumerate(df['artists']):
        for artist in _parse_artists(raw_artists):
            # A set keeps a row from being listed twice when the same name
            # appears more than once in a single cell.
            artist_to_indices[artist].add(idx)

    # sorted() gives a reproducible order instead of set iteration order.
    categorical_indexes['artist'] = {
        artist: sorted(rows) for artist, rows in artist_to_indices.items()
    }
    print(f"Artist index: {len(artist_to_indices)} unique artists")

# Mode index (major/minor)
if 'mode' in df.columns:
    mode_to_indices = _group_row_positions(df['mode'], int)
    categorical_indexes['mode'] = mode_to_indices
    print(f"Mode index: {len(mode_to_indices)} modes")

# Key index
if 'key' in df.columns:
    key_to_indices = _group_row_positions(df['key'], int)
    categorical_indexes['key'] = key_to_indices
    print(f"Key index: {len(key_to_indices)} keys")

# Explicit content
if 'explicit' in df.columns:
    explicit_to_indices = _group_row_positions(df['explicit'], bool)
    categorical_indexes['explicit'] = explicit_to_indices
    print(f"Explicit index: {len(explicit_to_indices)} values")


=== Building Categorical Indexes ===
Artist index: 28678 unique artists
Mode index: 2 modes
Key index: 12 keys
Explicit index: 2 values


In [4]:
# 2. Numeric Range Indexes (Sorted Arrays)
# ============================================
print("\n=== Building Numeric Range Indexes ===")


def _sort_column(values):
    """Return ``(order, sorted_values)`` for a 1-D numeric column.

    ``order`` holds the 0-based row positions that sort ``values`` ascending.
    A stable sort is used so rows sharing a value keep their original
    relative order.
    """
    column = np.asarray(values)
    order = np.argsort(column, kind="stable")
    return order, column[order]


# (column name, label used in the printed summary, format spec, unit suffix)
_NUMERIC_COLUMNS = (
    ('year', 'Year', '', ''),
    ('tempo', 'Tempo', '.2f', ''),
    ('popularity', 'Popularity', '', ''),
    ('duration_ms', 'Duration', '', ' ms'),
)

numeric_indexes = {}

for _column, _label, _spec, _unit in _NUMERIC_COLUMNS:
    if _column not in df.columns:
        continue

    _order, _sorted_values = _sort_column(df[_column].values)
    # Stored as plain lists so the index can be written out as JSON.
    numeric_indexes[_column] = {
        'sorted_indices': _order.tolist(),
        'sorted_values': _sorted_values.tolist(),
    }

    if len(_sorted_values) == 0:
        print(f"{_label} range: n/a (empty column)")
        continue

    # The values are already sorted, so the range is the first/last element.
    _low, _high = _sorted_values[0], _sorted_values[-1]
    print(f"{_label} range: {_low:{_spec}} - {_high:{_spec}}{_unit}")


=== Building Numeric Range Indexes ===
Year range: 1921 - 2020
Tempo range: 0.00 - 1.00
Popularity range: 0.0 - 1.0
Duration range: 15307 - 899933 ms

=== Creating Bitmap Utilities ===
Testing bitmap utilities...
Mode=1 bitmap: 120039 tracks out of 169776


In [None]:
# 3. Bitmap Representation for Fast Filtering
# ============================================
# This cell defines two helpers, create_bitmap_from_indices and
# indices_in_range, and then smoke-tests the former against the mode index.
# NOTE(review): the cell is saved without an execution count even though its
# output appears in the notebook - re-run top to bottom before relying on the
# saved outputs.
print("\n=== Creating Bitmap Utilities ===")

def create_bitmap_from_indices(indices, total_size):
    """Create a boolean bitmap with ``True`` at the given row positions.

    Args:
        indices: Row positions to flag. Lists and integer ndarrays are used
            as-is; sets, frozensets and tuples are converted to a list first
            because numpy cannot index with a set and treats a tuple as a
            multi-dimensional index.
        total_size: Length of the returned bitmap.

    Returns:
        A 1-D ``bool`` ndarray of length ``total_size``.

    Raises:
        IndexError: If a position lies outside ``[-total_size, total_size)``.
    """
    bitmap = np.zeros(total_size, dtype=bool)

    if isinstance(indices, (set, frozenset, tuple)):
        indices = list(indices)

    # An empty ndarray defaults to float64, which numpy rejects as an index,
    # so empty selections are short-circuited instead of being applied.
    is_empty = (isinstance(indices, list) and not indices) or (
        isinstance(indices, np.ndarray) and indices.size == 0
    )
    if not is_empty:
        bitmap[indices] = True
    return bitmap

def indices_in_range(sorted_indices, sorted_values, min_val, max_val):
    """Binary-search a sorted column for rows whose value is in a closed range.

    Args:
        sorted_indices: Row positions ordered so that they line up with
            ``sorted_values``. May be a list or an ndarray.
        sorted_values: The column values in ascending order (list or ndarray).
        min_val: Inclusive lower bound.
        max_val: Inclusive upper bound.

    Returns:
        A list of row positions whose value ``v`` satisfies
        ``min_val <= v <= max_val``, in sorted-value order. Empty when
        ``min_val > max_val`` or nothing matches.
    """
    # Coerce to arrays so the stored list form of the index works too;
    # slicing a plain list has no .tolist().
    order = np.asarray(sorted_indices)
    values = np.asarray(sorted_values)

    # side='left' / side='right' make both bounds inclusive.
    left = np.searchsorted(values, min_val, side='left')
    right = np.searchsorted(values, max_val, side='right')
    return order[left:right].tolist()

# Smoke test: build the bitmap for mode == 1 and count the flagged rows
print("Testing bitmap utilities...")
if 'mode' in categorical_indexes:
    mode_one_rows = categorical_indexes['mode'].get(1, [])
    test_bitmap = create_bitmap_from_indices(mode_one_rows, n_samples)
    print(f"Mode=1 bitmap: {test_bitmap.sum()} tracks out of {n_samples}")

In [5]:
# 4. Create Filter Helper Class
# ============================================

class MetadataFilter:
    """Helper class for efficient metadata filtering.

    Turns a dict of filter conditions into a boolean bitmap over row
    positions by AND-ing one bitmap per condition.

    Attributes:
        categorical_indexes: ``{field: {value: [row positions]}}``.
        numeric_indexes: ``{field: {'sorted_indices': [...],
            'sorted_values': [...]}}`` where ``sorted_values`` is ascending
            and ``sorted_indices`` holds the matching row positions.
        n_samples: Length of every bitmap produced.

    The class does not rely on any module-level helper, so an instance only
    needs this class definition to work.
    """

    def __init__(self, categorical_indexes, numeric_indexes, n_samples):
        self.categorical_indexes = categorical_indexes
        self.numeric_indexes = numeric_indexes
        self.n_samples = n_samples

    @staticmethod
    def _is_range(value):
        """True when ``value`` is a two-element list or tuple."""
        return isinstance(value, (list, tuple)) and len(value) == 2

    def _categorical_bitmap(self, key, value):
        """Bitmap of rows whose ``key`` equals any of the requested values."""
        index = self.categorical_indexes[key]
        # A bare scalar means "exactly this value"; containers mean "any of".
        if isinstance(value, (list, tuple, set, frozenset, np.ndarray)):
            wanted = list(value)
        else:
            wanted = [value]

        bitmap = np.zeros(self.n_samples, dtype=bool)
        for val in wanted:
            rows = index.get(val)
            if rows is not None and len(rows):
                bitmap[np.asarray(rows, dtype=np.intp)] = True
        return bitmap

    def _numeric_bitmap(self, key, min_val, max_val):
        """Bitmap of rows with ``min_val <= value <= max_val`` for ``key``.

        ``None`` for either bound leaves that side of the range open.
        """
        entry = self.numeric_indexes[key]
        # asarray avoids a copy when the index already stores ndarrays.
        order = np.asarray(entry['sorted_indices'], dtype=np.intp)
        values = np.asarray(entry['sorted_values'])

        start = 0 if min_val is None else np.searchsorted(values, min_val, side='left')
        stop = len(values) if max_val is None else np.searchsorted(values, max_val, side='right')

        bitmap = np.zeros(self.n_samples, dtype=bool)
        bitmap[order[start:stop]] = True
        return bitmap

    def apply_filter(self, filter_conditions):
        """
        Apply filter conditions and return a boolean bitmap.

        filter_conditions format:
        {
            'artist': ['Coldplay', 'Radiohead'],   # any of these values
            'year': (2000, 2020),                  # inclusive (min, max)
            'tempo': (80, 150),
            'mode': [1]
        }

        Categorical values may be a single value or a list/tuple/set of
        values. Numeric ranges may be a tuple or a list of two bounds, in the
        same units as the stored ``sorted_values``; ``None`` leaves a bound
        open. Conditions are combined with logical AND.

        A condition on a field that has no index, or a numeric condition that
        is not a two-element range, is skipped with a warning rather than
        silently ignored. An empty or ``None`` ``filter_conditions`` selects
        every row.
        """
        # Start with all True
        bitmap = np.ones(self.n_samples, dtype=bool)
        if not filter_conditions:
            return bitmap

        for key, value in filter_conditions.items():
            if key in self.categorical_indexes:
                bitmap &= self._categorical_bitmap(key, value)
            elif key in self.numeric_indexes:
                if not self._is_range(value):
                    warnings.warn(
                        f"Ignoring filter on {key!r}: expected a (min, max) "
                        f"pair, got {value!r}",
                        stacklevel=2,
                    )
                    continue
                bitmap &= self._numeric_bitmap(key, value[0], value[1])
            else:
                warnings.warn(
                    f"Ignoring filter on {key!r}: no index exists for this field",
                    stacklevel=2,
                )

        return bitmap

    def get_valid_indices(self, filter_conditions):
        """Return the ascending list of row positions that pass the filter."""
        bitmap = self.apply_filter(filter_conditions)
        return np.flatnonzero(bitmap).tolist()

    def get_selectivity(self, filter_conditions):
        """Return selectivity (fraction of rows passing the filter).

        Returns 0.0 when ``n_samples`` is 0 instead of dividing by zero.
        """
        if self.n_samples == 0:
            return 0.0
        bitmap = self.apply_filter(filter_conditions)
        return bitmap.sum() / self.n_samples

In [6]:
# 5. Save Metadata Indexes
# ============================================
print("\n=== Saving Metadata Indexes ===")

saved_paths = []

# JSON copies (for readability).
# NOTE: JSON object keys are always strings, so the int/bool keys of the
# categorical indexes come back as strings when these files are re-loaded.
for json_path, payload in (
    ("data/categorical_indexes.json", categorical_indexes),
    ("data/numeric_indexes.json", numeric_indexes),
):
    with open(json_path, "w") as f:
        json.dump(payload, f)
    saved_paths.append(json_path)

# Pickle a ready-to-use filter object built from both indexes
metadata_filter = MetadataFilter(categorical_indexes, numeric_indexes, n_samples)
pickle_path = "data/metadata_filter.pkl"
with open(pickle_path, "wb") as f:
    pickle.dump(metadata_filter, f)
saved_paths.append(pickle_path)

print("Saved metadata indexes:")
for saved_path in saved_paths:
    print(f"  - {saved_path}")


=== Saving Metadata Indexes ===
Saved metadata indexes:
  - data/categorical_indexes.json
  - data/numeric_indexes.json
  - data/metadata_filter.pkl


In [7]:
# 6. Test Filtering
# ============================================
print("\n=== Testing Metadata Filtering ===")

# Build the example filter only from indexes that actually exist
test_filters = {}

# Year condition: the 2010-2020 window, clamped to the years in the index
if 'year' in numeric_indexes:
    year_values = np.array(numeric_indexes['year']['sorted_values'])
    year_min = int(year_values.min())
    year_max = int(year_values.max())
    window_start = year_min if year_min > 2010 else 2010
    window_end = year_max if year_max < 2020 else 2020
    test_filters['year'] = (window_start, window_end)

# Mode condition
if 'mode' in categorical_indexes:
    test_filters['mode'] = [1]  # Major key

if not test_filters:
    print("Skipping test - no suitable filter conditions available")
else:
    valid_indices = metadata_filter.get_valid_indices(test_filters)
    selectivity = metadata_filter.get_selectivity(test_filters)

    print(f"Test filter: {test_filters}")
    print(f"Valid tracks: {len(valid_indices)} ({selectivity*100:.2f}%)")
    print(f"Example valid indices: {valid_indices[:10]}")

print("\n=== Metadata Indexing Complete ===")


=== Testing Metadata Filtering ===
Test filter: {'year': (2010, 2020), 'mode': [1]}
Valid tracks: 13940 (8.21%)
Example valid indices: [17552, 17554, 17557, 17558, 17559, 17564, 17565, 17566, 17567, 17568]

=== Metadata Indexing Complete ===
