In this notebook we will do feature analysis but only on the training set, to avoid any leakage.
We will then also prepare and standardize the test set.
The output of this notebook will be the scaled x_train

In [1]:
# Capturing necessary libraries
import mido
import numpy as np
import os
import glob
import pandas as pd
from collections import Counter
from sklearn.preprocessing import StandardScaler

In [4]:
# Loading our test and training folders
composers = ['Mozart', 'Chopin', 'Bach', 'Beethoven']
train_files = {c: glob.glob(f'data/train/{c}/*.mid*') for c in composers}
test_files = {c: glob.glob(f'data/test/{c}/*.mid*') for c in composers}

In [5]:
# Creating a function to extract 20 features for every music track

# Note: For the following function, Claude Sonnet 4.0 was used on 7/21/25 to identify the 20 features to extract.
def extract_features(file_path):
   
    try:
        mid = mido.MidiFile(file_path)
        features = {}
        
        # Collecting musical data
        notes = []
        velocities = []
        note_durations = []
        time_between_notes = []
        
        for track in mid.tracks:
            current_time = 0
            note_on_times = {}
            last_note_time = 0
            
            for msg in track:
                current_time += msg.time
                
                if msg.type == 'note_on' and msg.velocity > 0:
                    notes.append(msg.note)
                    velocities.append(msg.velocity)
                    note_on_times[msg.note] = current_time
                    
                    if last_note_time > 0:
                        time_between_notes.append(current_time - last_note_time)
                    last_note_time = current_time
                    
                elif msg.type == 'note_off' or (msg.type == 'note_on' and msg.velocity == 0):
                    if msg.note in note_on_times:
                        duration = current_time - note_on_times[msg.note]
                        note_durations.append(duration)
                        del note_on_times[msg.note]
        
        if not notes:
            return None
        
        # 20 KEY FEATURES FOR COMPOSER CLASSIFICATION
        
        # 1-4: Basic song structure
        features['duration'] = mid.length
        features['num_tracks'] = len(mid.tracks)
        features['num_notes'] = len(notes)
        features['note_density'] = len(notes) / max(mid.length, 1)
        
        # 5-8: Pitch characteristics
        features['pitch_mean'] = np.mean(notes)
        features['pitch_std'] = np.std(notes)
        features['pitch_range'] = max(notes) - min(notes)
        features['pitch_min'] = min(notes)
        
        # 9-11: Melodic movement
        if len(notes) > 1:
            intervals = [notes[i+1] - notes[i] for i in range(len(notes)-1)]
            features['interval_mean'] = np.mean(intervals)
            features['ascending_ratio'] = sum(1 for x in intervals if x > 0) / len(intervals)
            features['large_leaps_ratio'] = sum(1 for x in intervals if abs(x) > 4) / len(intervals)
        else:
            features['interval_mean'] = 0
            features['ascending_ratio'] = 0
            features['large_leaps_ratio'] = 0
        
        # 12-15: Rhythm and timing
        if note_durations:
            features['note_duration_mean'] = np.mean(note_durations)
            features['note_duration_std'] = np.std(note_durations)
        else:
            features['note_duration_mean'] = 0
            features['note_duration_std'] = 0
            
        if time_between_notes:
            features['time_between_mean'] = np.mean(time_between_notes)
            features['time_between_std'] = np.std(time_between_notes)
        else:
            features['time_between_mean'] = 0
            features['time_between_std'] = 0
        
        # 16-18: Dynamics (velocity)
        if velocities:
            features['velocity_mean'] = np.mean(velocities)
            features['velocity_std'] = np.std(velocities)
            features['velocity_range'] = max(velocities) - min(velocities)
        else:
            features['velocity_mean'] = 0
            features['velocity_std'] = 0
            features['velocity_range'] = 0
        
        # 19-20: Harmonic content (simplified)
        pitch_classes = [note % 12 for note in notes]
        features['most_common_pitch_class'] = Counter(pitch_classes).most_common(1)[0][0]
        features['pitch_class_diversity'] = len(set(pitch_classes))
        
        return features
        
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Test the function
print("Testing 20-feature extraction...")
test_features = extract_features(train_files['Mozart'][0])
print(f"Number of features extracted: {len(test_features)}")
print("\nAll 20 features:")
for i, (key, value) in enumerate(test_features.items(), 1):
    print(f"{i:2d}. {key}: {value}")

Testing 20-feature extraction...
Number of features extracted: 20

All 20 features:
 1. duration: 334.2413769874992
 2. num_tracks: 15
 3. num_notes: 2900
 4. note_density: 8.6763644469681
 5. pitch_mean: 62.69931034482759
 6. pitch_std: 8.679500196572771
 7. pitch_range: 52
 8. pitch_min: 34
 9. interval_mean: -0.006553984132459468
10. ascending_ratio: 0.29975853742669883
11. large_leaps_ratio: 0.17868230424284237
12. note_duration_mean: 157.4865424430642
13. note_duration_std: 188.7591678724114
14. time_between_mean: 265.32065029401593
15. time_between_std: 811.0955621779974
16. velocity_mean: 64.87517241379311
17. velocity_std: 17.254992405136203
18. velocity_range: 87
19. most_common_pitch_class: 0
20. pitch_class_diversity: 12


In [6]:
# Now applying feature extraction to all training files
print("Extracting features from all training files...")

all_features = []
failed_files = []

for composer, files in train_files.items():
    print(f"Processing {composer}: {len(files)} files")
    successful = 0
    
    for file_path in files:
        features = extract_features(file_path)
        
        if features:
            features['composer'] = composer
            features['file_path'] = file_path
            all_features.append(features)
            successful += 1
        else:
            failed_files.append(file_path)
    
    print(f"  Successfully processed: {successful}/{len(files)} files")

print(f"\nOverall Results:")
print(f"Total files processed successfully: {len(all_features)}")
print(f"Total files failed: {len(failed_files)}")

if failed_files:
    print(f"Failed files: {len(failed_files)}")

# Showing summary by composer
print(f"\nFiles per composer:")
composer_counts = {}
for feature_dict in all_features:
    composer = feature_dict['composer']
    composer_counts[composer] = composer_counts.get(composer, 0) + 1

for composer, count in composer_counts.items():
    print(f"  {composer}: {count} files")

Extracting features from all training files...
Processing Mozart: 86 files
  Successfully processed: 86/86 files
Processing Chopin: 132 files
  Successfully processed: 132/132 files
Processing Bach: 136 files
  Successfully processed: 136/136 files
Processing Beethoven: 127 files
Error processing data/train/Beethoven/Anhang 14-3.mid: Could not decode key with 3 flats and mode 255
  Successfully processed: 126/127 files

Overall Results:
Total files processed successfully: 480
Total files failed: 1
Failed files: 1

Files per composer:
  Mozart: 86 files
  Chopin: 132 files
  Bach: 136 files
  Beethoven: 126 files


In [7]:
# Converting the list of feature dictionaries into a pandas DataFrame
training_features = pd.DataFrame(all_features)
print(f"Created DataFrame with {len(training_features)} rows and {len(training_features.columns)} columns")

# Checking out the first few rows
training_features.head()

Created DataFrame with 480 rows and 22 columns


Unnamed: 0,duration,num_tracks,num_notes,note_density,pitch_mean,pitch_std,pitch_range,pitch_min,interval_mean,ascending_ratio,...,note_duration_std,time_between_mean,time_between_std,velocity_mean,velocity_std,velocity_range,most_common_pitch_class,pitch_class_diversity,composer,file_path
0,334.241377,15,2900,8.676364,62.69931,8.6795,52,34,-0.006554,0.299759,...,188.759168,265.32065,811.095562,64.875172,17.254992,87,0,12,Mozart,data/train/Mozart/K191 Bassoon Concerto 2mov.mid
1,192.003603,6,1288,6.708207,74.009317,5.957138,28,60,0.009324,0.602953,...,359.59845,165.720062,212.641001,62.0,22.0,44,7,12,Mozart,data/train/Mozart/K617 Adagio.mid
2,199.974998,4,1697,8.486061,65.923394,9.7461,40,43,-0.005307,0.399175,...,38.798861,47.234495,55.812359,88.0,0.0,0,2,12,Mozart,data/train/Mozart/K393 Solfeggi n1.mid
3,56.025918,4,260,4.640709,63.407692,9.296564,40,41,-0.138996,0.501931,...,146.971262,265.78125,168.01511,64.0,0.0,0,0,9,Mozart,data/train/Mozart/K02 Minuet in F.mid
4,59.25695,3,378,6.378999,63.460317,12.338041,41,40,-0.082228,0.366048,...,51.184995,80.053476,80.420003,127.0,0.0,0,9,10,Mozart,data/train/Mozart/Contradance n2.mid


In [8]:
# Dropping the file path column (perhaps unnecessarily) to prevent contamination during training
training_features.drop('file_path', axis=1, inplace=True)

# Create a mapping from composer names to numbers for later scaling purposes
composer_mapping = {
    'Bach': 0,
    'Beethoven': 1,
    'Chopin': 2,
    'Mozart': 3
}

# Replacing the composer names with their numbers
training_features = training_features.replace(composer_mapping)
training_features.head()

  training_features = training_features.replace(composer_mapping)


Unnamed: 0,duration,num_tracks,num_notes,note_density,pitch_mean,pitch_std,pitch_range,pitch_min,interval_mean,ascending_ratio,...,note_duration_mean,note_duration_std,time_between_mean,time_between_std,velocity_mean,velocity_std,velocity_range,most_common_pitch_class,pitch_class_diversity,composer
0,334.241377,15,2900,8.676364,62.69931,8.6795,52,34,-0.006554,0.299759,...,157.486542,188.759168,265.32065,811.095562,64.875172,17.254992,87,0,12,3
1,192.003603,6,1288,6.708207,74.009317,5.957138,28,60,0.009324,0.602953,...,469.959807,359.59845,165.720062,212.641001,62.0,22.0,44,7,12,3
2,199.974998,4,1697,8.486061,65.923394,9.7461,40,43,-0.005307,0.399175,...,38.985857,38.798861,47.234495,55.812359,88.0,0.0,0,2,12,3
3,56.025918,4,260,4.640709,63.407692,9.296564,40,41,-0.138996,0.501931,...,170.353846,146.971262,265.78125,168.01511,64.0,0.0,0,0,9,3
4,59.25695,3,378,6.378999,63.460317,12.338041,41,40,-0.082228,0.366048,...,91.746032,51.184995,80.053476,80.420003,127.0,0.0,0,9,10,3


In [9]:
# Assigning x and y as the dataframes for variables and results
x_train = training_features.drop(columns=['composer'])
y_train = training_features['composer']

# Printing the shape of x and y:
print("Shape of x_train:", x_train.shape)
print("Shape of y_train:", y_train.shape)

Shape of x_train: (480, 20)
Shape of y_train: (480,)


In [10]:
# Creating a StandardScaler object
scaler = StandardScaler()

# Fitting the scaler to the dataframe x_train
scaler.fit(x_train)

# Transforming x_train and to apply standardization
x_train_scaled_array = scaler.transform(x_train)

# Converting the scaled array back into a dataframe so as to calculate individual means and standard deviations
x_train_scaled = pd.DataFrame(x_train_scaled_array, columns=x_train.columns)

# Computing the means and the standard deviations for each column
x_train_column_means = x_train_scaled.mean(axis=0)  
x_train_column_stds = x_train_scaled.std(axis=0) 

# Printing the means and std devs for each column, rounding to 6 digits
for i, (mean, std) in enumerate(zip(x_train_column_means, x_train_column_stds)):
    print(f"Column {i+1}: Mean = {mean:.6f}, Std Dev = {std:.6f}")

Column 1: Mean = -0.000000, Std Dev = 1.001043
Column 2: Mean = 0.000000, Std Dev = 1.001043
Column 3: Mean = -0.000000, Std Dev = 1.001043
Column 4: Mean = 0.000000, Std Dev = 1.001043
Column 5: Mean = 0.000000, Std Dev = 1.001043
Column 6: Mean = -0.000000, Std Dev = 1.001043
Column 7: Mean = -0.000000, Std Dev = 1.001043
Column 8: Mean = 0.000000, Std Dev = 1.001043
Column 9: Mean = -0.000000, Std Dev = 1.001043
Column 10: Mean = 0.000000, Std Dev = 1.001043
Column 11: Mean = -0.000000, Std Dev = 1.001043
Column 12: Mean = 0.000000, Std Dev = 1.001043
Column 13: Mean = -0.000000, Std Dev = 1.001043
Column 14: Mean = 0.000000, Std Dev = 1.001043
Column 15: Mean = -0.000000, Std Dev = 1.001043
Column 16: Mean = -0.000000, Std Dev = 1.001043
Column 17: Mean = 0.000000, Std Dev = 1.001043
Column 18: Mean = -0.000000, Std Dev = 1.001043
Column 19: Mean = -0.000000, Std Dev = 1.001043
Column 20: Mean = -0.000000, Std Dev = 1.001043


Looks like we're all set for the training data. Now to apply to same standardization to the test data.

In [11]:
# Applying feature extraction to all test files
print("Extracting features from all test files...")
all_test_features = []
failed_test_files = []

for composer, files in test_files.items():
    print(f"Processing {composer}: {len(files)} files")
    successful = 0
    
    for file_path in files:
        features = extract_features(file_path)
        
        if features:
            features['composer'] = composer
            features['file_path'] = file_path
            all_test_features.append(features)
            successful += 1
        else:
            failed_test_files.append(file_path)
    
    print(f"  Successfully processed: {successful}/{len(files)} files")

print(f"\nOverall Test Results:")
print(f"Total test files processed successfully: {len(all_test_features)}")
print(f"Total test files failed: {len(failed_test_files)}")

if failed_test_files:
    print(f"Failed test files: {len(failed_test_files)}")

# Show summary by composer for test files
print(f"\nTest files per composer:")
test_composer_counts = {}
for feature_dict in all_test_features:
    composer = feature_dict['composer']
    test_composer_counts[composer] = test_composer_counts.get(composer, 0) + 1

for composer, count in test_composer_counts.items():
    print(f"  {composer}: {count} files")

Extracting features from all test files...
Processing Mozart: 32 files
  Successfully processed: 32/32 files
Processing Chopin: 51 files
  Successfully processed: 51/51 files
Processing Bach: 52 files
  Successfully processed: 52/52 files
Processing Beethoven: 44 files
  Successfully processed: 44/44 files

Overall Test Results:
Total test files processed successfully: 179
Total test files failed: 0

Test files per composer:
  Mozart: 32 files
  Chopin: 51 files
  Bach: 52 files
  Beethoven: 44 files


In [14]:
# Converting the list of test feature dictionaries into a DataFrame
test_features = pd.DataFrame(all_test_features)
print(f"Created DataFrame with {len(test_features)} rows and {len(test_features.columns)} columns")

# Checking out the first few rows
test_features.head()

Created DataFrame with 179 rows and 22 columns


Unnamed: 0,duration,num_tracks,num_notes,note_density,pitch_mean,pitch_std,pitch_range,pitch_min,interval_mean,ascending_ratio,...,note_duration_std,time_between_mean,time_between_std,velocity_mean,velocity_std,velocity_range,most_common_pitch_class,pitch_class_diversity,composer,file_path
0,192.003603,6,1288,6.708207,74.009317,5.957138,28,60,0.009324,0.602953,...,359.59845,165.720062,212.641001,62.0,22.0,44,7,12,Mozart,data/test/Mozart/K617 Adagio.mid
1,150.454395,3,876,5.822362,63.976027,8.695016,45,38,-0.035429,0.509714,...,77.587114,119.972509,121.21326,63.441781,22.567707,63,9,11,Mozart,data/test/Mozart/Sonatina n22 4mov.mid
2,139.194933,12,2295,16.487669,62.36732,9.509114,55,31,-0.013514,0.394507,...,47.79446,75.704774,245.570987,63.420044,23.337227,122,7,12,Mozart,data/test/Mozart/K317 Coronation Mass 5mov.mid
3,171.692322,23,2201,12.819443,60.209905,10.712912,57,29,-0.003636,0.386364,...,228.878148,295.26352,510.849037,64.981372,13.087693,41,2,12,Mozart,data/test/Mozart/K626 Requiem 07 Lacrimosa.mid
4,143.933506,26,4588,31.87583,59.302092,10.435538,51,31,-0.003052,0.463702,...,144.725995,212.417049,706.651479,100.060375,18.240691,85,2,12,Mozart,data/test/Mozart/K626 Requiem kyrie.mid


In [15]:
# Dropping the file path column so that the columns of our dataframe match the shape of our testing dataframe
test_features.drop('file_path', axis=1, inplace=True)


# Mapping composer names to numbers, for scaling and predicting purposes
composer_mapping = {
    'Bach': 0,
    'Beethoven': 1,
    'Chopin': 2,
    'Mozart': 3
}

# Replace the composer names with numbers (modifies the DataFrame in place)
test_features = test_features.replace(composer_mapping)

# Making sure everything worked
test_features.head()

  test_features = test_features.replace(composer_mapping)


Unnamed: 0,duration,num_tracks,num_notes,note_density,pitch_mean,pitch_std,pitch_range,pitch_min,interval_mean,ascending_ratio,...,note_duration_mean,note_duration_std,time_between_mean,time_between_std,velocity_mean,velocity_std,velocity_range,most_common_pitch_class,pitch_class_diversity,composer
0,192.003603,6,1288,6.708207,74.009317,5.957138,28,60,0.009324,0.602953,...,469.959807,359.59845,165.720062,212.641001,62.0,22.0,44,7,12,3
1,150.454395,3,876,5.822362,63.976027,8.695016,45,38,-0.035429,0.509714,...,112.671249,77.587114,119.972509,121.21326,63.441781,22.567707,63,9,11,3
2,139.194933,12,2295,16.487669,62.36732,9.509114,55,31,-0.013514,0.394507,...,55.884532,47.79446,75.704774,245.570987,63.420044,23.337227,122,7,12,3
3,171.692322,23,2201,12.819443,60.209905,10.712912,57,29,-0.003636,0.386364,...,262.523658,228.878148,295.26352,510.849037,64.981372,13.087693,41,2,12,3
4,143.933506,26,4588,31.87583,59.302092,10.435538,51,31,-0.003052,0.463702,...,139.564952,144.725995,212.417049,706.651479,100.060375,18.240691,85,2,12,3


In [16]:
# Assigning x and y as the dataframes for variables and results
x_test = test_features.drop(columns=['composer'])
y_test = test_features['composer']

# Printing the shape of x and y:
print("Shape of x_test:", x_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of x_test: (179, 20)
Shape of y_test: (179,)


In [17]:
# Transforming x_train and to apply standardization
x_test_scaled_array = scaler.transform(x_test)

# Converting the scaled array back into a dataframe so as to calculate individual means and standard deviations
x_test_scaled = pd.DataFrame(x_test_scaled_array, columns=x_test.columns)

# Computing the means and the standard deviations for each column
x_test_column_means = x_test_scaled.mean(axis=0)  
x_test_column_stds = x_test_scaled.std(axis=0) 

# Printing the means and std devs for each column, rounding to 6 digits
for i, (mean, std) in enumerate(zip(x_test_column_means, x_test_column_stds)):
    print(f"Column {i+1}: Mean = {mean:.6f}, Std Dev = {std:.6f}")

Column 1: Mean = 0.018712, Std Dev = 0.793598
Column 2: Mean = 0.027808, Std Dev = 0.983591
Column 3: Mean = 0.064096, Std Dev = 1.070530
Column 4: Mean = 0.026490, Std Dev = 0.940816
Column 5: Mean = -0.051393, Std Dev = 0.908515
Column 6: Mean = 0.056319, Std Dev = 0.959828
Column 7: Mean = 0.071085, Std Dev = 0.963405
Column 8: Mean = -0.098165, Std Dev = 0.925503
Column 9: Mean = 0.056755, Std Dev = 0.635759
Column 10: Mean = -0.007007, Std Dev = 0.917818
Column 11: Mean = -0.045579, Std Dev = 0.952132
Column 12: Mean = -0.055811, Std Dev = 0.912632
Column 13: Mean = -0.056752, Std Dev = 0.946188
Column 14: Mean = -0.028197, Std Dev = 0.920345
Column 15: Mean = -0.001428, Std Dev = 0.999440
Column 16: Mean = 0.035650, Std Dev = 0.955301
Column 17: Mean = 0.083148, Std Dev = 0.995310
Column 18: Mean = 0.043303, Std Dev = 0.981042
Column 19: Mean = -0.017340, Std Dev = 0.999153
Column 20: Mean = 0.090390, Std Dev = 0.871728


Note: since the data for test was not used it fitting the scaler, means and std devs don't come out to exactly 0 or 1, and that's exactly what we want in this case.

At this point we've extracted the features from the audio files, converted them into dataframes, and organized that into test and train datasets with scaled values.  

The next step is to make these dataframes easily accessible in other notebooks, to enable collaboration.

In [18]:
# Saving all the DataFrames as CSVs
x_train_scaled.to_csv('data/x_train_scaled.csv', index=False)
x_test_scaled.to_csv('data/x_test_scaled.csv', index=False)
y_train.to_csv('data/y_train.csv', index=False)
y_test.to_csv('data/y_test.csv', index=False)

print("All DataFrames saved:")
print(f"  x_train_scaled: {x_train_scaled.shape} -> data/x_train_scaled.csv")
print(f"  x_test_scaled: {x_test_scaled.shape} -> data/x_test_scaled.csv") 
print(f"  y_train: {y_train.shape} -> data/y_train.csv")
print(f"  y_test: {y_test.shape} -> data/y_test.csv")

All DataFrames saved:
  x_train_scaled: (480, 20) -> data/x_train_scaled.csv
  x_test_scaled: (179, 20) -> data/x_test_scaled.csv
  y_train: (480,) -> data/y_train.csv
  y_test: (179,) -> data/y_test.csv


# To load the dataframes in new notebooks, run this code:

x_train = pd.read_csv('Data/x_train.csv')
x_test = pd.read_csv('Data/x_test.csv')
y_train = pd.read_csv('Data/y_train.csv')
y_test = pd.read_csv('Data/y_test.csv')

And with that, we're all set - all the necessary dataframes for model training have now been created and are easily importable to new notebooks via CSVs.

Composer encoding for reference purposes:
    'Bach': 1
    'Beethoven': 2
    'Chopin': 3
    'Mozart': 4