In this notebook we will do feature analysis but only on the training set, to avoid any leakage.
We will then also prepare and standardize the test set.
The output of this notebook will be the scaled x_train

In [1]:
# Capturing necessary libraries
import mido
import numpy as np
import os
import glob
import pandas as pd
from collections import Counter
from sklearn.preprocessing import StandardScaler

In [2]:
# Loading our test and training folders
composers = ['Mozart', 'Chopin', 'Bach', 'Beethoven']
train_files = {c: glob.glob(f'Data/train/{c}/*.mid*') for c in composers}
test_files = {c: glob.glob(f'Data/test/{c}/*.mid*') for c in composers}

In [3]:
# Creating a function to extract 20 features for every music track

# Note: For the following function, Claude Sonnet 4.0 was used on 7/21/25 to identify the 20 features to extract.
def extract_features(file_path):
   
    try:
        mid = mido.MidiFile(file_path)
        features = {}
        
        # Collecting musical data
        notes = []
        velocities = []
        note_durations = []
        time_between_notes = []
        
        for track in mid.tracks:
            current_time = 0
            note_on_times = {}
            last_note_time = 0
            
            for msg in track:
                current_time += msg.time
                
                if msg.type == 'note_on' and msg.velocity > 0:
                    notes.append(msg.note)
                    velocities.append(msg.velocity)
                    note_on_times[msg.note] = current_time
                    
                    if last_note_time > 0:
                        time_between_notes.append(current_time - last_note_time)
                    last_note_time = current_time
                    
                elif msg.type == 'note_off' or (msg.type == 'note_on' and msg.velocity == 0):
                    if msg.note in note_on_times:
                        duration = current_time - note_on_times[msg.note]
                        note_durations.append(duration)
                        del note_on_times[msg.note]
        
        if not notes:
            return None
        
        # 20 KEY FEATURES FOR COMPOSER CLASSIFICATION
        
        # 1-4: Basic song structure
        features['duration'] = mid.length
        features['num_tracks'] = len(mid.tracks)
        features['num_notes'] = len(notes)
        features['note_density'] = len(notes) / max(mid.length, 1)
        
        # 5-8: Pitch characteristics
        features['pitch_mean'] = np.mean(notes)
        features['pitch_std'] = np.std(notes)
        features['pitch_range'] = max(notes) - min(notes)
        features['pitch_min'] = min(notes)
        
        # 9-11: Melodic movement
        if len(notes) > 1:
            intervals = [notes[i+1] - notes[i] for i in range(len(notes)-1)]
            features['interval_mean'] = np.mean(intervals)
            features['ascending_ratio'] = sum(1 for x in intervals if x > 0) / len(intervals)
            features['large_leaps_ratio'] = sum(1 for x in intervals if abs(x) > 4) / len(intervals)
        else:
            features['interval_mean'] = 0
            features['ascending_ratio'] = 0
            features['large_leaps_ratio'] = 0
        
        # 12-15: Rhythm and timing
        if note_durations:
            features['note_duration_mean'] = np.mean(note_durations)
            features['note_duration_std'] = np.std(note_durations)
        else:
            features['note_duration_mean'] = 0
            features['note_duration_std'] = 0
            
        if time_between_notes:
            features['time_between_mean'] = np.mean(time_between_notes)
            features['time_between_std'] = np.std(time_between_notes)
        else:
            features['time_between_mean'] = 0
            features['time_between_std'] = 0
        
        # 16-18: Dynamics (velocity)
        if velocities:
            features['velocity_mean'] = np.mean(velocities)
            features['velocity_std'] = np.std(velocities)
            features['velocity_range'] = max(velocities) - min(velocities)
        else:
            features['velocity_mean'] = 0
            features['velocity_std'] = 0
            features['velocity_range'] = 0
        
        # 19-20: Harmonic content (simplified)
        pitch_classes = [note % 12 for note in notes]
        features['most_common_pitch_class'] = Counter(pitch_classes).most_common(1)[0][0]
        features['pitch_class_diversity'] = len(set(pitch_classes))
        
        return features
        
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Test the function
print("Testing 20-feature extraction...")
test_features = extract_features(train_files['Mozart'][0])
print(f"Number of features extracted: {len(test_features)}")
print("\nAll 20 features:")
for i, (key, value) in enumerate(test_features.items(), 1):
    print(f"{i:2d}. {key}: {value}")

Testing 20-feature extraction...
Number of features extracted: 20

All 20 features:
 1. duration: 404.11327912499996
 2. num_tracks: 36
 3. num_notes: 7009
 4. note_density: 17.3441467085074
 5. pitch_mean: 62.9012697959766
 6. pitch_std: 11.64409975154554
 7. pitch_range: 58
 8. pitch_min: 29
 9. interval_mean: -0.005136986301369863
10. ascending_ratio: 0.3550228310502283
11. large_leaps_ratio: 0.2605593607305936
12. note_duration_mean: 279.84448566129265
13. note_duration_std: 454.23803400632664
14. time_between_mean: 399.4057991715469
15. time_between_std: 1465.0511718604812
16. velocity_mean: 77.23084605507205
17. velocity_std: 12.626026081479488
18. velocity_range: 74
19. most_common_pitch_class: 3
20. pitch_class_diversity: 12


In [4]:
# Now applying feature extraction to all training files
print("Extracting features from all training files...")

all_features = []
failed_files = []

for composer, files in train_files.items():
    print(f"Processing {composer}: {len(files)} files")
    successful = 0
    
    for file_path in files:
        features = extract_features(file_path)
        
        if features:
            features['composer'] = composer
            features['file_path'] = file_path
            all_features.append(features)
            successful += 1
        else:
            failed_files.append(file_path)
    
    print(f"  Successfully processed: {successful}/{len(files)} files")

print(f"\nOverall Results:")
print(f"Total files processed successfully: {len(all_features)}")
print(f"Total files failed: {len(failed_files)}")

if failed_files:
    print(f"Failed files: {len(failed_files)}")

# Showing summary by composer
print(f"\nFiles per composer:")
composer_counts = {}
for feature_dict in all_features:
    composer = feature_dict['composer']
    composer_counts[composer] = composer_counts.get(composer, 0) + 1

for composer, count in composer_counts.items():
    print(f"  {composer}: {count} files")

Extracting features from all training files...
Processing Mozart: 86 files
  Successfully processed: 86/86 files
Processing Chopin: 132 files
  Successfully processed: 132/132 files
Processing Bach: 136 files
  Successfully processed: 136/136 files
Processing Beethoven: 127 files
Error processing Data/train/Beethoven/Anhang 14-3.mid: Could not decode key with 3 flats and mode 255
  Successfully processed: 126/127 files

Overall Results:
Total files processed successfully: 480
Total files failed: 1
Failed files: 1

Files per composer:
  Mozart: 86 files
  Chopin: 132 files
  Bach: 136 files
  Beethoven: 126 files


In [5]:
# Converting the list of feature dictionaries into a pandas DataFrame
training_features = pd.DataFrame(all_features)
print(f"Created DataFrame with {len(training_features)} rows and {len(training_features.columns)} columns")

# Checking out the first few rows
training_features.head()

Created DataFrame with 480 rows and 22 columns


Unnamed: 0,duration,num_tracks,num_notes,note_density,pitch_mean,pitch_std,pitch_range,pitch_min,interval_mean,ascending_ratio,...,note_duration_std,time_between_mean,time_between_std,velocity_mean,velocity_std,velocity_range,most_common_pitch_class,pitch_class_diversity,composer,file_path
0,404.113279,36,7009,17.344147,62.90127,11.6441,58,29,-0.005137,0.355023,...,454.238034,399.405799,1465.051172,77.230846,12.626026,74,3,12,Mozart,Data/train/Mozart/K495 Horn Concerto n4 1mov.mid
1,361.846184,19,3536,9.772108,61.033654,11.257112,57,29,-0.00396,0.340028,...,67.085447,150.936968,472.118553,52.501697,7.073507,39,5,12,Mozart,Data/train/Mozart/K626 Requiem 05 Recordare.mid
2,304.218882,15,10671,35.076718,67.302221,10.670214,51,40,0.001874,0.38313,...,191.262459,103.185477,577.799561,105.596102,17.491686,81,2,12,Mozart,Data/train/Mozart/K492 Overture ''Le Nozze di ...
3,388.019697,26,4675,12.048357,62.305455,9.054603,45,41,-0.002567,0.391314,...,141.012365,268.308751,1587.954269,80.532834,19.465597,80,7,12,Mozart,Data/train/Mozart/K427 Great Mass 1mov.mid
4,626.371366,26,9047,14.443508,66.197082,11.380628,58,31,-0.004422,0.465399,...,537.543711,797.468186,3914.609658,87.414944,14.379838,64,0,12,Mozart,Data/train/Mozart/K299 Flute Harp Concerto 3mo...


In [6]:
# Dropping the file path column (perhaps unnecessarily) to prevent contamination during training
training_features.drop('file_path', axis=1, inplace=True)

# Create a mapping from composer names to numbers for later scaling purposes
composer_mapping = {
    'Bach': 1,
    'Beethoven': 2,
    'Chopin': 3,
    'Mozart': 4
}

# Replacing the composer names with their numbers
training_features = training_features.replace(composer_mapping)
training_features.head()

  training_features = training_features.replace(composer_mapping)


Unnamed: 0,duration,num_tracks,num_notes,note_density,pitch_mean,pitch_std,pitch_range,pitch_min,interval_mean,ascending_ratio,...,note_duration_mean,note_duration_std,time_between_mean,time_between_std,velocity_mean,velocity_std,velocity_range,most_common_pitch_class,pitch_class_diversity,composer
0,404.113279,36,7009,17.344147,62.90127,11.6441,58,29,-0.005137,0.355023,...,279.844486,454.238034,399.405799,1465.051172,77.230846,12.626026,74,3,12,4
1,361.846184,19,3536,9.772108,61.033654,11.257112,57,29,-0.00396,0.340028,...,84.794908,67.085447,150.936968,472.118553,52.501697,7.073507,39,5,12,4
2,304.218882,15,10671,35.076718,67.302221,10.670214,51,40,0.001874,0.38313,...,113.117326,191.262459,103.185477,577.799561,105.596102,17.491686,81,2,12,4
3,388.019697,26,4675,12.048357,62.305455,9.054603,45,41,-0.002567,0.391314,...,134.89955,141.012365,268.308751,1587.954269,80.532834,19.465597,80,7,12,4
4,626.371366,26,9047,14.443508,66.197082,11.380628,58,31,-0.004422,0.465399,...,396.463037,537.543711,797.468186,3914.609658,87.414944,14.379838,64,0,12,4


In [7]:
# Assigning x and y as the dataframes for variables and results
x_train = training_features.drop(columns=['composer'])
y_train = training_features['composer']

# Printing the shape of x and y:
print("Shape of x_train:", x_train.shape)
print("Shape of y_train:", y_train.shape)

Shape of x_train: (480, 20)
Shape of y_train: (480,)


In [8]:
# Creating a StandardScaler object
scaler = StandardScaler()

# Fitting the scaler to the dataframe x_train
scaler.fit(x_train)

# Transforming x_train and to apply standardization
x_train_scaled_array = scaler.transform(x_train)

# Converting the scaled array back into a dataframe so as to calculate individual means and standard deviations
x_train_scaled = pd.DataFrame(x_train_scaled_array, columns=x_train.columns)

# Computing the means and the standard deviations for each column
x_train_column_means = x_train_scaled.mean(axis=0)  
x_train_column_stds = x_train_scaled.std(axis=0) 

# Printing the means and std devs for each column, rounding to 6 digits
for i, (mean, std) in enumerate(zip(x_train_column_means, x_train_column_stds)):
    print(f"Column {i+1}: Mean = {mean:.6f}, Std Dev = {std:.6f}")

Column 1: Mean = 0.000000, Std Dev = 1.001043
Column 2: Mean = -0.000000, Std Dev = 1.001043
Column 3: Mean = -0.000000, Std Dev = 1.001043
Column 4: Mean = 0.000000, Std Dev = 1.001043
Column 5: Mean = -0.000000, Std Dev = 1.001043
Column 6: Mean = -0.000000, Std Dev = 1.001043
Column 7: Mean = -0.000000, Std Dev = 1.001043
Column 8: Mean = 0.000000, Std Dev = 1.001043
Column 9: Mean = 0.000000, Std Dev = 1.001043
Column 10: Mean = -0.000000, Std Dev = 1.001043
Column 11: Mean = -0.000000, Std Dev = 1.001043
Column 12: Mean = 0.000000, Std Dev = 1.001043
Column 13: Mean = -0.000000, Std Dev = 1.001043
Column 14: Mean = 0.000000, Std Dev = 1.001043
Column 15: Mean = 0.000000, Std Dev = 1.001043
Column 16: Mean = -0.000000, Std Dev = 1.001043
Column 17: Mean = 0.000000, Std Dev = 1.001043
Column 18: Mean = -0.000000, Std Dev = 1.001043
Column 19: Mean = -0.000000, Std Dev = 1.001043
Column 20: Mean = -0.000000, Std Dev = 1.001043


Looks like we're all set for the training data. Now to apply to same standardization to the test data.

In [9]:
# Applying feature extraction to all test files
print("Extracting features from all test files...")
all_test_features = []
failed_test_files = []

for composer, files in test_files.items():
    print(f"Processing {composer}: {len(files)} files")
    successful = 0
    
    for file_path in files:
        features = extract_features(file_path)
        
        if features:
            features['composer'] = composer
            features['file_path'] = file_path
            all_test_features.append(features)
            successful += 1
        else:
            failed_test_files.append(file_path)
    
    print(f"  Successfully processed: {successful}/{len(files)} files")

print(f"\nOverall Test Results:")
print(f"Total test files processed successfully: {len(all_test_features)}")
print(f"Total test files failed: {len(failed_test_files)}")

if failed_test_files:
    print(f"Failed test files: {len(failed_test_files)}")

# Show summary by composer for test files
print(f"\nTest files per composer:")
test_composer_counts = {}
for feature_dict in all_test_features:
    composer = feature_dict['composer']
    test_composer_counts[composer] = test_composer_counts.get(composer, 0) + 1

for composer, count in test_composer_counts.items():
    print(f"  {composer}: {count} files")

Extracting features from all test files...
Processing Mozart: 32 files
  Successfully processed: 32/32 files
Processing Chopin: 51 files
  Successfully processed: 51/51 files
Processing Bach: 52 files
  Successfully processed: 52/52 files
Processing Beethoven: 44 files
  Successfully processed: 44/44 files

Overall Test Results:
Total test files processed successfully: 179
Total test files failed: 0

Test files per composer:
  Mozart: 32 files
  Chopin: 51 files
  Bach: 52 files
  Beethoven: 44 files


In [12]:
# Converting the list of test feature dictionaries into a DataFrame
test_features = pd.DataFrame(all_test_features)
print(f"Created DataFrame with {len(test_features)} rows and {len(test_features.columns)} columns")

# Checking out the first few rows
test_features.head()

Created DataFrame with 179 rows and 22 columns


Unnamed: 0,duration,num_tracks,num_notes,note_density,pitch_mean,pitch_std,pitch_range,pitch_min,interval_mean,ascending_ratio,...,note_duration_std,time_between_mean,time_between_std,velocity_mean,velocity_std,velocity_range,most_common_pitch_class,pitch_class_diversity,composer,file_path
0,388.019697,26,4675,12.048357,62.305455,9.054603,45,41,-0.002567,0.391314,...,141.012365,268.308751,1587.954269,80.532834,19.465597,80,7,12,Mozart,Data/test/Mozart/K427 Great Mass 1mov.mid
1,347.866891,12,5642,16.218847,62.069124,13.302596,62,24,-0.009218,0.352243,...,195.614212,327.272727,491.494364,73.289614,17.104226,124,0,12,Mozart,Data/test/Mozart/K525 Serenade 2mov ''Eine Kle...
2,335.482926,12,5913,17.625338,68.726197,10.876486,58,33,-0.006089,0.407815,...,121.509799,204.685075,964.554549,80.757314,26.190655,65,2,12,Mozart,Data/test/Mozart/K314 Flute Concerto n2 3mov.mid
3,210.008333,8,1535,7.309234,61.420847,9.209404,43,36,-0.012386,0.458931,...,103.362871,63.248532,76.976976,86.591531,8.578167,70,2,12,Mozart,Data/test/Mozart/K401 Fuga in G minor .mid
4,204.782841,12,1728,8.438207,61.839699,10.163097,57,29,-0.001737,0.407064,...,183.944581,231.283547,638.500666,84.270255,9.173949,50,0,12,Mozart,Data/test/Mozart/K335 Laudate Dominus.mid


In [13]:
# Dropping the file path column so that the columns of our dataframe match the shape of our testing dataframe
test_features.drop('file_path', axis=1, inplace=True)


# Mapping composer names to numbers, for scaling and predicting purposes
composer_mapping = {
    'Bach': 1,
    'Beethoven': 2,
    'Chopin': 3,
    'Mozart': 4
}

# Replace the composer names with numbers (modifies the DataFrame in place)
test_features = test_features.replace(composer_mapping)

# Making sure everything worked
test_features.head()

  test_features = test_features.replace(composer_mapping)


Unnamed: 0,duration,num_tracks,num_notes,note_density,pitch_mean,pitch_std,pitch_range,pitch_min,interval_mean,ascending_ratio,...,note_duration_mean,note_duration_std,time_between_mean,time_between_std,velocity_mean,velocity_std,velocity_range,most_common_pitch_class,pitch_class_diversity,composer
0,388.019697,26,4675,12.048357,62.305455,9.054603,45,41,-0.002567,0.391314,...,134.89955,141.012365,268.308751,1587.954269,80.532834,19.465597,80,7,12,4
1,347.866891,12,5642,16.218847,62.069124,13.302596,62,24,-0.009218,0.352243,...,203.423609,195.614212,327.272727,491.494364,73.289614,17.104226,124,0,12,4
2,335.482926,12,5913,17.625338,68.726197,10.876486,58,33,-0.006089,0.407815,...,80.683579,121.509799,204.685075,964.554549,80.757314,26.190655,65,2,12,4
3,210.008333,8,1535,7.309234,61.420847,9.209404,43,36,-0.012386,0.458931,...,105.365385,103.362871,63.248532,76.976976,86.591531,8.578167,70,2,12,4
4,204.782841,12,1728,8.438207,61.839699,10.163097,57,29,-0.001737,0.407064,...,145.713955,183.944581,231.283547,638.500666,84.270255,9.173949,50,0,12,4


In [14]:
# Assigning x and y as the dataframes for variables and results
x_test = test_features.drop(columns=['composer'])
y_test = test_features['composer']

# Printing the shape of x and y:
print("Shape of x_test:", x_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of x_test: (179, 20)
Shape of y_test: (179,)


In [15]:
# Transforming x_train and to apply standardization
x_test_scaled_array = scaler.transform(x_test)

# Converting the scaled array back into a dataframe so as to calculate individual means and standard deviations
x_test_scaled = pd.DataFrame(x_test_scaled_array, columns=x_test.columns)

# Computing the means and the standard deviations for each column
x_test_column_means = x_test_scaled.mean(axis=0)  
x_test_column_stds = x_test_scaled.std(axis=0) 

# Printing the means and std devs for each column, rounding to 6 digits
for i, (mean, std) in enumerate(zip(x_test_column_means, x_test_column_stds)):
    print(f"Column {i+1}: Mean = {mean:.6f}, Std Dev = {std:.6f}")

Column 1: Mean = 0.018712, Std Dev = 0.793598
Column 2: Mean = 0.027808, Std Dev = 0.983591
Column 3: Mean = 0.064096, Std Dev = 1.070530
Column 4: Mean = 0.026490, Std Dev = 0.940816
Column 5: Mean = -0.051393, Std Dev = 0.908515
Column 6: Mean = 0.056319, Std Dev = 0.959828
Column 7: Mean = 0.071085, Std Dev = 0.963405
Column 8: Mean = -0.098165, Std Dev = 0.925503
Column 9: Mean = 0.056755, Std Dev = 0.635759
Column 10: Mean = -0.007007, Std Dev = 0.917818
Column 11: Mean = -0.045579, Std Dev = 0.952132
Column 12: Mean = -0.055811, Std Dev = 0.912632
Column 13: Mean = -0.056752, Std Dev = 0.946188
Column 14: Mean = -0.028197, Std Dev = 0.920345
Column 15: Mean = -0.001428, Std Dev = 0.999440
Column 16: Mean = 0.035650, Std Dev = 0.955301
Column 17: Mean = 0.083148, Std Dev = 0.995310
Column 18: Mean = 0.043303, Std Dev = 0.981042
Column 19: Mean = -0.017340, Std Dev = 0.999153
Column 20: Mean = 0.090390, Std Dev = 0.871728


Note: since the data for test was not used it fitting the scaler, means and std devs don't come out to exactly 0 or 1, and that's exactly what we want in this case.

At this point we've extracted the features from the audio files, converted them into dataframes, and organized that into test and train datasets with scaled values.  

The next step is to make these dataframes easily accessible in other notebooks, to enable collaboration.

In [16]:
# Saving all the DataFrames as CSVs
x_train_scaled.to_csv('Data/x_train_scaled.csv', index=False)
x_test_scaled.to_csv('Data/x_test_scaled.csv', index=False)
y_train.to_csv('Data/y_train.csv', index=False)
y_test.to_csv('Data/y_test.csv', index=False)

print("All DataFrames saved:")
print(f"  x_train_scaled: {x_train_scaled.shape} -> Data/x_train_scaled.csv")
print(f"  x_test_scaled: {x_test_scaled.shape} -> Data/x_test_scaled.csv") 
print(f"  y_train: {y_train.shape} -> Data/y_train.csv")
print(f"  y_test: {y_test.shape} -> Data/y_test.csv")

All DataFrames saved:
  x_train_scaled: (480, 20) -> Data/x_train_scaled.csv
  x_test_scaled: (179, 20) -> Data/x_test_scaled.csv
  y_train: (480,) -> Data/y_train.csv
  y_test: (179,) -> Data/y_test.csv


# To load the dataframes in new notebooks, run this code:

x_train = pd.read_csv('Data/x_train.csv')
x_test = pd.read_csv('Data/x_test.csv')
y_train = pd.read_csv('Data/y_train.csv')
y_test = pd.read_csv('Data/y_test.csv')

And with that, we're all set - all the necessary dataframes for model training have now been created and are easily importable to new notebooks via CSVs.

Composer encoding for reference purposes:
    'Bach': 1
    'Beethoven': 2
    'Chopin': 3
    'Mozart': 4