### Scores analysis


#### Importing the processed data and a bit of preparation

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Note-level data: one row per note played in a composition
df_scores = pd.read_csv('../data/processed/scores.csv')
# Composition-level metadata: composer, key, tonic, mode, ...
df_meta = pd.read_csv('../data/processed/scores_metadata.csv')

First, we'll add a column holding the interval, in semitones, between each note played and the tonic of the composition. We'll call this new column `relative_note`.  
To build this column, we first need to join the scores data with the scores metadata, because the tonic information is in the latter.  

With this column created, we will know which degrees of the chromatic scale each composition uses. From that, we may determine which scales and modes are used.

In [None]:
# Preview the first rows of the note-level data
df_scores.head(5)

Unnamed: 0,id,start_time,end_time,instrument,note,start_beat,end_beat,note_value
0,2416,10206,19934,61,63,2.0,0.6875,Dotted Eighth
1,2416,10206,19934,61,55,2.0,0.6875,Dotted Eighth
2,2416,24030,31710,61,63,3.0,0.6875,Dotted Eighth
3,2416,24030,31710,61,55,3.0,0.6875,Dotted Eighth
4,2416,34782,42462,61,55,4.0,0.6875,Dotted Eighth


In [None]:
# Preview the first rows of the composition metadata (this is where the tonic lives)
df_meta.head(5)

Unnamed: 0,id,composer,composition,movement,ensemble,seconds,key,tonic,tonic_encoded,mode
0,1727,Schubert,Piano Quintet in A major,2. Andante,Piano Quintet,447,A major,A,9,major
1,1728,Schubert,Piano Quintet in A major,3. Scherzo: Presto,Piano Quintet,251,A major,A,9,major
2,1729,Schubert,Piano Quintet in A major,4. Andantino - Allegretto,Piano Quintet,444,A major,A,9,major
3,1730,Schubert,Piano Quintet in A major,5. Allegro giusto,Piano Quintet,368,A major,A,9,major
4,1733,Schubert,Piano Sonata in A major,2. Andantino,Solo Piano,546,A major,A,9,major


In [None]:
# Joining the notes with their composition's metadata (matched on 'id') and
# keeping only the columns needed for the analysis
df = df_scores.merge(df_meta, how='inner', on='id')
# .copy() makes `df` an independent frame rather than a selection of the merged
# one, so adding or renaming columns on it later does not raise pandas'
# SettingWithCopyWarning
df = df[['id', 'composer', 'instrument', 'start_beat', 'end_beat', 'note_value', 'note', 'tonic_encoded']].copy()
df.head(5)

Unnamed: 0,id,composer,instrument,start_beat,end_beat,note_value,note,tonic_encoded
0,2416,Beethoven,61,2.0,0.6875,Dotted Eighth,63,3
1,2416,Beethoven,61,2.0,0.6875,Dotted Eighth,55,3
2,2416,Beethoven,61,3.0,0.6875,Dotted Eighth,63,3
3,2416,Beethoven,61,3.0,0.6875,Dotted Eighth,55,3
4,2416,Beethoven,61,4.0,0.6875,Dotted Eighth,55,3


In [None]:
# Creating the column 'relative_note'
# Subtract the tonic from every note and take the result modulo 12: this is the
# distance above the tonic in semitones, regardless of the octave used.
df['relative_note'] = df['note'].sub(df['tonic_encoded']).mod(12)
df.head(5)

Unnamed: 0,id,composer,instrument,start_beat,end_beat,note_value,note,tonic_encoded,relative_note
0,2416,Beethoven,61,2.0,0.6875,Dotted Eighth,63,3,0
1,2416,Beethoven,61,2.0,0.6875,Dotted Eighth,55,3,4
2,2416,Beethoven,61,3.0,0.6875,Dotted Eighth,63,3,0
3,2416,Beethoven,61,3.0,0.6875,Dotted Eighth,55,3,4
4,2416,Beethoven,61,4.0,0.6875,Dotted Eighth,55,3,4


In [None]:
# The column 'end_beat' actually shows the duration of the note measured in
# quarter notes, so it is renamed to say so.
# Reassigning the result (instead of inplace=True) leaves the frame shown in
# the earlier cells untouched and keeps this cell safe to re-run.
df = df.rename(columns={'end_beat': 'note_duration'})
df.head(5)

Unnamed: 0,id,composer,instrument,start_beat,note_duration,note_value,note,tonic_encoded,relative_note
0,2416,Beethoven,61,2.0,0.6875,Dotted Eighth,63,3,0
1,2416,Beethoven,61,2.0,0.6875,Dotted Eighth,55,3,4
2,2416,Beethoven,61,3.0,0.6875,Dotted Eighth,63,3,0
3,2416,Beethoven,61,3.0,0.6875,Dotted Eighth,55,3,4
4,2416,Beethoven,61,4.0,0.6875,Dotted Eighth,55,3,4


#### General Overview

Ok, now we've got the `relative_note` column, we can start our analysis.

In [None]:
# Shape (rows, columns) of the note-level dataset now that it is joined with the metadata
df.shape

(1078664, 9)

In [None]:
# Data type of every column in the joined frame
df.dtypes

id                 int64
composer          object
instrument         int64
start_beat       float64
note_duration    float64
note_value        object
note               int64
tonic_encoded      int64
relative_note      int64
dtype: object

In [None]:
# Checking for fully duplicated rows: counts how many rows are flagged True/False
# by duplicated(); only a `False` bucket in the result means there are none
df.duplicated().value_counts()

False    1078664
dtype: int64

In [None]:
# Checking for NaN values: one boolean per column, True if that column has any missing value
df.isna().any()

id               False
composer         False
instrument       False
start_beat       False
note_duration    False
note_value       False
note             False
tonic_encoded    False
relative_note    False
dtype: bool

In [None]:
# General check of numeric variables: count, mean, std, min, quartiles and max per column
df.describe()

Unnamed: 0,id,instrument,start_beat,note_duration,note,tonic_encoded,relative_note
count,1078664.0,1078664.0,1078664.0,1078664.0,1078664.0,1078664.0,1078664.0
mean,2237.593,19.38166,481.642,0.6527726,63.89427,5.234182,5.14342
std,288.1851,23.03962,442.1495,0.8411599,12.66501,3.349286,3.485569
min,1727.0,1.0,0.0,0.008333333,21.0,0.0,0.0
25%,2080.0,1.0,169.0,0.2416667,55.0,3.0,2.0
50%,2320.0,1.0,366.0,0.4895833,64.0,5.0,5.0
75%,2483.0,41.0,665.25,0.875,74.0,9.0,8.0
max,2678.0,74.0,4314.0,102.9792,104.0,11.0,11.0


Given the nature of the dataset, this statistical description is pretty much useless.  

Now, we will do a count of the times each note is played in every composition, and relate it to the total number of notes played in the composition.

In [None]:
# Number of times each relative note is played in each composition:
# one row per (composition id, relative note) pair, with the tally in 'count'
df_composition_notes = (
    df.groupby(['id', 'relative_note'])
    .size()
    .reset_index(name='count')
)
df_composition_notes

Unnamed: 0,id,relative_note,count
0,1727,0,928
1,1727,1,498
2,1727,2,226
3,1727,3,1041
4,1727,4,370
...,...,...,...
3797,2678,7,683
3798,2678,8,103
3799,2678,9,310
3800,2678,10,108


In [None]:
# Total number of notes in each composition, indexed by composition id
s_totalnotes = (
    df_composition_notes
    .groupby('id')['count']
    .sum()
    .rename('total_notes')
)

# Attach each composition's total to its per-note counts, then express every
# count as a share of that total
df_notes_ratio = (
    df_composition_notes
    .merge(s_totalnotes, how='inner', left_on='id', right_index=True)
    .assign(note_ratio=lambda counts: counts['count'] / counts['total_notes'])
)

df_notes_ratio

Unnamed: 0,id,relative_note,count,total_notes,note_ratio
0,1727,0,928,6580,0.141033
1,1727,1,498,6580,0.075684
2,1727,2,226,6580,0.034347
3,1727,3,1041,6580,0.158207
4,1727,4,370,6580,0.056231
...,...,...,...,...,...
3797,2678,7,683,3373,0.202490
3798,2678,8,103,3373,0.030537
3799,2678,9,310,3373,0.091906
3800,2678,10,108,3373,0.032019


In [None]:
# Reshape to one row per composition ('id') and one column per relative note,
# holding 'note_ratio' in the cells. pivot() only reads the three columns named
# here, so no separate column selection is needed.
# pivot() leaves NaN wherever a composition has no row for a relative note;
# a note that is never played has a ratio of 0, not a missing value, so those
# gaps are filled with 0.0.
df_notes_ratio = (
    df_notes_ratio
    .pivot(index='id', columns='relative_note', values='note_ratio')
    .fillna(0.0)
)

df_notes_ratio

relative_note,0,1,2,3,4,5,6,7,8,9,10,11
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1727,0.141033,0.075684,0.034347,0.158207,0.056231,0.112766,0.041945,0.069301,0.103647,0.074012,0.094681,0.038146
1728,0.188278,0.024707,0.111271,0.057710,0.066366,0.143553,0.018936,0.158341,0.039675,0.071416,0.070334,0.049414
1729,0.210852,0.032925,0.052680,0.040300,0.094824,0.202555,0.010931,0.096273,0.065455,0.088898,0.066114,0.038193
1730,0.170947,0.017887,0.117414,0.046506,0.087645,0.132362,0.020570,0.145011,0.034240,0.112431,0.068736,0.046250
1733,0.100576,0.015950,0.084626,0.057599,0.225964,0.046079,0.027027,0.068232,0.060700,0.101019,0.059371,0.152858
...,...,...,...,...,...,...,...,...,...,...,...,...
2632,0.213418,0.007778,0.083131,0.030141,0.083131,0.160914,0.026738,0.127370,0.006320,0.110355,0.134662,0.016043
2633,0.191287,0.011517,0.113170,0.012018,0.112168,0.141713,0.022033,0.177266,0.010516,0.102654,0.035053,0.070606
2659,0.133568,0.007030,0.147627,0.130053,0.017575,0.145870,0.007030,0.142355,0.089631,0.028120,0.087873,0.063269
2677,0.185025,0.025441,0.130385,0.028332,0.087308,0.081816,0.041631,0.210755,0.038161,0.066204,0.019081,0.085863


Now, based on the table generated, we can analyse how often each note is played in each piece.  
Later, we can merge this dataset with the metadata again to do this analysis by composer.

In [None]:
# Attach the per-note ratios to the metadata of each composition: the metadata
# 'id' column is matched against the index of the ratio table
df_composition_ratios = df_meta.merge(
    df_notes_ratio,
    how='inner',
    left_on='id',
    right_index=True,
)
df_composition_ratios

Unnamed: 0,id,composer,composition,movement,ensemble,seconds,key,tonic,tonic_encoded,mode,...,2,3,4,5,6,7,8,9,10,11
0,1727,Schubert,Piano Quintet in A major,2. Andante,Piano Quintet,447,A major,A,9,major,...,0.034347,0.158207,0.056231,0.112766,0.041945,0.069301,0.103647,0.074012,0.094681,0.038146
1,1728,Schubert,Piano Quintet in A major,3. Scherzo: Presto,Piano Quintet,251,A major,A,9,major,...,0.111271,0.057710,0.066366,0.143553,0.018936,0.158341,0.039675,0.071416,0.070334,0.049414
2,1729,Schubert,Piano Quintet in A major,4. Andantino - Allegretto,Piano Quintet,444,A major,A,9,major,...,0.052680,0.040300,0.094824,0.202555,0.010931,0.096273,0.065455,0.088898,0.066114,0.038193
3,1730,Schubert,Piano Quintet in A major,5. Allegro giusto,Piano Quintet,368,A major,A,9,major,...,0.117414,0.046506,0.087645,0.132362,0.020570,0.145011,0.034240,0.112431,0.068736,0.046250
4,1733,Schubert,Piano Sonata in A major,2. Andantino,Solo Piano,546,A major,A,9,major,...,0.084626,0.057599,0.225964,0.046079,0.027027,0.068232,0.060700,0.101019,0.059371,0.152858
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313,2632,Beethoven,Piano Sonata No 10 in G major,2. Andante,Solo Piano,341,G major,G,7,major,...,0.083131,0.030141,0.083131,0.160914,0.026738,0.127370,0.006320,0.110355,0.134662,0.016043
314,2633,Beethoven,Piano Sonata No 10 in G major,3. Scherzo: Allegro assai,Solo Piano,227,G major,G,7,major,...,0.113170,0.012018,0.112168,0.141713,0.022033,0.177266,0.010516,0.102654,0.035053,0.070606
315,2659,Bach,Violin Partita No 1 in B minor,6. Double,Solo Violin,108,B minor,B,11,minor,...,0.147627,0.130053,0.017575,0.145870,0.007030,0.142355,0.089631,0.028120,0.087873,0.063269
316,2677,Beethoven,Piano Sonata No 9 in E major,1. Allegro,Solo Piano,445,E major,E,4,major,...,0.130385,0.028332,0.087308,0.081816,0.041631,0.210755,0.038161,0.066204,0.019081,0.085863


In [None]:
# Defining function to get top -any note ratio- compositions
def top_ratio(n, top=5, ratios=None):
    """Return the compositions with the highest ratio for the column `n`.

    Parameters
    ----------
    n : hashable
        Label of the column to rank by. In this notebook the ratio columns are
        the relative notes 0-11.
    top : int, default 5
        Number of rows to return.
    ratios : pandas.DataFrame, optional
        Frame to rank. It must contain the columns 'id', 'composer',
        'composition', 'movement', 'key' and `n`. When omitted, the notebook's
        `df_composition_ratios` frame is used.

    Returns
    -------
    pandas.DataFrame
        The `top` rows with the largest values of `n`, in descending order,
        restricted to the identifying columns plus `n`.

    Raises
    ------
    KeyError
        If `n` or any identifying column is missing from the frame.
    """
    if ratios is None:
        # Fall back to the notebook-level frame so existing calls keep working
        ratios = df_composition_ratios
    shown_columns = ['id', 'composer', 'composition', 'movement', 'key', n]
    ranked = ratios.sort_values(n, ascending=False)
    return ranked[shown_columns].head(top)

In [None]:
# Top 5 pieces with the highest ratio of the infamous tritone
# (relative note 6, i.e. six semitones above the tonic)
top_tritone = top_ratio(6)
top_tritone

Unnamed: 0,id,composer,composition,movement,key,6
91,2155,Brahms,String Sextet No 1 in B-flat major,"2. Andante, ma moderato",B-flat major,0.138508
25,1772,Schubert,Piano Sonata in D-flat major,2. Andante molto,D-flat major,0.133933
26,1773,Schubert,Piano Sonata in D-flat major,3. Menuetto. Allegretto,D-flat major,0.121648
196,2384,Beethoven,String Quartet No 13 in B-flat major,4. Alla danza tedesca. Allegro assai,B-flat major,0.117588
230,2471,Beethoven,Piano Sonata No 3 in C major,2. Adagio,C major,0.101706


In [None]:
# Top 5 pieces with the highest ratio of the tonic (relative note 0)
top_tonic = top_ratio(0)
top_tonic

Unnamed: 0,id,composer,composition,movement,key,0
60,1932,Dvorak,String Quartet No 12 in F major,3. Molto vivace,F major,0.273794
213,2417,Beethoven,Sextet in E-flat major for Winds,4. Rondo. Allegro,E-flat major,0.2559
252,2506,Beethoven,Octet in E-flat major for Winds,3. Menuetto,E-flat major,0.244253
186,2371,Beethoven,Piano Sonata No 9 in E major,2. Allegretto,E major,0.236264
184,2366,Beethoven,String Quartet No 12 in E-flat major,"2. Adagio, ma non troppo e molto cantabile",E-flat major,0.235078


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=177ea176-1f94-4265-9666-0cca06278d8e' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>