# Chord Progression Frequency Analysis
This python notebook will look into the frequency of chord progressions in a composer's repertoire.

## Sequence Dictionary
The first step is to determine the frequency of an n-long sequence in a text file formatted as in the List Generator notebook.

In [25]:
def build_n_sequence_dictionary(n : int, txt : str) -> dict:
    """
    Given a length n, returns a dictionary containing sequences
    in txt of length n mapped to the number of times they appear
    in txt.
    """
    seq_dict = dict()
    
    blocks = txt.split('\n')
    for block in blocks:
        elems = block.split(',')
        for i in range(len(elems) - n):
            seq = elems[i:i+n]
            seq_dict[tuple(seq)] = seq_dict.get(tuple(seq), 0) + 1
            
    return seq_dict            

In [30]:
# test build_n_sequence_dictionary()

# The following text represents corpus.parse('monteverdi/madrigal.3.1.rntxt')
test_txt = "vi,V,I,IV,I,V,V,i,V,i,VI,V,i,V,i,i,i,I,IV,ii,vii,I,vi,I,I,IV,vii,I,vi,vi,I,IV,IV,I,I,IV,ii,V,I,V,V,ii,ii,vi,IV,V,I,I,V,V,ii,VI,VI,iv,V,V,i,V,I,V,IV,I,vi,V,I,I,i,i,VII,VII,iv,VI,i,V,i,ii,V,I,vi,V,vi,V,vi,i,V,I,V,ii,V,I,V,vi,i,i,V,i,V,I,i,V,i,V,i,V,I,ii,I,ii,V,I,I,vi,V,II,V,I,IV,ii,vii,V,i,V,I,i,V,i,V,I,ii,I,ii,V,I,ii,I,ii,V,I,I,vi,vi,II,V,I,vi,V,i,V,I,i,i,i,i,i,ii,V,i,V,i,i,i,v,IV,III,V,i,V,i,i,II,III,v,IV,i,vii,i,i,V,V,V,I,vii,I,I,vii,I,V,V,i,VII,v,i,V,V,i,V,I,I,V,vi,V,IV,ii,V,i,V,i,I,IV,IV,I,ii,I,ii,V,i,V,i,V,vi,ii,V,IV,vii,iii,IV,V,V,i,V,iii,vi,ii,V,I,I,vi,ii,I,IV,V,I,V,I,IV,V,vi,iii,i,V,i,VII,v,iv,III,iv,iv,III,I,ii,v,i,V,I"

seq_dict = build_n_sequence_dictionary(3, test_txt)
# print(seq_dict)
print(sorted(seq_dict.items(), key=lambda kv:(kv[1], kv[0]), reverse=True))

[(('V', 'i', 'V'), 16), (('i', 'V', 'i'), 12), (('i', 'V', 'I'), 8), (('ii', 'V', 'I'), 7), (('V', 'I', 'I'), 6), (('i', 'i', 'i'), 5), (('V', 'V', 'i'), 5), (('V', 'I', 'V'), 5), (('ii', 'I', 'ii'), 4), (('I', 'vi', 'V'), 4), (('I', 'ii', 'V'), 4), (('I', 'ii', 'I'), 4), (('I', 'V', 'V'), 4), (('ii', 'V', 'i'), 3), (('V', 'i', 'i'), 3), (('V', 'I', 'ii'), 3), (('V', 'I', 'i'), 3), (('V', 'I', 'IV'), 3), (('I', 'IV', 'ii'), 3), (('I', 'I', 'vi'), 3), (('vii', 'I', 'vi'), 2), (('vi', 'ii', 'V'), 2), (('vi', 'V', 'vi'), 2), (('vi', 'V', 'I'), 2), (('v', 'i', 'V'), 2), (('i', 'ii', 'V'), 2), (('i', 'i', 'V'), 2), (('i', 'VII', 'v'), 2), (('i', 'V', 'V'), 2), (('i', 'I', 'IV'), 2), (('V', 'vi', 'i'), 2), (('V', 'vi', 'V'), 2), (('V', 'i', 'VII'), 2), (('V', 'V', 'ii'), 2), (('V', 'I', 'vi'), 2), (('IV', 'ii', 'vii'), 2), (('IV', 'ii', 'V'), 2), (('IV', 'V', 'I'), 2), (('IV', 'IV', 'I'), 2), (('II', 'V', 'I'), 2), (('I', 'vii', 'I'), 2), (('I', 'vi', 'vi'), 2), (('I', 'i', 'i'), 2), (('I', 

## Using the Datasets

In [33]:
def get_dataset(composer: str, with_inversions=False) -> str:
    '''
    Given a composer and a flag for inversions, returns the correct
    dataset containing the composer's repertoire as a string.
    
    If with_inversions is True, the dataset returned will contain
    inversions as (0), (1), etc. after each roman numeral, where
    (0) is the root position.
    '''
    if composer == 'bach':
        if with_inversions:
            return open("inv-dataset-bach.txt","r").read()
        else:
            return open("simple-dataset-bach.txt","r").read()
    elif composer == 'monteverdi':
        if with_inversions:
            return open("inv-dataset-monteverdi.txt","r").read()
        else:
            return open("simple-dataset-monteverdi.txt","r").read()
    else:
        raise NotImplementedError("The composer you've entered is not in the database.")

### Method 1 - Counting and getting an overall percentage
For this function, we count the number of times the given progression appears in the composer's repertoire and calculate a percentage based on all n-long chord progressions used in their repertoire.

In [44]:
def progression_count_and_percent(prog: str, composer: str, with_inversions=False, as_msg=False):
    '''
    Given a chord progression formatted as roman numerals separated by commas
    (if with_inversions=True, parentheses containing the inversion number, 0 
    being the root position, follow each roman numeral) and a composer 
    (currently 'bach' and 'monteverdi' are the only ones supported),
    returns a tuple containing the number of times the given progression
    appears in the composer's repertoire and the percentage used based on
    all n-long progressions. 
    
    If as_msg is True, returns a string containing information in English instead.
    '''
    # removes any whitespace
    prog = prog.replace(' ', '')

    txt = get_dataset(composer, with_inversions)
        
    # creates the dictionary given the progression
    prog_list = tuple(prog.split(','))
    seq_dict = build_n_sequence_dictionary(len(prog_list), txt)
    total_progs = sum(seq_dict.values())
    num_times = seq_dict.get(prog_list, 0)
    
    if as_msg:
        msg = ''
        if num_times == 0:
            msg = composer + ' never uses [' + prog + '] in their repertoire.'
        if num_times == 1:
            msg = composer + ' uses [' + prog + '] ' + str(num_times) + ' time in their repertoire.\n'
        else: 
            msg = composer + ' uses [' + prog + '] ' + str(num_times) + ' times in their repertoire.\n'
        msg += 'Compared to other length-' + str(len(prog_list)) + ' progressions, [' + prog + '] is used {:.2f}'.format(num_times/total_progs*100) + '% of the time.'
        return msg
    return num_times, num_times/total_progs

In [45]:
# test without inversion
prog = 'V,I'
print(progression_count_and_percent(prog, 'monteverdi', as_msg=True))

monteverdi uses [V,I] 446 times in their repertoire.
Compared to other length-2 progressions, [V,I] is used 6.74% of the time.


In [90]:
# test with inversion
prog = 'V(0),I(0)'
print(progression_count_and_percent(prog, 'monteverdi', with_inversions=True, as_msg=True))

monteverdi uses [V(0),I(0)] 312 times in their repertoire.
Compared to other length-2 progressions, [V(0),I(0)] is used 4.72% of the time.


### Method 2 - Provide percentages at every chord
For this function, we output the percentage of times a composer uses each chord in the progression given the previous chords in the progression.

In [87]:
def progression_probability(prog: str, composer: str, with_inversions=False, as_msg=False):
    '''
    Given a chord progression formatted as roman numerals separated by commas
    (if with_inversions=True, parentheses containing the inversion number, 0 
    being the root inversion, follow each roman numeral) and a composer 
    (currently 'bach' and 'monteverdi' are the only ones supported),
    returns a list containing the probability that the i-th chord in the
    progression follows the i-1 chords that precede it using its frequency
    in the composer's repertoire.
    
    If as_msg is True, returns a string containing information in English instead.
    '''
    # removes any whitespace
    prog = prog.replace(' ', '')
    
    txt = get_dataset(composer, with_inversions)    
    
    prog_prob = [] # empty list to contain tuples of chord progression and probabilities
    prog_list = tuple(prog.split(','))
    # creates the dictionary given the progression
    seq_dict = build_n_sequence_dictionary(len(prog_list), txt) 
    
    # values to be modified as i-1 chords in progression are given
    total_progs = sum(seq_dict.values())
    chords_to_check = seq_dict.keys()
    
    # flag used when progression is not used in repertoire
    shortcut = False
    
    for i in range(len(prog_list)):
        if shortcut:
            prog_prob.append((prog_list[:i+1], 0.))
            continue
            
        same_start_chords = list(filter(lambda a: (a[:i+1] == prog_list[:i+1]), chords_to_check))
        
        # count number of same_start_chords in repertoire
        progs = 0
        for chord in same_start_chords:
            progs += seq_dict[chord]
        if progs == 0:
#             print(chords_to_check) # shows actual chords that follow
            shortcut = True
        
        # add probability to list
        prog_prob.append((prog_list[:i+1], progs/total_progs))
        # update values assuming i-1 chords are given
        total_progs = progs
        chords_to_check = same_start_chords
    
    if as_msg:
        msg = ''
        
        # msg for first chord in progression
        msg += composer + ' starts a progression with [' + prog_list[0] + \
                '] {:.2f}'.format(prog_prob[0][1]*100) + '% of the time.'
        for i in range(1, len(prog_list)):
            msg += '\n' + composer + ' follows [' + ','.join([chord for chord in prog_list[:i]]) + \
                    '] with [' + prog_list[i] + '] {:.2f}'.format(prog_prob[i][1]*100) + '% of the time.'
        return msg
    return prog_prob

In [88]:
# test without inversion
prog = 'V,I,IV,V,vi,iii,V'
print(progression_probability(prog, 'monteverdi', as_msg=True))

monteverdi starts a progression with [V] 22.73% of the time.
monteverdi follows [V] with [I] 30.12% of the time.
monteverdi follows [V,I] with [IV] 14.19% of the time.
monteverdi follows [V,I,IV] with [V] 29.03% of the time.
monteverdi follows [V,I,IV,V] with [vi] 11.11% of the time.
monteverdi follows [V,I,IV,V,vi] with [iii] 50.00% of the time.
monteverdi follows [V,I,IV,V,vi,iii] with [V] 0.00% of the time.


In [59]:
# test with inversion
prog = 'V(0),I(0)'
print(progression_probability(prog, 'monteverdi', with_inversions=True, as_msg=True))

monteverdi starts a progression with [V(0)] 19.96% of the time.
monteverdi follows [V(0)] with [I(0)] 23.64% of the time.


## Stretch Goals
* automate roman numeral analysis for music21 parseable composers
* return locations of matching progressions in scores (highlight when .show()))