## Quantifying Centone Content in Arab Andalusian Music using TF-IDF
March - 2019

In [1]:
%load_ext autoreload

%autoreload 2

In [2]:
import sys
sys.path.append('../src/')

import extraction
import model
import persistence
import reporting

In [3]:
from collections import Counter
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Load

Load the Nawba mappings and the recording metadata

In [4]:
# Root folder holding the mapping files, the recording metadata and the scores folder
data_path = '../data/'

In [5]:
# Nawba -> centones mapping
nawba_centones = persistence.load_and_parse_centones_mapping(
    os.path.join(data_path, 'Centones_Nawba.csv')
)

# Tab -> nawba mapping
nawba_tabs = persistence.load_and_parse_nawba_tabs(
    os.path.join(data_path, 'nawba_tabs.json')
)

# Descriptions of the recordings
andalusian_description = pd.read_json(
    os.path.join(data_path, 'andalusian_description.json')
)

### Download

Download all scores from dunya

In [6]:
# The Dunya API token is a credential, so it is read from the environment
# rather than being committed together with the notebook.
# Set it before starting Jupyter, e.g. `export DUNYA_TOKEN=<your token>`.
dunya_token = os.environ.get('DUNYA_TOKEN')
if not dunya_token:
    raise RuntimeError(
        'DUNYA_TOKEN environment variable is not set; '
        'it is required to download the scores from Dunya'
    )

# Folder used as download target and later read back when parsing the scores
scores_path = os.path.join(data_path, 'scores/')

In [7]:
# Download the scores for the recordings in `andalusian_description`,
# using `scores_path` as the target folder
scores = persistence.download_scores(
    andalusian_description,
    dunya_token,
    target_folder=scores_path,
)

In [None]:
# Swap the tab of each score for its nawba, keeping only scores whose tab
# has a nawba mapping. Every entry is [mbid, nawba].
mbid_nawba = [
    [score[0], nawba_tabs[score[1]]]
    for score in scores
    if score[1] in nawba_tabs
]

# mbid -> nawba lookup table
nawba_mbid_lookup = dict(mbid_nawba)

### Pattern Extraction

Load all scores into a stream of notes

In [None]:
# mbid -> note stream for every score that could be parsed
notes_dict = {}
# mbids of the scores that failed to parse and are excluded from the analysis
chord_mbid = []

for mbid, nawba in mbid_nawba:
    this_score_path = os.path.join(scores_path, mbid + '.xml')
    # Parsing is known to fail for scores with chords, but other problems
    # (e.g. a missing or corrupt file) end up here too, so the real error is
    # reported instead of assuming the cause.
    try:
        # NOTE(review): the meaning of the second argument (4) is defined by
        # persistence.pattern_stream_from_score - confirm there.
        note_stream = persistence.pattern_stream_from_score(this_score_path, 4)
        notes_dict[mbid] = note_stream
    except Exception as e:
        print('{} could not be loaded and will be skipped ({}: {})'.format(
            mbid, type(e).__name__, e))
        chord_mbid.append(mbid)

# Scores that remain after dropping the ones that failed to parse
final_scores = [x for x in scores if x[0] not in chord_mbid]

# Snapshot keys and values as lists so both sequences share one fixed order
notes_indices = list(notes_dict.keys())
notes = list(notes_dict.values())

Extract bag of patterns from each score

In [None]:
# Extract the pattern grams of every note stream, in the same order as `notes`
mbid_patterns = [
    extraction.extract_pattern_grams(note_stream, min_n=3, max_n=10)
    for note_stream in notes
]

In [None]:
# Nawba of every loaded score, in the same order as `notes_indices`
full_nawba_indices = [
    nawba_mbid_lookup[mbid]
    for mbid in notes_indices
]

In [None]:
# Number of scores in each nawba, kept for plotting later
nawba_scores = Counter(full_nawba_indices)

### TFIDF

Apply TF-IDF on corpus of bag of patterns

In [None]:
# TF-IDF over the corpus formed by the bags of patterns of all scores
distributions = model.get_tfidf_distributions(mbid_patterns)

Average the TF-IDF of each pattern within each Nawba

In [None]:
# Average the TF-IDF values using the nawba of each score as the grouping label
frame_grouped = model.average_tfidf(distributions, full_nawba_indices)

In [None]:
# Preview the first five entries of the averaged result
frame_grouped[:5]

### Results

Plot analysis results

In [None]:
# Initialise Nawba number to plot
i = 1

Run the next cell repeatedly to step through the Nawbas one at a time; the counter wraps back to 1 after the last Nawba

In [None]:
# Plot the nawba the counter currently points at
nawba = 'Nawba_{}'.format(i)
reporting.get_amins_plot(frame_grouped, nawba, nawba_centones)

# Advance the counter, wrapping back to the first nawba after the last one
i += 1
i = i if i <= len(set(nawba_centones.keys())) else 1

In [None]:
# Colour key for the bars:
#   green - matches amins centones
#   red   - superset of amins centones
#   blue  - undefined centones
nawba = 'Nawba_{}'.format(11)
reporting.get_top_centones_plot(
    frame_grouped,
    nawba,
    nawba_centones,
    nawba_scores[nawba],
    n=13,
    min_freq=50,
)