In [None]:
# !python3 preprocess.py ~/polyphemus/lmd_matched/A/A/A preprocessed_16 --n_bars=16 --n_files=100 --n_workers=8

In [145]:
from data import PolyphemusDataset
import torch
from torch_geometric.data import Data


In [2]:
# Open the preprocessed data directory.
# NOTE(review): n_bars=16 presumably has to match the --n_bars value used when
# preprocess.py produced "preprocessed_16/" -- confirm.
dataset = PolyphemusDataset("preprocessed_16/", n_bars=16)

In [18]:
dataset

PolyphemusDataset(533)

In [6]:
# Fetch the first sample via normal subscription instead of calling the
# __getitem__ dunder by hand.
x = dataset[0]

In [10]:
x

Data(edge_index=[2, 1692], edge_attrs=[1692, 33], num_nodes=358, node_features=[358, 4], is_drum=[358], batch=[358], ptr=[17], bars=[358], c_tensor=[358, 16, 230], s_tensor=[16, 4, 32])

In [44]:
def split_into_pairs(input_string):
    """Chop ``input_string`` into consecutive, non-overlapping 2-character chunks.

    Chunks are taken left to right. If the length is odd, the final chunk
    holds the single leftover character; an empty input gives an empty list.
    """
    chunk_starts = range(0, len(input_string), 2)
    return [input_string[start:start + 2] for start in chunk_starts]

In [47]:
# Example structure string: each 2-character token is a phrase label followed
# by one digit (the loop further down reads that digit as the phrase's bar count).
phrases = split_into_pairs("i4A4A4B9b4A4B9b4B9X5o2")

In [54]:
phrases

['i4', 'A4', 'A4', 'B9', 'b4', 'A4', 'B9', 'b4', 'B9', 'X5', 'o2']

In [100]:
# dict.fromkeys keeps first-seen key order, so this de-duplicates the phrase
# tokens while preserving the order in which they first appear.
list(dict.fromkeys(phrases)) 

['i4', 'A4', 'B9', 'b4', 'X5', 'o2']

In [257]:
def get_phrase_edge_type(prev_phrase, curr_phrase):
    """
    Classify the transition between two consecutive phrases.

    The category checks look only at the first character of each argument
    (the phrase-type letter): "i" is an intro, "o" is an outro, any other
    uppercase letter counts as melody and any other lowercase letter as
    non-melody. The "repeated" check compares the full argument strings.
    Rules are evaluated in the order listed below, so an earlier rule wins
    when several apply (e.g. intro -> outro is reported as type 0).

    Edge types:
    0: Intro to Any
    1: Any to Outro
    2: Repeated phrase
    3: Melody to Melody
    4: Melody to Non-Melody
    5: Non-Melody to Melody
    6: Non-Melody to Non-Melody

    Args:
        prev_phrase: Non-empty string describing the source phrase.
        curr_phrase: Non-empty string describing the destination phrase.

    Returns:
        The integer edge type (0-6), or None when no rule matches, e.g. when
        a leading character is neither uppercase nor lowercase.

    Raises:
        IndexError: If either argument is an empty string.
    """

    prev_phrase_type = prev_phrase[0]
    curr_phrase_type = curr_phrase[0]

    if prev_phrase_type == "i":
        return 0
    if curr_phrase_type == "o":
        return 1
    if prev_phrase == curr_phrase:
        return 2

    # Logical `and` (not bitwise `&`) is the idiomatic way to combine booleans.
    if prev_phrase_type.isupper() and curr_phrase_type.isupper():
        return 3
    if prev_phrase_type.isupper() and curr_phrase_type.islower():
        return 4
    if prev_phrase_type.islower() and curr_phrase_type.isupper():
        return 5
    if prev_phrase_type.islower() and curr_phrase_type.islower():
        return 6

    # Neither side could be classified (e.g. a digit or punctuation label).
    return None
    
curr_bar_idx = 0
prev_phrase = -1  # index into type_idx of the last non-repeated phrase; -1 = none yet

# Construct phrase index (tensor of size n_bars, indicating the phrase type per bar).
# Bars of an immediately repeated phrase are marked -1; bars that are never
# reached keep the initial 0 (indistinguishable from phrase index 0).
# NOTE(review): torch.zeros defaults to a floating dtype, so these indices are
# stored as floats -- consider dtype=torch.long if they are used for indexing.
phrase_idxs = torch.zeros(dataset.n_bars)
type_idx = list(dict.fromkeys(phrases))  # unique phrases in first-occurrence order
# One-time lookup table replacing the repeated O(n) type_idx.index(...) scans.
type_lookup = {label: pos for pos, label in enumerate(type_idx)}

# construct phrase edges, indicating connected nodes
phrase_edges = []
edge_types = []

for idx, phrase in enumerate(phrases):
    # Each token is "<type letter><single digit bar count>".
    n_bar_phrase = int(phrase[1])
    phrase_type = phrase[0]
    phrase_node = type_lookup[phrase]
    is_repeated = (prev_phrase == phrase_node)

    phrase_idx = -1 if is_repeated else phrase_node

    # Slicing past the end of the tensor is silently clipped, so a phrase that
    # overruns the window only fills the bars that actually exist.
    phrase_idxs[curr_bar_idx:curr_bar_idx + n_bar_phrase] = phrase_idx
    curr_bar_idx += n_bar_phrase

    if idx == 0:
        # First phrase: nothing to connect from yet.
        prev_phrase = phrase_node
        prev_phrase_type = phrase_type
    elif not is_repeated:
        edge_type = get_phrase_edge_type(prev_phrase_type, phrase_type)
        phrase_edges.append((prev_phrase, phrase_node))
        edge_types.append(edge_type)
        prev_phrase = phrase_node
        prev_phrase_type = phrase_type
    else:
        print("SAME")

    if curr_bar_idx > dataset.n_bars:
        print("EXCEED")
    # Stop as soon as the window is full. Breaking only on a strict overrun
    # would let the next phrase (which owns no bars in this window) add an edge.
    if curr_bar_idx >= dataset.n_bars:
        break
    

SAME
EXCEED


In [258]:
phrases

['i4', 'A4', 'A4', 'B9', 'b4', 'A4', 'B9', 'b4', 'B9', 'X5', 'o2']

In [259]:
phrase_edges

[(0, 1), (1, 2)]

In [260]:
type_idx

['i4', 'A4', 'B9', 'b4', 'X5', 'o2']

In [261]:
# `phrase_idx` is a leftover loop variable from the cell above: it holds the
# value assigned during the last loop iteration that ran.
phrase_idx

2

In [262]:
# Turn the list of (src, dst) pairs into a [2, num_edges] index tensor.
# reshape(-1, 2) keeps the result two-dimensional (shape [2, 0]) even when no
# edges were collected; contiguous() materialises the transposed layout.
phrase_edge_idx = (
    torch.tensor(phrase_edges, dtype=torch.long).reshape(-1, 2).t().contiguous()
)
# Explicit integer dtype so an empty edge list does not fall back to float32.
phrase_edge_attr = torch.tensor(edge_types, dtype=torch.long)

In [263]:
# Bundle the phrase-level graph: connectivity, per-edge type, and the per-bar
# phrase index tensor.
track_g = Data(
    edge_index=phrase_edge_idx,
    edge_attrs=phrase_edge_attr,
    bar_phrase_idx=phrase_idxs,
)

In [264]:
track_g.edge_index

tensor([[0, 1],
        [1, 2]])

In [265]:
track_g.edge_attrs

tensor([0, 3])

In [266]:
track_g.bar_phrase_idx

tensor([ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1., -1., -1., -1., -1.,  2.,  2.,
         2.,  2.])