In [1]:
from pathlib import Path
from itertools import product

import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split

from myutils import log
from myutils.json_tools import load_json, save_json

In [3]:
# Project layout, resolved relative to the notebook's working directory.
ROOT_DIR = Path('..')
DATA_DIR = ROOT_DIR / 'data'

# parents=True so a fresh checkout without ../data does not fail here with
# FileNotFoundError before any data has been loaded.
TRAIN_DATA_DIR = DATA_DIR / 'train_data'
TRAIN_DATA_DIR.mkdir(parents=True, exist_ok=True)

MODELS_DIR = ROOT_DIR / 'models'
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Special vocabulary tokens: padding, beginning-of-sequence, end-of-sequence.
# They are multi-character strings, so they cannot collide with the
# single-character symbols taken from the tunes.
PAD = 'PAD'
BOS = 'BOS'
EOS = 'EOS'

# Number of tokens in each input window fed to the model.
SEQUENCE_LENGTH = 70

# Tune categories of interest.
# NOTE(review): presumably the (type, mode) pairs to train on - not used in the visible cells, confirm.
TRAINABLE_TYPES = ['reel', 'jig', 'polka', 'waltz', 'hornpipe', 'slip jig']
TRAINABLE_MODES = ['maj', 'min', 'dor', 'mix']

# Training hyper-parameters.
# NOTE(review): not used in the visible cells - confirm where they are consumed.
EPOCHS = 15
BATCH_SIZE = 64

In [6]:
# Load the contents of tunes_merged.json into a DataFrame and print its
# column summary.
tunes_path = DATA_DIR / 'tunes_merged.json'
tunes = load_json(tunes_path)
tunes_df = pd.DataFrame(data=tunes)
tunes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35541 entries, 0 to 35540
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tune            35541 non-null  object
 1   setting         35541 non-null  object
 2   name            35541 non-null  object
 3   type            35541 non-null  object
 4   meter           35541 non-null  object
 5   mode            35541 non-null  object
 6   abc             35541 non-null  object
 7   date            35541 non-null  object
 8   username        35541 non-null  object
 9   id              35541 non-null  int64 
 10  abc_transposed  35541 non-null  object
 11  length          35541 non-null  int64 
 12  key             35541 non-null  object
dtypes: int64(2), object(11)
memory usage: 3.5+ MB


In [7]:
# Drop unusually long tunes: keep rows whose length is at most
# median + 1.5 * (75th percentile) of the length column.
# NOTE(review): this is not the usual Q3 + 1.5 * IQR fence - confirm the threshold is intentional.
# NOTE(review): tunes_df is overwritten, so re-running this cell filters the
# already-filtered frame against a recomputed limit.
median_length = tunes_df['length'].median()
upper_quartile = tunes_df['length'].quantile(0.75)
length_limit = median_length + 1.5 * upper_quartile

tunes_df = tunes_df.loc[tunes_df['length'].le(length_limit)]

In [9]:
# Count every character occurring in the transposed ABC text of the kept tunes.
char_counts = pd.Series(list(''.join(tunes_df['abc_transposed']))).value_counts()

# Vocabulary = all observed characters plus the three special tokens.
chars = set(char_counts.index.tolist())
chars.update([PAD, BOS, EOS])

n_chars = len(chars)

# Assign ids from a fixed ordering (special tokens first, then the observed
# characters sorted). Iterating the set directly would give an order that
# depends on string hash randomisation and therefore changes between kernel
# sessions, so ids would not match a previously saved model or dataset.
special_tokens = [PAD, BOS, EOS]
ordered_chars = special_tokens + sorted(chars.difference(special_tokens))
char2id = {char: idx for idx, char in enumerate(ordered_chars)}

n_chars

143

In [11]:
# Tune category for which training windows are built in this run.
tune_type = 'polka'
tune_mode = 'maj'

type_mask = tunes_df['type'] == tune_type
mode_mask = tunes_df['mode'] == tune_mode
current_tunes_df = tunes_df[type_mask & mode_mask].copy(deep=True)

chars = set(char2id.keys())
n_chars = len(chars)

pad_id, bos_id, eos_id = (char2id[token] for token in (PAD, BOS, EOS))

x = []
y = []

# SEQUENCE_LENGTH - 1 pads followed by BOS make the very first window end on
# BOS, so its target is the first character of the tune.
left_padding = [pad_id] * (SEQUENCE_LENGTH - 1)

for tune in current_tunes_df['abc_transposed']:
    # Skip tunes that contain a symbol missing from the vocabulary.
    if not set(tune).issubset(chars):
        continue

    encoded_tune = left_padding + [bos_id] + [char2id[char] for char in tune] + [eos_id]

    # Slide a SEQUENCE_LENGTH-wide window over the tune; the token right
    # after each window is its prediction target (the last target is EOS).
    for start in range(len(encoded_tune) - SEQUENCE_LENGTH):
        stop = start + SEQUENCE_LENGTH

        x.append(encoded_tune[start:stop])
        y.append(encoded_tune[stop])

# One-hot encode inputs and targets over the n_chars vocabulary.
x = tf.keras.utils.to_categorical(x, num_classes=n_chars, dtype='int8')
y = tf.keras.utils.to_categorical(y, num_classes=n_chars, dtype='int8')

x.shape, y.shape

((366865, 70, 143), (366865, 143))

In [18]:
import numpy as np
from scipy import sparse

In [None]:
# Persist the one-hot inputs one sequence position at a time: each
# x[:, i, :] slice is 2-D, so it can be stored as a CSR matrix.
# The original call passed neither a usable file name nor the matrix.
for i in range(x.shape[1]):
    x_slice = sparse.csr_matrix(x[:, i, :])

    # NOTE(review): file-name pattern chosen here - adjust to whatever the loader expects.
    slice_path = TRAIN_DATA_DIR / f'x_{tune_type}_{tune_mode}_{i:02d}.npz'
    sparse.save_npz(str(slice_path), x_slice)

In [17]:
# One-hot rows of the first sequence position, used below to try out file formats.
x_ = x[:, 0]

In [19]:
# Dense dump of the slice into the current working directory.
np.save(file='x.npy', arr=x_)

In [20]:
# Same slice stored as a CSR matrix in the current working directory.
x_sparse = sparse.csr_matrix(x_)
sparse.save_npz('x.npz', x_sparse)

In [16]:
# Shape of a single-position slice of the one-hot inputs.
x[:, 0].shape

(366865, 143)