# Creating matrices from dataset

In [4]:
from gensim.models import Word2Vec
import os
import numpy as np
import pandas as pd
import pickle

In [5]:
seqlength = 20
num_midi_features = 3
num_sequences_per_song = 2
training_rate = 0.8
validation_rate = 0.1
test_rate = 0.1

syll_model_path = './enc_models/syllEncoding_20190419.bin'
word_model_path = './enc_models/wordLevelEncoder_20190419.bin'
songs_path = './data/songs_word_level'

print('Creating a dataset with sequences of length', seqlength, 
      'with', num_sequences_per_song, 'sequences per song')

syllModel = Word2Vec.load(syll_model_path)
wordModel = Word2Vec.load(word_model_path)
syll2Vec = syllModel.wv['Hello']
word2Vec = wordModel.wv['world']
num_syll_features = len(syll2Vec) + len(word2Vec)
print('Syllable embedding length :', num_syll_features)

Creating a dataset with sequences of length 20 with 2 sequences per song
Syllable embedding length : 20


In [6]:
files = os.listdir(songs_path)
num_songs = len(files)
print("Total number of songs : ", num_songs)

Total number of songs :  7497


In [7]:
print(files)

['acf3cfbbb59afb8cac898b4fc8dfce5c.npy', 'b97dc283afc183df3589b0102b8bc572.npy', '8b94a246d974703fb763701d2dbaa38c.npy', 'c3c5001de0384a1fc100a51c4038ce5e.npy', 'd79c8008376760d4c6282339e4729686.npy', 'ce1bf6d93dbef95874d1f764dd96ca4d.npy', 'a4919b9f6b0d55777ae85871cf072e50.npy', '4b2b33e463ea71f41ebe0f726855fb55.npy', '5ffe486ad929c6880a652402f83e993a.npy', 'd05020be594ba4b4e8be65e88e99e202.npy', '304c5319b2eb5b3aa8ed8265682401af.npy', '981fe6582bce47e022a685df093eba52.npy', 'cef1751a18d1ae5ae314d5f4660a593c.npy', '527c1e98748c3675886977db8b84f20f.npy', 'df8050f9202ae1ca2ce9b54cf9026352.npy', '58f24d233e6481358afcccc2deff0d7a.npy', 'd3bc8ab6f283c04caadfb553cec200e2.npy', '70abe1e81a31ffc0b511883cc05d2548.npy', '30d93086c267bbd040fe7985b29281d4.npy', '72c23b0c4fdc0dfe40844eddf0ff2318.npy', 'beaab8cacb664624410ecef759e059d5.npy', '536405018ce5da10307dafd45c9be8c0.npy', '559f4454a21cad4f25f34f6db558fd77.npy', '2f0a754a1ec9d8739a08f835cc128aa9.npy', '5601ac73cf5f7059a7c02853ec1a61fc.npy',

In [8]:
num_syll_features, num_midi_features, seqlength

(20, 3, 20)

In [9]:
# inspect content of sample file
sample_file = files[0]
sample_features = np.load(os.path.join(songs_path, sample_file), allow_pickle=True) # load midi files to feature
len(sample_features), len(sample_features[0])

(1, 4)

In [10]:
print(len(sample_features[0][0]))
print(sample_features[0][0])

198
[[[0.0, 0.09062500000000284, 659.2551138257398, 124.0]], [[0.2109375, 0.06718750000000284, 659.2551138257398, 124.0]], [[0.17343750000000213, 0.09687500000000071, 659.2551138257398, 124.0]], [[0.3515625, 1.1515625000000007, 659.2551138257398, 124.0]], [[4.515625, 0.3000000000000007, 523.2511306011972, 109.0]], [[0.7484374999999979, 0.6765625000000028, 698.4564628660078, 101.0], [0.6875, 0.4765625, 659.2551138257398, 103.0]], [[0.5015625000000021, 1.0812500000000007, 523.2511306011972, 98.0]], [[4.084374999999998, 0.4921875, 659.2551138257398, 98.0]], [[0.5093749999999986, 0.19843750000000426, 587.3295358348151, 85.0]], [[0.2265625, 0.3343750000000014, 587.3295358348151, 83.0], [0.3734375000000014, 1.3953125000000028, 523.2511306011972, 91.0]], [[1.8765624999999986, 0.3343750000000014, 659.2551138257398, 97.0]], [[0.3515625, 0.19843750000000426, 587.3295358348151, 88.0]], [[0.390625, 0.17031250000000142, 587.3295358348151, 93.0], [0.3890625000000014, 1.5671875000000028, 523.25113060

In [11]:
print(len(sample_features[0][1]))
print(sample_features[0][1])

198
[[[76.0, 0.25, 0.0]], [[76.0, 0.75, 0.5]], [[76.0, 0.25, 0.0]], [[76.0, 3.0, 1.0]], [[72.0, 1.0, 8.0]], [[77.0, 2.0, 0.0], [76.0, 1.0, 0.0]], [[72.0, 3.0, 0.0]], [[76.0, 1.0, 8.0]], [[74.0, 0.5, 0.0]], [[74.0, 1.0, 0.0], [72.0, 4.0, 0.0]], [[76.0, 1.0, 1.0]], [[74.0, 1.0, 0.0]], [[74.0, 1.0, 0.0], [72.0, 4.0, 0.0]], [[76.0, 2.0, 1.0]], [[74.0, 0.75, 0.0]], [[74.0, 1.0, 0.0], [72.0, 6.0, 0.0]], [[77.0, 1.0, 0.0]], [[74.0, 3.0, 1.0]], [[67.0, 0.75, 2.0]], [[76.0, 1.0, 0.0]], [[74.0, 0.75, 0.0]], [[74.0, 0.5, 0.0]], [[72.0, 3.0, 1.0], [72.0, 0.75, 2.0]], [[76.0, 0.25, 0.0]], [[77.0, 1.0, 1.0]], [[72.0, 3.0, 0.0], [76.0, 0.5, 4.0]], [[74.0, 0.75, 0.0]], [[77.0, 1.0, 0.0]], [[74.0, 3.0, 0.0], [76.0, 2.0, 2.0]], [[72.0, 0.5, 0.0], [74.0, 3.0, 0.0]], [[74.0, 1.0, 4.0]], [[76.0, 0.75, 0.0]], [[72.0, 4.0, 0.0]], [[72.0, 0.75, 2.0]], [[74.0, 1.0, 0.0]], [[72.0, 1.0, 0.0]], [[79.0, 4.0, 0.0]], [[74.0, 2.0, 2.0]], [[74.0, 0.75, 8.0]], [[74.0, 0.75, 0.0]], [[76.0, 0.75, 0.0]], [[77.0, 2.0, 0.0]

In [12]:
print(len(sample_features[0][2]))
print(sample_features[0][2]) 

198
[['Must'], ['have'], ['been'], ['love'], ['but'], ['over', 'over'], ['now'], ['Lay'], ['a'], ['whisper', 'whisper'], ['on'], ['my'], ['pillow', 'pillow'], ['leave'], ['the'], ['winter', 'winter'], ['on'], ['the'], ['ground'], ['I'], ['wake'], ['up'], ['lonely', 'lonely'], ['air'], ['of'], ['silence', 'silence'], ['in'], ['the'], ['bedroom', 'bedroom'], ['around', 'around'], ['Touch'], ['me'], ['now'], ['I'], ['close'], ['my'], ['eyes'], ['dream'], ['It'], ['must'], ['have'], ['been'], ['love'], ['but'], ['over', 'over'], ['now'], ['It'], ['must'], ['have'], ['been'], ['good'], ['but'], ['I'], ['lost'], ['it'], ['somehow', 'somehow'], ['It'], ['must'], ['have'], ['been'], ['love'], ['but'], ['over', 'over'], ['now'], ['from'], ['the'], ['moment', 'moment'], ['we'], ['touched'], ['the'], ['time'], ['had'], ['run'], ['out'], ['Makebelieving', 'Makebelieving', 'Makebelieving', 'Makebelieving'], ['together', 'together', 'together'], ['that'], ['sheltered', 'sheltered'], ['by'], ['your']

In [13]:
print(len(sample_features[0][3]))
print(sample_features[0][3])

198
[['Must'], ['have'], ['been'], ['love'], ['but'], ['o', 'ver'], ['now'], ['Lay'], ['a'], ['whis', 'per'], ['on'], ['my'], ['pil', 'low'], ['leave'], ['the'], ['win', 'ter'], ['on'], ['the'], ['ground'], ['I'], ['wake'], ['up'], ['lone', 'ly'], ['air'], ['of'], ['si', 'lence'], ['in'], ['the'], ['bed', 'room'], ['a', 'round'], ['Touch'], ['me'], ['now'], ['I'], ['close'], ['my'], ['eyes'], ['dream'], ['It'], ['must'], ['have'], ['been'], ['love'], ['but'], ['o', 'ver'], ['now'], ['It'], ['must'], ['have'], ['been'], ['good'], ['but'], ['I'], ['lost'], ['it'], ['some', 'how'], ['It'], ['must'], ['have'], ['been'], ['love'], ['but'], ['o', 'ver'], ['now'], ['from'], ['the'], ['mo', 'ment'], ['we'], ['touched'], ['the'], ['time'], ['had'], ['run'], ['out'], ['Make', 'be', 'liev', 'ing'], ['to', 'geth', 'er'], ['that'], ['shel', 'tered'], ['by'], ['your'], ['But'], ['in'], ['and'], ['out', 'side'], ['turned'], ['to'], ['wa', 'ter'], ['like'], ['a'], ['tear'], ['drop'], ['in'], ['your'],

In [14]:

# for a file, we will work with features[0][1] & features[0][3] representing 
# list of midi_list & list of syll_list


In [15]:
data_matrix = np.zeros(shape=(num_sequences_per_song*num_songs, 
                              (num_syll_features + num_midi_features) * seqlength))
data_matrix.shape

(14994, 460)

In [16]:
seq_filename_list = [] # to keep track of filename from which a sequence is extracted
small_file_cntr = 0 # to keep track of files with less than 20 syllable-note pairs

In [15]:
# load all the songs, cut to 20 note-sequence, convert to song embeddinds

i = 0
j = 0

for file in files:
    features = np.load(os.path.join(songs_path, file), allow_pickle=True) # load midi files to feature
    
    if len(features[0][1]) >= seqlength: # seqlength = 20, if length of song > 20 note
        j = 0
        for midiList, syllList in zip(features[0][1], features[0][3]):
            word = ''
            for syll in syllList:
                #print(syll)
                word = word + syll
                if word in list(wordModel.wv.vocab):
                    word2Vec = wordModel.wv[word]
            for midi, syll in zip(midiList, syllList):
                if syll in list(syllModel.wv.vocab):
                    syll2Vec = syllModel.wv[syll]
                    syllWordVec = np.concatenate((syll2Vec,word2Vec)) # joint embedding = syllabus + words
                    if j < seqlength:
                        data_matrix[i][num_midi_features * j:num_midi_features * j + num_midi_features] = midi # append midi
                        data_matrix[i][num_midi_features * seqlength + num_syll_features * j:
                                       num_midi_features * seqlength + num_syll_features * (j + 1)] = syllWordVec # append joint embedding
                        j += 1
                    else:
                        break
        i += 1
        seq_filename_list.append(file)
        #print(syllWordVec)
        if i%100 == 0:
            print("sequence ", i)
    
    else: # seqlength < 20
        small_file_cntr += 1
        
    if len(features[0][1]) >= 2*seqlength: #  if length of song > 40 note
        j = 0
        for midiList, syllList in zip(features[0][1][seqlength:], features[0][3][seqlength:]): # cut one more from note 21
            word = ''
            for syll in syllList:
                word = word + syll
                if word in list(wordModel.wv.vocab):
                    word2Vec = wordModel.wv[word]
            for midi, syll in zip(midiList, syllList):
                if syll in list(syllModel.wv.vocab):
                    syll2Vec = syllModel.wv[syll]
                    syllWordVec = np.concatenate((syll2Vec,word2Vec))
                    if j < seqlength:
                        data_matrix[i][num_midi_features * j:num_midi_features * j + num_midi_features] = midi
                        data_matrix[i][num_midi_features * seqlength + num_syll_features * j:
                                       num_midi_features * seqlength + num_syll_features * (j + 1)] = syllWordVec
                        j += 1
                    else:
                        break
        i += 1
        seq_filename_list.append(file)
        if i%100 == 0:
            print("sequence number ", i)
  

sequence  100
sequence number  200
sequence  300
sequence number  400
sequence  500
sequence  600
sequence number  700
sequence  800
sequence number  900
sequence number  1000
sequence number  1100
sequence number  1200
sequence  1300
sequence number  1400
sequence  1500
sequence  1600
sequence  1700
sequence number  1800
sequence number  1900
sequence  2000
sequence number  2100
sequence  2200
sequence number  2300
sequence number  2400
sequence  2500
sequence  2600
sequence number  2700
sequence  2800
sequence  2900
sequence number  3000
sequence  3100
sequence number  3200
sequence  3300
sequence  3400
sequence number  3500
sequence number  3600
sequence  3700
sequence number  3800
sequence number  3900
sequence number  4000
sequence number  4100
sequence  4200
sequence number  4300
sequence  4400
sequence number  4500
sequence  4600
sequence  4700
sequence  4800
sequence number  4900
sequence number  5000
sequence number  5100
sequence  5200
sequence number  5300
sequence number  5

In [95]:

print('There are {} files out of {} with less than 20 syllable-note pairs hence not considered.'.format(small_file_cntr, len(files)))


There are 343 files out of 7497 with less than 20 syllable-note pairs hence not considered.


In [18]:
data_matrix = data_matrix[0:i, :]
data_matrix.shape

(13937, 460)

In [16]:

# dump sequence filename list & data matrix

pickle.dump(seq_filename_list, open('./data/dataset_filenames/full_filename_list.pkl', 'wb'))
np.save('./data/dataset_matrices/full_data_matrix.npy', data_matrix)


In [18]:

# end of the data matrix generation


In [26]:

# creating train, valid & test matrix from data matrix 


In [27]:

data_matrix.shape


(13937, 460)

In [28]:

len(data_matrix), 0.80 * len(data_matrix), 0.20 * len(data_matrix)


(13937, 11149.6, 2787.4)

In [17]:

# we split the data into 80% train with 11149 sequences and 10% validation & test with 1394 sequences each.

# we want all the sequences extracted from a file to be present exclusively in one set (train/validation/test) only.

# this allows us to bind files to specific sets (train/validation/test)

# this strategy gives us freedom to reconstruct data matrices with different sequence lengths, lyrics encoders etc.


In [31]:

print('Out of {} sequence {} sequences are unique'.format(len(data_matrix), len(np.unique(data_matrix, axis=0))))


Out of 13937 sequence 11824 sequences are unique


In [33]:

# assign the first 11149 sequences in the data matrix to the training set

train_data_matrix = data_matrix[:11149]
train_filename_list = seq_filename_list[:11149]
train_data_matrix.shape, len(train_filename_list), train_filename_list[-2:]


((11149, 460),
 11149,
 ['a24475ca7efdfccf9d003abece827b4c.npy',
  'a24475ca7efdfccf9d003abece827b4c.npy'])

In [34]:

# remove duplicates and create test & valildation set


In [35]:

data_matrix_df = pd.DataFrame(data_matrix)
is_duplicated_df = data_matrix_df.duplicated()
is_duplicated_df.values.sum() # 13937 - 11824


2113

In [36]:

clean_data_matrix_df = data_matrix_df.drop_duplicates() # clean_data_matrix_df contains no duplicate sequences
clean_data_matrix_df.shape


(11824, 460)

In [37]:

clean_data_matrix_df_index = clean_data_matrix_df.index
print(len(clean_data_matrix_df_index), list(clean_data_matrix_df_index))


11824 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 222

In [38]:

# find indices from clean_data_matrix_df_index which are not part of training set 
clean_data_matrix_df_non_train_index = clean_data_matrix_df_index[clean_data_matrix_df_index >= 11149]
print(len(clean_data_matrix_df_non_train_index), list(clean_data_matrix_df_non_train_index))


2102 [11149, 11150, 11151, 11152, 11153, 11154, 11155, 11156, 11157, 11158, 11159, 11160, 11161, 11162, 11163, 11164, 11165, 11166, 11167, 11168, 11169, 11172, 11173, 11174, 11175, 11176, 11177, 11178, 11179, 11182, 11183, 11188, 11189, 11190, 11191, 11192, 11193, 11194, 11195, 11196, 11197, 11198, 11199, 11200, 11203, 11204, 11205, 11206, 11207, 11208, 11209, 11212, 11213, 11214, 11215, 11218, 11219, 11220, 11221, 11224, 11225, 11226, 11227, 11228, 11229, 11230, 11231, 11232, 11233, 11234, 11237, 11238, 11239, 11240, 11244, 11245, 11246, 11247, 11248, 11249, 11250, 11251, 11252, 11253, 11254, 11255, 11256, 11257, 11258, 11259, 11260, 11261, 11264, 11265, 11268, 11269, 11270, 11271, 11272, 11273, 11274, 11275, 11276, 11277, 11279, 11280, 11281, 11282, 11283, 11286, 11287, 11288, 11289, 11290, 11291, 11292, 11293, 11294, 11295, 11296, 11297, 11298, 11299, 11300, 11301, 11302, 11303, 11304, 11309, 11310, 11311, 11312, 11313, 11314, 11315, 11316, 11317, 11318, 11319, 11320, 11321, 11324, 

In [39]:

valid_test_data_matrix = data_matrix[clean_data_matrix_df_non_train_index]
valid_test_filename_list = np.asarray(seq_filename_list)[clean_data_matrix_df_non_train_index].tolist()


In [40]:

valid_test_data_matrix.shape, len(valid_test_filename_list)


((2102, 460), 2102)

In [41]:

# out of 2102 sequences present in valid_test_matrix, first 50% i.e. 1051 sequences will form validation matrix 
# and remaining 1051 sequences will form the test matrix


In [42]:

valid_data_matrix = valid_test_data_matrix[:1051]
valid_filename_list = valid_test_filename_list[:1051]

test_data_matrix = valid_test_data_matrix[1051:]
test_filename_list = valid_test_filename_list[1051:]

valid_data_matrix.shape, len(valid_filename_list), test_data_matrix.shape, len(test_filename_list)


((1051, 460), 1051, (1051, 460), 1051)

In [43]:

# end of data matrix creation and post processing


In [56]:

# save matrix and file list for different sets


In [57]:

np.save('./data/dataset_matrices/train_data_matrix.npy', train_data_matrix)
np.save('./data/dataset_matrices/valid_data_matrix.npy', valid_data_matrix)
np.save('./data/dataset_matrices/test_data_matrix.npy',  test_data_matrix)


In [58]:

pickle.dump(train_filename_list, open('./data/dataset_filenames/train_filename_list.pkl', 'wb'))
pickle.dump(valid_filename_list, open('./data/dataset_filenames/valid_filename_list.pkl', 'wb'))
pickle.dump(test_filename_list,  open('./data/dataset_filenames/test_filename_list.pkl', 'wb'))


In [None]:

# end of the notebook
