In [1]:
import pandas
import numpy
import torch

In [2]:
# Show the first rows of the semicolon-separated subtitle table; the first
# line of the file is used as the header row.
print("Subtitle Lookup Preview:")
pandas.read_table(
    "../../datasets/knnw/knnw_en_sub.csv",
    sep=";",
    header=0,
).head()

Subtitle Lookup Preview:


Unnamed: 0,Number,Start time in milliseconds,End time in milliseconds,Text
0,1,1650,10800,TOHO CORPORATION
1,2,53940,58090,"Some mornings, I wake up crying without knowin..."
2,3,58700,61440,That's when everything happens now and again.
3,4,62060,66540,"Whatever that dream was I had, I can never rem..."
4,5,66540,69550,- But... - But...


In [3]:
# Only the shape is needed here, so memory-map the .npy file instead of
# reading the entire spectrogram into RAM just to inspect its dimensions.
print("Audio Shape:")
numpy.load("../../datasets/knnw/knnw_en.spectrogram.npy", mmap_mode="r").shape

Audio Shape:


(129, 1370582)

In [4]:
class KnnwAudioDataset(torch.utils.data.Dataset):
    """Dataset pairing each subtitle row with its slice of a spectrogram.

    Item ``i`` is a tuple ``(audio_item, subtitle_item)``:

    * ``audio_item`` -- the columns of the loaded spectrogram that fall between
      the start and end time of subtitle row ``i`` (all rows of the
      spectrogram are kept; the second axis is treated as the frame axis).
    * ``subtitle_item`` -- the subtitle text of row ``i`` after
      ``get_tokenization`` (currently the text itself).

    Times are converted to frame positions with a constant
    ``total_duration / total_frames`` per frame, so both values must describe
    the file given in ``audio_path`` and use the same time unit as the
    subtitle table.
    """

    # Positional columns of the subtitle table read by __getitem__.
    # NOTE(review): the table is addressed by position, not by column name;
    # confirm these positions if the subtitle file layout ever changes.
    _START_TIME_COLUMN = 1
    _END_TIME_COLUMN = 2
    _TEXT_COLUMN = 3

    def __init__(self,
                 audio_path="../../datasets/knnw/knnw_en.spectrogram.npy",
                 subtitle_lookup_path="../../datasets/knnw/knnw_en_sub.csv",
                 total_frames=1370582,
                 total_duration=6396010):
        """Load the spectrogram and the subtitle table.

        Args:
            audio_path: path of the ``.npy`` spectrogram file.
            subtitle_lookup_path: path of the semicolon-separated subtitle
                table whose first line is a header.
            total_frames: number of frames the spectrogram is assumed to have.
            total_duration: duration covered by those frames, in the time unit
                of the subtitle table (presumably milliseconds -- confirm
                against the data).
        """
        # Time covered by one spectrogram frame.
        self.duration_per_frame = total_duration / total_frames

        self.audio = numpy.load(audio_path)

        self.subtitle_lookup = pandas.read_table(subtitle_lookup_path,
                                                 sep=";", header=0)

        self.length = len(self.subtitle_lookup)

    def __len__(self):
        """Return the number of subtitle rows."""
        return self.length

    def __getitem__(self, i):
        """Return ``(audio_item, subtitle_item)`` for subtitle row ``i``.

        Raises:
            IndexError: if ``i`` is not a valid row position (raised by
                ``iloc``). Iterating the dataset through ``__getitem__``
                depends on this to know where the data ends.
        """
        start_time = self.subtitle_lookup.iloc[i, self._START_TIME_COLUMN]
        stop_time = self.subtitle_lookup.iloc[i, self._END_TIME_COLUMN]

        frames = self.get_range(start_time, stop_time)

        # Clamp the window to the frames that actually exist. Rounding up the
        # end time (or a subtitle ending after the audio) can point past the
        # last frame; indexing there would raise IndexError, which iteration
        # over __getitem__ silently treats as "end of dataset". A negative
        # start would otherwise wrap around to the end of the array.
        frame_count = self.audio.shape[1]
        first = min(max(frames.start, 0), frame_count)
        last = min(max(frames.stop, first), frame_count)

        # Copy so the caller gets an array independent of the dataset storage.
        audio_item = self.audio[:, first:last].copy()

        subtitle_item = self.subtitle_lookup.iloc[i, self._TEXT_COLUMN]
        subtitle_item = self.get_tokenization(subtitle_item)

        return audio_item, subtitle_item

    def get_index(self, time, start_flag):
        """Convert a time to a (float-valued) frame position.

        Args:
            time: time in the same unit as ``total_duration``.
            start_flag: truthy to round down (start of a window), falsy to
                round up (end of a window).

        Returns:
            The rounded frame position as returned by ``numpy.floor`` /
            ``numpy.ceil``; callers convert it to ``int`` before indexing.
        """
        position = time / self.duration_per_frame

        if start_flag:
            return numpy.floor(position)

        return numpy.ceil(position)

    def get_range(self, start_time, end_time):
        """Return the ``range`` of frame indices covering a time window.

        The start is rounded down and the end rounded up, so the range covers
        every frame the window touches. The range is not checked against the
        size of the loaded audio.
        """
        start_index = self.get_index(start_time, start_flag=True)
        stop_index = self.get_index(end_time, start_flag=False)

        return range(int(start_index), int(stop_index))

    def get_tokenization(self, subtitle_item):
        """Return the subtitle unchanged (placeholder for a real tokenizer)."""
        return subtitle_item

In [5]:
# Build the dataset with its default paths; this loads the full spectrogram
# and the subtitle table from disk.
dataset = KnnwAudioDataset()

4.666637968395908


In [6]:
# Fetch the first (audio, subtitle) pair as a smoke test.
# KnnwAudioDataset defines no __iter__ of its own, so iter() presumably falls
# back to calling __getitem__ with 0, 1, 2, ... -- verify against the base class.
next(iter(dataset))

(array([[6.7114437e-01, 1.3753934e-01, 4.5676559e-02, ..., 2.4686806e+00,
         4.9535306e-03, 2.6628307e-01],
        [2.5196629e+01, 3.8855080e+01, 2.2727580e+00, ..., 6.0028568e+01,
         7.1121506e+01, 2.8637695e+01],
        [1.7483507e+00, 4.5893925e+01, 4.5668683e+00, ..., 2.3218732e+00,
         4.5152702e+00, 8.6796255e+00],
        ...,
        [2.8274903e-06, 2.8095658e-07, 1.3620793e-05, ..., 1.3395635e-06,
         4.4126537e-06, 1.8380763e-06],
        [2.0729281e-05, 5.7674066e-07, 8.5283239e-07, ..., 6.2710678e-06,
         1.1276571e-07, 3.4037425e-06],
        [1.1499887e-07, 2.5459119e-06, 1.0235656e-05, ..., 1.1129669e-06,
         1.5908822e-07, 7.2525477e-06]], dtype=float32),
 'TOHO CORPORATION')

**Further Reading: How do I split a custom dataset into training and test datasets?**

https://stackoverflow.com/questions/50544730/how-do-i-split-a-custom-dataset-into-training-and-test-datasets