In [1]:
import pandas as pd
import numpy as np

from PIL import Image

In [3]:
def image_to_input_vector(img_name, numcontext):
    '''
    Turn an audio file into feature representation.

    This function has been modified from Mozilla DeepSpeech:
    https://github.com/mozilla/DeepSpeech/blob/master/util/audio.py

    # This Source Code Form is subject to the terms of the Mozilla Public
    # License, v. 2.0. If a copy of the MPL was not distributed with this
    # file, You can obtain one at http://mozilla.org/MPL/2.0/.
    '''

    # Load image file
    # DONT FORGET THE CONVERT L!!!! OTHERWISE there are only 3 channels gr...
    im = np.array(Image.open(img_name).convert('L')
                  , dtype=np.uint32)
    #print(im.shape)
    orig_inputs = im.transpose()
                  
    # Get mfcc coefficients
    #orig_inputs = mfcc(audio, samplerate=fs, numcep=numcep)
    #print(orig_inputs.shape)
    # We only keep every second feature (BiRNN stride = 2)
    orig_inputs = orig_inputs[::2]
    
    # numcep is my # of rows of pixels in the line
    numcep = orig_inputs.shape[1]
    
    # For each time slice of the training set, we need to copy the context this makes
    # the numcep dimensions vector into a numcep + 2*numcep*numcontext dimensions
    # because of:
    #  - numcep dimensions for the current mfcc feature set
    #  - numcontext*numcep dimensions for each of the past and future (x2) mfcc feature set
    # => so numcep + 2*numcontext*numcep
    train_inputs = np.array([], np.float32)
    train_inputs.resize((orig_inputs.shape[0], numcep + 2 * numcep * numcontext))

    # Prepare pre-fix post fix context
    empty_column = np.array([])
    empty_column.resize((numcep))

    # Prepare train_inputs with past and future contexts
    time_slices = range(train_inputs.shape[0])
    context_past_min = time_slices[0] + numcontext
    context_future_max = time_slices[-1] - numcontext
    for time_slice in time_slices:
        # Reminder: array[start:stop:step]
        # slices from indice |start| up to |stop| (not included), every |step|

        # Add empty context data of the correct size to the start and end
        # of the MFCC feature matrix

        # Pick up to numcontext time slices in the past, and complete with empty
        # mfcc features
        need_empty_past = max(0, (context_past_min - time_slice))
        empty_source_past = list(empty_column for empty_slots in range(need_empty_past))
        #print(len(empty_source_past))
        data_source_past = orig_inputs[max(0, time_slice - numcontext):time_slice]
        #print(orig_inputs.shape)
        #print(data_source_past.shape)
        assert(len(empty_source_past) + len(data_source_past) == numcontext)

        # Pick up to numcontext time slices in the future, and complete with empty
        # mfcc features
        need_empty_future = max(0, (time_slice - context_future_max))
        empty_source_future = list(empty_column for empty_slots in range(need_empty_future))
        data_source_future = orig_inputs[time_slice + 1:time_slice + numcontext + 1]
        assert(len(empty_source_future) + len(data_source_future) == numcontext)

        if need_empty_past:
            past = np.concatenate((empty_source_past, data_source_past))
        else:
            past = data_source_past

        if need_empty_future:
            future = np.concatenate((data_source_future, empty_source_future))
        else:
            future = data_source_future

        past = np.reshape(past, numcontext * numcep)
        now = orig_inputs[time_slice]
        future = np.reshape(future, numcontext * numcep)

        train_inputs[time_slice] = np.concatenate((past, now, future))
        assert(len(train_inputs[time_slice]) == numcep + 2 * numcep * numcontext)

    # Scale/standardize the inputs
    # This can be done more efficiently in the TensorFlow graph
    train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
    return train_inputs

In [5]:
train_inputs = image_to_input_vector('hindi_data/hindi_lines/line_0.jpg', 4)

In [12]:
def get_image_and_transcript(txt_files, img_files, n_context):
    '''
    Loads image files and text transcriptions from ordered lists of filenames.
    Converts to images to df arrays and text to numerical arrays.
    Returns list of arrays. Returned image array list can be padded with
    pad_sequences function in this same module.
    '''
    image = []
    image_len = []
    transcript = []
    transcript_len = []

    for txt_file, img_file in zip(txt_files, img_files):
        # load audio and convert to features
        image_data = image_to_input_vector(img_file, n_context)
        image_data = image_data.astype('float32')

        image.append(image_data)
        image_len.append(np.int32(len(image_data)))

        # load text transcription and convert to numerical array
        #target = normalize_txt_file(txt_file)
        #target = text_to_char_array(target)
        #transcript.append(target)
        #transcript_len.append(len(target))
    print('i',image)
    image = np.asarray(image)
    print('i2',image)
    image_len = np.asarray(image_len)
    transcript = np.asarray(transcript)
    transcript_len = np.asarray(transcript_len)
    return image, image_len, transcript, transcript_len

In [13]:
get_image_and_transcript(['hindi_data/hindi_lines/line_0',
                         'hindi_data/hindi_lines/line_1'],
                         ['hindi_data/hindi_lines/line_0.jpg',
                         'hindi_data/hindi_lines/line_1.jpg'],
                        4)

i [array([[-5.29195452, -5.29195452, -5.29195452, ...,  0.19210966,
         0.19210966,  0.19210966],
       [-5.29195452, -5.29195452, -5.29195452, ...,  0.19210966,
         0.19210966,  0.19210966],
       [-5.29195452, -5.29195452, -5.29195452, ...,  0.19210966,
         0.19210966,  0.19210966],
       ..., 
       [ 0.19210966,  0.19210966,  0.19210966, ..., -5.29195452,
        -5.29195452, -5.29195452],
       [ 0.19210966,  0.19210966,  0.19210966, ..., -5.29195452,
        -5.29195452, -5.29195452],
       [ 0.19210966,  0.19210966,  0.19210966, ..., -5.29195452,
        -5.29195452, -5.29195452]], dtype=float32), array([[-5.29195452, -5.29195452, -5.29195452, ...,  0.19210966,
         0.19210966,  0.19210966],
       [-5.29195452, -5.29195452, -5.29195452, ...,  0.19210966,
         0.19210966,  0.19210966],
       [-5.29195452, -5.29195452, -5.29195452, ...,  0.19210966,
         0.19210966,  0.19210966],
       ..., 
       [ 0.19210966,  0.19210966,  0.19210966, ..., -5

(array([[[-5.29195452, -5.29195452, -5.29195452, ...,  0.19210966,
           0.19210966,  0.19210966],
         [-5.29195452, -5.29195452, -5.29195452, ...,  0.19210966,
           0.19210966,  0.19210966],
         [-5.29195452, -5.29195452, -5.29195452, ...,  0.19210966,
           0.19210966,  0.19210966],
         ..., 
         [ 0.19210966,  0.19210966,  0.19210966, ..., -5.29195452,
          -5.29195452, -5.29195452],
         [ 0.19210966,  0.19210966,  0.19210966, ..., -5.29195452,
          -5.29195452, -5.29195452],
         [ 0.19210966,  0.19210966,  0.19210966, ..., -5.29195452,
          -5.29195452, -5.29195452]],
 
        [[-5.29195452, -5.29195452, -5.29195452, ...,  0.19210966,
           0.19210966,  0.19210966],
         [-5.29195452, -5.29195452, -5.29195452, ...,  0.19210966,
           0.19210966,  0.19210966],
         [-5.29195452, -5.29195452, -5.29195452, ...,  0.19210966,
           0.19210966,  0.19210966],
         ..., 
         [ 0.19210966,  0.19210

In [6]:
get_image_t

In [None]:
def image_to_input_vector(audio_filename, numcontext):
    '''
    Turn an audio file into feature representation.

    This function has been modified from Mozilla DeepSpeech:
    https://github.com/mozilla/DeepSpeech/blob/master/util/audio.py

    # This Source Code Form is subject to the terms of the Mozilla Public
    # License, v. 2.0. If a copy of the MPL was not distributed with this
    # file, You can obtain one at http://mozilla.org/MPL/2.0/.
    '''

    # Load image file
    # DONT FORGET THE CONVERT L!!!! OTHERWISE there are only 3 channels gr...
    im = np.array(Image.open('hindi_data/hindi_lines/line_0.jpg').convert('L')
                  , dtype=np.uint32)
    #print(im.shape)
    orig_inputs = im.transpose()
                  
    # Get mfcc coefficients
    #orig_inputs = mfcc(audio, samplerate=fs, numcep=numcep)
    #print(orig_inputs.shape)
    # We only keep every second feature (BiRNN stride = 2)
    orig_inputs = orig_inputs[::2]
    
    # numcep is my # of rows of pixels in the line
    numcep = orig_inputs.shape[1]
    
    # For each time slice of the training set, we need to copy the context this makes
    # the numcep dimensions vector into a numcep + 2*numcep*numcontext dimensions
    # because of:
    #  - numcep dimensions for the current mfcc feature set
    #  - numcontext*numcep dimensions for each of the past and future (x2) mfcc feature set
    # => so numcep + 2*numcontext*numcep
    train_inputs = np.array([], np.float32)
    train_inputs.resize((orig_inputs.shape[0], numcep + 2 * numcep * numcontext))

    # Prepare pre-fix post fix context
    empty_column = np.array([])
    empty_column.resize((numcep))

    # Prepare train_inputs with past and future contexts
    time_slices = range(train_inputs.shape[0])
    context_past_min = time_slices[0] + numcontext
    context_future_max = time_slices[-1] - numcontext
    for time_slice in time_slices:
        # Reminder: array[start:stop:step]
        # slices from indice |start| up to |stop| (not included), every |step|

        # Add empty context data of the correct size to the start and end
        # of the MFCC feature matrix

        # Pick up to numcontext time slices in the past, and complete with empty
        # mfcc features
        need_empty_past = max(0, (context_past_min - time_slice))
        empty_source_past = list(empty_column for empty_slots in range(need_empty_past))
        #print(len(empty_source_past))
        data_source_past = orig_inputs[max(0, time_slice - numcontext):time_slice]
        #print(orig_inputs.shape)
        #print(data_source_past.shape)
        assert(len(empty_source_past) + len(data_source_past) == numcontext)

        # Pick up to numcontext time slices in the future, and complete with empty
        # mfcc features
        need_empty_future = max(0, (time_slice - context_future_max))
        empty_source_future = list(empty_column for empty_slots in range(need_empty_future))
        data_source_future = orig_inputs[time_slice + 1:time_slice + numcontext + 1]
        assert(len(empty_source_future) + len(data_source_future) == numcontext)

        if need_empty_past:
            past = np.concatenate((empty_source_past, data_source_past))
        else:
            past = data_source_past

        if need_empty_future:
            future = np.concatenate((data_source_future, empty_source_future))
        else:
            future = data_source_future

        past = np.reshape(past, numcontext * numcep)
        now = orig_inputs[time_slice]
        future = np.reshape(future, numcontext * numcep)

        train_inputs[time_slice] = np.concatenate((past, now, future))
        assert(len(train_inputs[time_slice]) == numcep + 2 * numcep * numcontext)

    # Scale/standardize the inputs
    # This can be done more efficiently in the TensorFlow graph
    train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
    return train_inputs