## Creating feature vectors for input and target
Vikram Reddy

4/22/17

In [76]:
# Imports
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from PIL import Image

In [77]:
def get_input_vec_grouped(line, window_size):
    ''' 
    Returns a training vector for the input hindi line that consists
    of a time series of data. At each time step, there is a window-size length of
    vectors
    '''
    im = np.array(Image.open('hindi_data/hindi_lines/line_0.jpg').convert('L')
                  , dtype=np.uint32)
    #df = pd.DataFrame(im)
    im = im.transpose()
    num_features, time_steps = im.shape
    
    train_inputs = np.zeros((time_steps, num_features + 2 * num_features * window_size))
    
    empty_feature_vec = np.zeros((num_features))
    
    time_slices = range(time_steps)
    context_past_min = time_slices[0] + window_size
    context_future_max = time_slices[-1] - window_size
    
    for p in time_slices:
        # creating sequences of past context
        need_empty_past = max(0, (context_past_min - p)) # 9 counting down to 1, then 0s

        # only the first 9 have empty past source (if 9 is window_size)
        empty_source_past = list(empty_feature_vec for empty_slots in range(need_empty_past))
        #print(empty_source_past)
        data_source_past = im[max(0, p - window_size):p] # starts with empty list of [0:0]
        
        print(len(data_source_past), data_source_past.shape)
        #print(len(empty_source_past), empty_source_past.shape)
        print('winnnndow', window_size)
        print(len(data_source_past))
        print(p)
        assert(len(empty_source_past) + len(data_source_past) == window_size)

        # creating sequences of future context
        need_empty_future = max(0, (p - context_future_max))
        empty_source_future = list(empty_mfcc for empty_slots in range(need_empty_future))

        # at each point in time, I have a list of mfcc features (with dim 13), where the list has dim=window_size
        # So I am essentially creating features for each window_size-sized bin, centering around a timepoint
        data_source_future = im[p + 1:p + window_size + 1]
        assert(len(empty_source_future) + len(data_source_future) == window_size)

        if need_empty_past:
            past = hstack((empty_source_past, data_source_past))
        else:
            past = data_source_past

        if need_empty_future:
            future = np.concatenate((data_source_future, empty_source_future))
        else:
            future = data_source_future

        # flatten the arrays
        past = np.resize(past, (window_size * num_features,))
        now = strided_input[p]
        future = np.resize(future, (window_size * num_features,))
        assert(len(past) + len(now) + len(future) == num_features + 2*window_size*num_features)

        train_inputs[p] = np.concatenate((past,now,future))

    # Scale/Standardize the inputs
    train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)


In [78]:
df = np.array(Image.open('hindi_data/hindi_lines/line_0.jpg').convert('L'), 
                          dtype=np.uint32)

In [79]:
df.transpose().shape

(2601, 89)

In [80]:
training = get_input_vec_grouped('line_0.jpg', 3)

0 (0, 89)
winnnndow 3
0
0


ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 0, expected 3.

In [31]:
def get_df(line, window_size):
    ''' 
    Returns a training vector for the input hindi line that consists
    of a time series of data. At each time step, there is a window-size length of
    vectors
    '''
    im = np.array(Image.open('hindi_data/hindi_lines/line_0.jpg').convert('L')
                  , dtype=np.uint32)
    df = pd.DataFrame(im)
    
    return df

In [32]:
df = get_df('lfds',3)

In [45]:
np.array(df.iloc[:,3:5]).shape

(89, 2)

In [22]:
num_rows, num_columns = df.shape
time_step = num_columns

In [23]:
train_inputs = np.zeros((time_step, num_rows + 2 * num_rows * 3))

In [24]:
train_inputs

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])