Create a subset of the raw training data restricted to the images whose feature files are available in vgg16_features.

In [6]:
import os
import pandas as pd
import numpy as np
from six.moves import cPickle as pickle

In [2]:
data_folder = '../data/generated2'
image_folder = os.path.join(data_folder,'formula_images')
raw_data_dir = os.path.join(data_folder, 'training')

In [3]:
def make_seq_bins(df_):
    """
    Group the token sequences of df_ into bins keyed by sequence length.

    For every unique value in df_.bin_len, the rows whose padded_seq_len
    equals that value are selected and their padded_seq / squashed_seq
    columns are stacked into int32 ndarrays, each wrapped in a DataFrame
    that keeps the original row index.

    Returns a tuple (bins, bins_squashed): two dictionaries mapping the
    bin length to the DataFrame of padded, respectively squashed, sequences.
    Nothing is written to disk here; persisting the result is left to the
    caller.

    NOTE(review): the keys come from bin_len while the rows are filtered on
    padded_seq_len -- presumably the two columns hold the same values;
    confirm against the code that builds the dataframe.
    """
    padded_by_len = {}
    squashed_by_len = {}

    for seq_len in df_.bin_len.unique():
        rows = df_[df_.padded_seq_len == seq_len]

        # Stack the per-row python lists into dense 2-D integer arrays.
        padded = np.array(rows.padded_seq.values.tolist(), dtype=np.int32)
        squashed = np.array(rows.squashed_seq.values.tolist(), dtype=np.int32)

        # Every padded sequence in the bin must have exactly seq_len tokens,
        # and no row may be lost in the conversion.
        assert padded.shape[1] == seq_len
        assert padded.shape[0] == rows.shape[0]

        padded_by_len[seq_len] = pd.DataFrame(padded, index=rows.index)
        squashed_by_len[seq_len] = pd.DataFrame(squashed, index=rows.index)

    return padded_by_len, squashed_by_len



In [4]:
def check(bins, ser):
    """
    Verify that every binned row matches its source sequence in ser.

    bins: dictionary mapping a bin length to a DataFrame of sequences, as
          returned by make_seq_bins.
    ser:  sequences indexed by the same labels as the DataFrames in bins
          (e.g. the padded_seq or squashed_seq column of the dataframe).

    Returns None when all rows match. On the first row that differs from
    ser -- in content or in length -- the bin and location are printed and
    an AssertionError is raised.

    The comparison is an explicit test rather than an `assert` statement so
    that the check still runs under `python -O`, and no blanket `except` is
    used, so unrelated errors (a missing label, a keyboard interrupt) are
    not reported as failed assertions.
    """
    for bin_, frame in bins.items():
        for loc_ in frame.index.values:
            # array_equal is False on a shape mismatch instead of raising a
            # broadcasting error, so rows of the wrong length are reported too.
            if not np.array_equal(frame.loc[loc_].values, ser[loc_]):
                message = 'Assertion failed at bin %d and loc %d' % (bin_, loc_)
                print(message)
                raise AssertionError(message)

In [7]:
# Output folder for the subset, and the folder holding the feature files
# that decide which images are kept.
temp_dir = os.path.join(raw_data_dir, 'temp_dir')
image_features_folder = os.path.join(raw_data_dir, 'vgg16_features')

# Create the output folder up front; otherwise the first write below fails
# when the folder does not exist yet.
if not os.path.isdir(temp_dir):
    os.makedirs(temp_dir)

# Each '<name>.pkl' file in the features folder stands for the image '<name>.png'.
image_list = [os.path.splitext(s)[0] + '.png'
              for s in os.listdir(image_features_folder)
              if s.endswith('.pkl')]

df_train = pd.read_pickle(os.path.join(raw_data_dir, 'df_train.pkl'))

# Keep only the training rows whose image has a feature file.
df_train_short = df_train[df_train.image.isin(image_list)]
print (df_train_short.shape)
df_train_short.to_pickle(os.path.join(temp_dir, 'df_train.pkl'))

bins_train_short, bins_sq_train_short = make_seq_bins(df_train_short)

# Verify the binned sequences against the dataframe columns they came from
# before anything is persisted.
check(bins_train_short, df_train_short.padded_seq)
check(bins_sq_train_short, df_train_short.squashed_seq)

# Write the bins next to the subset dataframe, reusing temp_dir rather than
# re-spelling the folder name with a hard-coded path separator.
with open(os.path.join(temp_dir, 'raw_seq_train.pkl'), 'wb') as f:
    pickle.dump(bins_train_short, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(temp_dir, 'raw_seq_sq_train.pkl'), 'wb') as f:
    pickle.dump(bins_sq_train_short, f, pickle.HIGHEST_PROTOCOL)

(292, 12)
