In [1]:
import collections
import helper
import numpy as np
import pandas as pd
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [2]:
def tokenize(x):
    """
    generates tokenizer for corpus
    
    inputs:
        x: vector of base strings
        
    outputs: 
        vector of tokenized string sequences
        tokenizer
    """
    x_tk = Tokenizer(
        char_level = False,
        filters='!"#$%&()*+.:;<=>?@[\\]^_`{|}~\t\n',
        oov_token="<OOV>"
    )
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk

def pad(x, length=None):
    """
    adds padding tokens to the end of strings to make them conform to 
        a uniform length.
    
    inputs:
        x: a vector of token sequences
        length: the intended length to which to pad each sequence
        
    outputs:
        vector of padded sequences
    """
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen = length, padding = 'post')

def preprocess(x, y):
    """
    cleans, tokenizes, and pads input strings
    
    inputs: 
        x: vector of base strings
        y: vector of translated strings
        
    outputs:
        tokenized, padded base strings
        tokenized, padded translated strings
        base string tokenizer
        translated string tokenizer
    """
    
    x = x.astype(str).str.lower()
    x = [re.sub(r'\s+',' ',re.sub(r'\([^)]*\)', '', str(z))).strip() for z in x]
    x = [re.sub(r"([0-9]+(\.[0-9]+)?)",r" \1 ", str(z)).strip() for z in x]
    x = [z.replace("-"," ") for z in x]
    
    y = y.astype(str).str.lower()
    y = [re.sub(r'\s+',' ',re.sub(r'\([^)]*\)', '', str(z))).strip() for z in y]
    y = [re.sub(r"([0-9]+(\.[0-9]+)?)",r" \1 ", str(z)).strip() for z in y]
    y = [z.replace("-"," ") for z in y]
    
    # outputs tokenized sequences (1,2) and tokenizers (3,4)
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)
    
    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)
    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)
    
    return preprocess_x, preprocess_y, x_tk, y_tk



def preprocess_v2(x, y):
    """
    cleans, tokenizes, and pads input strings
    
    inputs: 
        x: vector of base strings
        y: vector of translated strings
        
    outputs:
        tokenized, padded base strings
        tokenized, padded translated strings
        base string tokenizer
        translated string tokenizer
    """
    
    x = x.astype(str).str.lower()
    x = [re.sub(r'\s+',' ',re.sub(r'\([^)]*\)', '', str(z))).strip() for z in x]
    x = [re.sub(r"([0-9]+(\.[0-9]+)?)",r" \1 ", str(z)).strip() for z in x]
    x = [z.replace("-"," ") for z in x]

    y = y.astype(str).str.lower()
    y = [re.sub(r'\s+',' ',re.sub(r'\([^)]*\)', '', str(z))).strip() for z in y]
    y = [re.sub(r"([0-9]+(\.[0-9]+)?)",r" \1 ", str(z)).strip() for z in y]
    y = [z.replace("-"," ") for z in y]
    
    all_strings = ingredient_strings
    all_strings.append(category_strings)
    
    all_tokenizer = Tokenizer(
        char_level = False,
        filters='!"#$%&()*+.:;<=>?@[\\]^_`{|}~\t\n',
        oov_token="<OOV>"
    )
    all_tokenizer.fit_on_texts(all_strings)
    #return x_tk.texts_to_sequences(x), x_tk
    
    # outputs tokenized sequences (1,2) and tokenizers (3,4)
    preprocess_x = all_tokenizer.texts_to_sequences(x)
    preprocess_y = all_tokenizer.texts_to_sequences(y)
    
    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)
    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)
    
    return preprocess_x, preprocess_y, all_tokenizer




def logits_to_text(logits, tokenizer):
    """
    turns numeric predictions back into readable text
    
    inputs:
        logits: the output prediction of the trained model
        tokenizer: the tokenizer used to encode the predicted sequence
        
    outputs:
        a string representing the translated phrase
    """
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

def model_final(
    input_shape,
    output_sequence_length,
    ingredient_vocab_size,
    category_vocab_size,
    embeddings_matrix,
    learning_rate = LEARNING_RATE
):
  
    model = Sequential()
    #model.add(Embedding(
    #    input_dim=ingredient_vocab_size,
    #    output_dim=128,
    #    input_length=input_shape[1]
    #))
    model.add(Embedding(
        ingredient_vocab_size, 
        embedding_dim, 
        input_length=input_shape[1], 
        weights=[embeddings_matrix], 
        trainable=False
    ))
    
    model.add(Dropout(0.2))
    model.add(Bidirectional(GRU(256,return_sequences=False)))
    model.add(RepeatVector(output_sequence_length))
    model.add(Bidirectional(GRU(256,return_sequences=True)))
    model.add(TimeDistributed(Dense(category_vocab_size,activation='softmax')))
    
    model.compile(
        loss = sparse_categorical_crossentropy,
        optimizer = Adam(learning_rate),
        metrics = ['accuracy']
    )
    
    return model

def get_prediction(x, y, x_tk, y_tk, i, m):
    
    #need to try tokenizer.texts_to_sequences
    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word[0] = '<PAD>'
    
    i = [x_tk.word_index[word] for word in i.split()]
    #i = x_tk.texts_to_sequences(i)
    i = pad_sequences([i], maxlen=x.shape[-1], padding='post')
    i = np.array([i[0], x[0]])
    predictions = m.predict(i, len(i))

    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictions[0]]))



NameError: name 'LEARNING_RATE' is not defined

In [555]:
#hyperparameters
################################<-training####<-validation####<-testing
EPOCHS = 9
TEST_PORTION = 0.1
VALIDATION_PORTION = 0.1
LEARNING_RATE = 0.005
SAMPLES = 222300

#derived parameters
end_testing = int(SAMPLES * (1 - VALIDATION_PORTION))
end_training = int(end_testing * (1 - TEST_PORTION))

base = pd.read_csv(
    'part_tagging_data/translation_corpus_v2.csv').sample(SAMPLES)
base.i_str = base.i_str.astype(str)
base.string = base.string.astype(str)

base_training = base[:end_training]
base_validation = base[end_training:end_testing]
base_testing = base[end_testing:]

ingredient_strings = base.string
category_strings = base.i_str


In [557]:
preproc_ingredient_strings,\
preproc_category_strings,\
all_tokenizer =\
preprocess_v2(ingredient_strings, category_strings)

ingredient_training = preproc_ingredient_strings[:end_training]
ingredient_validation = preproc_ingredient_strings[end_training:end_testing]
ingredient_testing = preproc_ingredient_strings[end_testing:]

category_training = preproc_category_strings[:end_training]
category_validation = preproc_category_strings[end_training:end_testing]
category_testing = preproc_category_strings[end_testing:]

max_ingredient_sequence_length = preproc_ingredient_strings.shape[1]
max_category_sequence_length = preproc_category_strings.shape[1]
ingredient_vocab_size = len(ingredient_tokenizer.word_index)
category_vocab_size = len(category_tokenizer.word_index) + 1

tmp_x = pad(preproc_ingredient_strings)


In [558]:
# initialize the embeddings matrix
embedding_dim = 100

embeddings_index = {}
with open('glove.6B.100d.txt') as f:
    for line in f:
        values = line.split();
        word = values[0];
        coefs = np.asarray(values[1:], dtype='float32');
        embeddings_index[word] = coefs;

        

In [559]:
# apply the embeddings to create the pretrained weights for this corpus
embeddings_matrix = np.zeros((len(all_tokenizer.word_index)+1, embedding_dim));
for word, i in all_tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word);
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector;
        

In [560]:
model_f = model_final(
    tmp_x.shape,
    preproc_category_strings.shape[1],
    len(all_tokenizer.word_index)+1,
    len(all_tokenizer.word_index)+1,
    embeddings_matrix
)


In [561]:
model_f.fit(
    tmp_x, 
    preproc_category_strings, 
    batch_size = 256, 
    epochs = EPOCHS, 
    validation_data = (ingredient_validation,category_validation)
)


Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


<tensorflow.python.keras.callbacks.History at 0x7f90f525f250>

In [562]:
all_df = pd.read_csv("part_tagging_data/all_df_reconciler_v2.csv")
new_targets = all_df.string.astype(str)
#print(new_targets)

In [563]:
def get_prediction(x, y, x_tk, y_tk, i, m):
    
    
    #need to try tokenizer.texts_to_sequences
    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word[0] = '<PAD>'
    
    #i = [x_tk.word_index[word] for word in i.split()]
    
    i = i.astype(str).str.lower()
    i = [re.sub(r'\s+',' ',re.sub(r'\([^)]*\)', '', str(z))).strip() for z in i]
    i = [re.sub(r"([0-9]+(\.[0-9]+)?)",r" \1 ", str(z)).strip() for z in i]
    i = [z.replace("-"," ") for z in i]
    
    i = x_tk.texts_to_sequences(i)
    #i = x_tk.texts_to_sequences(i)
    i = pad_sequences(i, maxlen=x.shape[-1], padding='post')
    i = np.array(i)
    #predictions = m.predict(i)
    predictions = list(map(
        lambda x: ' '.join(
            filter(lambda z: z != "<PAD>",[y_id_to_word[np.argmax(y)] for y in x])
        ),
        m.predict(i)
    ))
    return predictions
    #print(' '.join([y_id_to_word[np.argmax(x)] for x in predictions[0]]))
    
    
testing_preds = get_prediction(
    preproc_ingredient_strings,
    preproc_category_strings, 
    all_tokenizer, 
    all_tokenizer,
    new_targets,
    model_f
)



In [581]:
model_f.save('ingredient_role_translator')

In [564]:
all_df['preds'] = testing_preds
all_df.to_csv("part_tagging_data/all_df_reconciler_v2.csv")

In [579]:
test_ings = pd.DataFrame([
    "'1  cup frosting canned or from scratch; try vanilla or cream cheese or chocolate",
    "2   4  ounces vodka depending on how strong you want it",
    "Sprinkles and corn syrup or honey for rimming the glasses"
])
print(
    get_prediction(
    preproc_ingredient_strings,
    preproc_category_strings, 
    all_tokenizer, 
    all_tokenizer,
    test_ings[0].astype(str),
    model_f
    )
)


['ripe chocolate', 'the kings juice', 'milk']


In [None]:
#### to delete i think



#base_testing['preds'] = testing_preds
print(base_testing.head())
base_testing.to_csv("reconciler.csv")


def final_predictions(x, y, x_tk, y_tk, m, tst_snt):
    tmp_X = pad(preproc_ingredient_strings)

    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word[0] = '<PAD>'
    
    tst_snt = [x_tk.word_index[word] for word in tst_snt.split()]
    
    tst_snt = pad_sequences([tst_snt], maxlen=x.shape[-1], padding='post')
    tst_snt = pad(tst_snt, y.shape[1])
    predictions = m.predict(tst_snt.reshape((-1, y.shape[-2], 1)))

    print('Sample 1:')
    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictions[0]]))

#final_predictions(
#    preproc_ingredient_strings, 
#    preproc_category_strings, 
#    ingredient_tokenizer, 
#    category_tokenizer,
#    bidi_model,
#    "2 3/4 oz quality soy sauce, warmed"
#)