In [1]:
import pandas as pd
import logging
import glob
from sklearn.model_selection import train_test_split
# Let pandas show up to 500 characters per cell so the long excavation
# descriptions are not cut off in previews.
pd.options.display.max_colwidth = 500

# Root logger: only warnings and above are emitted.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

In [6]:
# Read the raw excavation descriptions together with their summaries.
raw_df = pd.read_csv('Excavation_rawdata.csv')

# Hold out 10% of the rows for testing. The fixed random_state makes the
# split -- and therefore every output derived from it -- reproducible on a
# fresh Restart & Run All; without it each run shuffles differently.
traindf, testdf = train_test_split(raw_df, test_size=.10, random_state=42)

# print out stats about shape of data
print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')
print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')

# preview data
traindf.head(3)

Train: 34 rows 2 columns
Test: 4 rows 2 columns


Unnamed: 0,excavation,summarize
10,"earthwork in excavation upto 3.0m depth below existing ground level including lifting, dressing of sides and ramming of bottom, shoring, strutting, dewatering, stacking of excavated soil with all equipment and materials, all complete as per specification, drawings and instructions of the purchaser / engineer, including a lead upto 100 m within plant boundary for all types of soil",excavation upto 3.0m depth a lead upto 100 m within plant boundary for all types of soil
27,"Earthwork in excavation for depth beyond 10.0m below ground level for foundations, trenches, pipe lines, storm water drains etc. including lifting, dressing of sides and ramming of bottom, shoring, strutting, dewatering, stacking of excavated soil with all equipment and materials, all complete as per specification, drawings and instructions of the Purchaser / Engineer, including a lead upto 500 m within plant boundary for all types of Soil, weathered rock. Measurement shall be limited to 10 ...",excavation for depth beyond 10.0m including a lead upto 500 m within plant all types of Soil
37,Disposal of surplus excavated earth beyond a lead of 1.0 KM upto 8.0 KM dumping and leveling the area from where the dumped earth is to be transported complete with all lifts as specified and directed by Engineer.,a lead of 1.0 KM upto 8.0 KM dumping and leveling all lifts


In [7]:
# Pull the two text columns out of the training frame as plain Python lists:
# `excavation` holds the full description, `summarize` the target summary.
train_body_raw = traindf['excavation'].tolist()
train_title_raw = traindf['summarize'].tolist()

# Show the first description as a sanity check.
train_body_raw[0]


'earthwork in excavation upto 3.0m depth below existing ground level including lifting, dressing of sides and ramming of bottom, shoring, strutting, dewatering, stacking of excavated soil with all equipment and materials, all complete as per specification, drawings and instructions of the purchaser / engineer, including a lead upto 100 m within plant boundary for all types of soil'

In [8]:
%reload_ext autoreload
%autoreload 2
from ktext.preprocess import processor

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [11]:
from ktext.preprocess import processor

# Text processor for the long descriptions (the encoder side).
# padding_maxlen=100 gives fixed-length vectors of 100 ids; keep_n presumably
# caps the vocabulary size -- confirm against the ktext documentation.
body_pp = processor(padding_maxlen=100, keep_n=1000)

# Fit the processor on the training descriptions and vectorise them in one go.
train_body_vecs = body_pp.fit_transform(train_body_raw)



In [9]:
# Raw text of the first training description, before any vectorisation.
print(train_body_raw[0])

earthwork in excavation upto 3.0m depth below existing ground level including lifting, dressing of sides and ramming of bottom, shoring, strutting, dewatering, stacking of excavated soil with all equipment and materials, all complete as per specification, drawings and instructions of the purchaser / engineer, including a lead upto 100 m within plant boundary for all types of soil


In [12]:
# Integer-id vector produced by body_pp.fit_transform for the first description.
print(train_body_vecs[0])

[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 36  9 10 24  4 46
 33 31 93 25 43  7 50 37  2 48  3 41  2 19 28 29 34 32  2 18 11 17  6 51
  3 44  6 22 12 15 38 35  3 47  2  5 56 23  7 20 16 24  4 26 57 61 62  8
  6 27  2 11]


In [13]:
# Text processor for the short summaries (the decoder side).
# Sequences are fixed at 30 ids and padded at the end (padding='post').
# append_indicators=True presumably adds start/end marker tokens that the
# decoder needs -- confirm against the ktext documentation.
title_pp = processor(padding='post',
                     padding_maxlen=30,
                     keep_n=4500,
                     append_indicators=True)

# Fit on the training summaries and turn them into id vectors.
train_title_vecs = title_pp.fit_transform(train_title_raw)



In [9]:
# Side-by-side check: the first raw summary and the id vector title_pp
# produced for it.
print('\noriginal string:\n', train_title_raw[0])
print('after pre-processing:\n', train_title_vecs[0])



original string:
 Excavation for leveling of ground soil of all types excavated material up to a distance of 50 m
after pre-processing:
 [ 3 15 20 41  5 42 10  5  8 11 32 43 19 16 17 44  5  2  9  4  0  0  0  0
  0  0  0  0  0  0]


In [14]:


import dill as dpickle
import numpy as np

# Serialise both fitted preprocessors with dill so they can be reloaded later
# without refitting.
for pp_path, pp_obj in (('body_pp.dpkl', body_pp), ('title_pp.dpkl', title_pp)):
    with open(pp_path, 'wb') as pp_file:
        dpickle.dump(pp_obj, pp_file)

# Write the vectorised training data next to them as .npy arrays.
np.save('train_title_vecs.npy', train_title_vecs)
np.save('train_body_vecs.npy', train_body_vecs)



In [15]:
from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor

In [12]:
# Reload the saved vectors through the seq2seq_utils helpers.
# load_encoder_inputs also returns doc_length, which sizes the encoder Input
# layer in the model definition.
encoder_input_data, doc_length = load_encoder_inputs('train_body_vecs.npy')
# The decoder helper returns separate input and target arrays for training.
# NOTE(review): presumably the two are the same sequences offset by one step
# (teacher forcing) -- confirm in seq2seq_utils.
decoder_input_data, decoder_target_data = load_decoder_inputs('train_title_vecs.npy')

Shape of encoder input: (34, 100)
Shape of decoder input: (34, 29)
Shape of decoder target: (34, 29)


In [13]:
# Reload the saved preprocessors and get the vocabulary sizes that dimension
# the embedding layers and the output Dense layer of the model.
# Note: this rebinds `body_pp` and `title_pp`, replacing the objects fitted
# earlier in the notebook with whatever load_text_processor returns.
num_encoder_tokens, body_pp = load_text_processor('body_pp.dpkl')
num_decoder_tokens, title_pp = load_text_processor('title_pp.dpkl')

Size of vocabulary for body_pp.dpkl: 282
Size of vocabulary for title_pp.dpkl: 93


In [14]:
%matplotlib inline
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization
from keras import optimizers

In [15]:
    # Builds and compiles the encoder-decoder (seq2seq) training model.
    # Relies on doc_length, num_encoder_tokens and num_decoder_tokens from the
    # loading cells above.
    from keras.models import Model
    from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization
    from keras import optimizers

    # Arbitrarily chosen latent dimension; used for both embedding sizes and
    # for the number of hidden units in both GRUs.
    latent_dim = 300

    ##### Define Model Architecture ######

    ########################
    #### Encoder Model ####
    # Fixed-length input: one row of doc_length token ids per description.
    encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

    # Word embedding for the encoder (here: the excavation description).
    # mask_zero=False: no mask is generated for index 0.
    x = Embedding(num_encoder_tokens, 
                  latent_dim, 
                  name='Body-Word-Embedding', 
                  mask_zero=False)(encoder_inputs)

    x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the `encoder_output` just the hidden state.
    _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just 
    #  encode without decoding if we want to.
    encoder_model = Model(inputs=encoder_inputs, 
                          outputs=state_h, 
                          name='Encoder-Model')

    # Calling the encoder sub-model on its own input gives the tensor that is
    # fed to the decoder GRU as its initial state below.
    seq2seq_encoder_out = encoder_model(encoder_inputs)

    ########################
    #### Decoder Model ####
    # shape=(None,) leaves the sequence length unspecified.
    decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

    # Word embedding for the decoder (here: the summary)
    dec_emb = Embedding(num_decoder_tokens, 
                        latent_dim, 
                        name='Decoder-Word-Embedding', 
                        mask_zero=False)(decoder_inputs)

    dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # Set up the decoder, using the encoder state (`seq2seq_encoder_out`) as
    # its initial state. return_sequences=True yields one output per time step.
    decoder_gru = GRU(latent_dim, 
                      return_state=True, 
                      return_sequences=True, 
                      name='Decoder-GRU')

    # The decoder's own final state is not needed during training, hence `_`.
    decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
    # NOTE: `x` is reused here; from this point it is the decoder-side tensor,
    # no longer the encoder embedding.
    x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction: softmax over num_decoder_tokens outputs.
    decoder_dense = Dense(num_decoder_tokens, 
                          activation='softmax', 
                          name='Final-Output-Dense')

    decoder_outputs = decoder_dense(x)

    ########################
    #### Seq2Seq Model ####

    #seq2seq_decoder_out = decoder_model([decoder_inputs, seq2seq_encoder_out])
    # Full training model: takes [description ids, summary ids] and outputs
    # the per-step softmax predictions.
    seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


    # The sparse loss is used with integer token-id targets.
    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.001), 
    loss='sparse_categorical_crossentropy')

In [16]:
from seq2seq_utils import viz_model_architecture
# Text summary of the layers, output shapes and parameter counts.
seq2seq_Model.summary()
# Optional graphical view of the same architecture (disabled):
#viz_model_architecture(seq2seq_Model)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Decoder-Word-Embedding (Embeddi (None, None, 300)    27900       Decoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      (None, 100)          0                                            
__________________________________________________________________________________________________
Decoder-Batchnorm-1 (BatchNorma (None, None, 300)    1200        Decoder-Word-Embedding[0][0]     
__________________________________________________________________________________________________
Encoder-Mo

In [21]:
from keras.callbacks import CSVLogger, ModelCheckpoint

# Prefix shared by the training log and the checkpoint files.
script_name_base = 'tutorial_seq2seq'
log_path = '{:}.log'.format(script_name_base)
# The doubled braces survive .format() so Keras can fill in epoch / val_loss.
checkpoint_pattern = '{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base)

# Per-epoch metrics go to a CSV log; a checkpoint is written only when the
# monitored metric improves (save_best_only=True).
csv_logger = CSVLogger(log_path)
model_checkpoint = ModelCheckpoint(checkpoint_pattern, save_best_only=True)

batch_size = 1200
epochs = 10

# Inputs are [encoder ids, decoder ids]; the targets get a trailing singleton
# axis before being handed to fit().
train_inputs = [encoder_input_data, decoder_input_data]
train_targets = np.expand_dims(decoder_target_data, -1)

history = seq2seq_Model.fit(
    train_inputs,
    train_targets,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.12,
    callbacks=[csv_logger, model_checkpoint],
)

Train on 29 samples, validate on 5 samples
Epoch 1/10


  '. They will not be included '


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:


# Save the trained seq2seq model to a single HDF5 file.
seq2seq_Model.save('seq2seq_model_tutorial.h5')



  '. They will not be included '


In [22]:
from seq2seq_utils import Seq2Seq_Inference
# Bundle the trained model with both text processors so raw text can be
# turned into a generated summary. body_pp / title_pp here are the objects
# returned by load_text_processor, not the ones originally fitted.
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                 decoder_preprocessor=title_pp,
                                 seq2seq_model=seq2seq_Model)

In [23]:
# this method displays the predictions on random rows of the holdout set
# NOTE(review): the hold-out frame reported earlier has only 4 rows while
# n=25 is requested -- confirm how demo_model_predictions handles n larger
# than the frame (the output suggests rows may repeat).
seq2seq_inf.demo_model_predictions(n=25, issue_df=testdf)




Issue Body:
 excavation in ordinary soil including lifting, dressing of sides and ramming of bottom, shoring, strutting, stacking of excavated soil, including disposal up to a lead of 50 m with manpower, machinery, tools and tackles, consumables etc. all complete as per the technical specifications, design, drawings and directions of the engineer-in-charge 

Original Title:
 ordinary soil  disposal up to a lead of 50 m with manpower

****** Machine Generated Title (Prediction) ******:
 excavation in ordinary soil disposal up to a lead of number m



Issue Body:
 earthwork in excavation upto 2.0m depth below existing ground level for foundations including stripping of top soil, lifting, dressing of sides and ramming of bottom, stacking of excavated soil, dewatering with all equipment and materials complete as per specification, drawings and instructions of the purchaser / engineer, including a lead upto 100 m within plant boundary for all types of soil 

Original Title:
 excavation u


Original Title:
 exceeding 0 m but not exceeding 10.0 m a lead up to 500 m within plant boundary for all type of soil

****** Machine Generated Title (Prediction) ******:
 excavation for depth exceeding number 0m but not exceeding number 0m but not exceeding number 0m a lead upto number m within plant all types of soil



Issue Body:
 earthwork in excavation for depth exceeding 0 m but not exceeding 10.0 m below ground level for foundations, trenches, pipe lines, storm water drains etc. including lifting, dressing of sides and ramming of bottom, shoring, strutting, dewatering, stacking of excavated soil with all equipment and materials, all complete as per specification, drawings and instruction of purchaser/ engineer, including a lead up to 500 m within plant boundary for all type of soil, weathered rock and hard rock. 

Original Title:
 exceeding 0 m but not exceeding 10.0 m a lead up to 500 m within plant boundary for all type of soil

****** Machine Generated Title (Prediction) **