In [414]:
import time
import gc

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sqlalchemy import create_engine

In [415]:
pd.set_option('mode.chained_assignment', None)

### Defining Methods

In [416]:
def get_key(my_dict, val):
    for key, value in my_dict.items():
         if val == value:
             return key
 
    return "key doesn't exist"

### Reading Data

---

In [417]:
graph_path = '../../data/actual_graph_2021-05-06.csv'
graph = pd.read_csv(graph_path)
graph.rename({'id':'graph_vertex_id'}, axis=1, inplace=True)
graph.columns

Index(['graph_vertex_id', 'graph_vertex', 'graph_vertex_subclass'], dtype='object')

In [418]:
COMPETITIONS_PATH = "../../data/competitions_info_cleaned.csv" #./data/competitions_info_cleaned.csv
competitions_filled = pd.read_csv(COMPETITIONS_PATH)
competitions_filled.drop_duplicates(inplace=True)
competitions_filled.rename({'Description': 'description', 'Metric':'metric', 'DataType':'datatype', 'Subject':'subject', 'ProblemType':'problemtype'}
                        , axis=1, inplace=True)
competitions_filled.shape

(266, 9)

In [419]:
competitions_filled['ref'] = competitions_filled['ref'].str.replace("'", "")

In [420]:
COMPETITIONS_PATH = "../../data/competitions_2021-05-06.csv" #./data/competitions_info_cleaned.csv
competitions = pd.read_csv(COMPETITIONS_PATH)
competitions.drop_duplicates(inplace=True)
competitions.shape

(183, 10)

In [421]:
# competitions['ref'] = competitions['ref'].apply(lambda x: x.split(',')[0])
# competitions['ref'] = competitions['ref'].str.replace("'", "")

In [422]:
# competitions['exists_in_comp_filled'] = competitions.apply(lambda x: x['ref'] in competitions_filled['ref'].unique(), axis=1)
# competitions['exists_in_comp_filled'].value_counts()

In [423]:
# competitions_filled.merge(competitions[['id', 'ref']], on=['ref']).shape

In [424]:
competitions = competitions_filled.merge(competitions[['id', 'ref']], on=['ref'])
competitions.shape

(59, 10)

In [425]:
NOTEBOOKS_PATH = '../../data/markup_data_2021-05-06.csv'
notebooks = pd.read_csv(NOTEBOOKS_PATH)
notebooks.head(5)

Unnamed: 0,code_block_id,code_block,data_format,graph_vertex_id,errors,marks,kaggle_id,competition_id
0,570367,`# My forecasting COVID-19 confirmed cases and...,Table,45,No,2,8591010,4368
1,570367,`# My forecasting COVID-19 confirmed cases and...,Table,45,No,2,8591010,4368
2,570368,`# load training and testing data \nsubm = pd....,Table,45,No,5,8591010,4368
3,570369,`subm`,Table,41,No,5,8591010,4368
4,570367,`# My forecasting COVID-19 confirmed cases and...,Table,45,No,2,8591010,4368


In [426]:
notebooks = notebooks.merge(graph, on='graph_vertex_id', how='left')
notebooks.shape

(4748, 10)

In [427]:
nl2ml = notebooks.merge(competitions, left_on=['competition_id'], right_on=['id'], how='inner')
print(nl2ml.shape[0])
nl2ml.drop_duplicates(inplace=True, subset=['code_block_id', 'kaggle_id'])
print(nl2ml.shape[0])

2067
1989


### Vertices Preprocessing

In [428]:
nl2ml['graph_vertex'].apply(lambda x: x.split(';')[0].split('.')[0]).value_counts()

Data_Transform          595
EDA                     442
Model_Train             198
Visualization           145
Environment             142
Data_Extraction         118
Other                   103
Hyperparam_Tuning        86
Data_Export              76
Model_Evaluation         65
Model_Interpretation     16
Hypothesis                3
Name: graph_vertex, dtype: int64

In [429]:
nl2ml.columns

Index(['code_block_id', 'code_block', 'data_format', 'graph_vertex_id',
       'errors', 'marks', 'kaggle_id', 'competition_id', 'graph_vertex',
       'graph_vertex_subclass', 'ref', 'comp_name', 'comp_type', 'description',
       'metric', 'datatype', 'subject', 'problemtype', 'has_notebooks', 'id'],
      dtype='object')

In [430]:
nl2ml['vertex_l1'], nl2ml['vertex_l2'] = nl2ml['graph_vertex'].apply(lambda x: x.split(';')[0].split('.')[0]), nl2ml['graph_vertex_subclass']#.apply(lambda x: x.split(';')[0].split('.')[1])

### Missing Values

In [431]:
nl2ml.replace('-', -1, inplace=True)
print(nl2ml.isna().sum())
nl2ml.fillna(-1, inplace=True)
print(nl2ml.isna().sum())

code_block_id               0
code_block                  0
data_format                 0
graph_vertex_id             0
errors                      0
marks                       0
kaggle_id                   0
competition_id              0
graph_vertex                0
graph_vertex_subclass       0
ref                         0
comp_name                   0
comp_type                   0
description                 0
metric                      0
datatype                    0
subject                   862
problemtype              1368
has_notebooks               0
id                          0
vertex_l1                   0
vertex_l2                   0
dtype: int64
code_block_id            0
code_block               0
data_format              0
graph_vertex_id          0
errors                   0
marks                    0
kaggle_id                0
competition_id           0
graph_vertex             0
graph_vertex_subclass    0
ref                      0
comp_name                0
com

In [432]:
TASK_FEATURES = ['comp_name', 'comp_type', 'description',
                'metric', 'datatype', 'subject', 'problemtype']
# TASK_FEATURES = ['ProblemType',
#                 'number of columns (for tabular)', 'number of entries',
#                 'LabelType', 'Number of classes', 'Loss Function/Metrics',
#                 'Target Column(s) Name']

In [433]:
TARGET_COLUMN = 'vertex_l2'

### Grouping chunks by notebooks

In [478]:
notebook_id_col = 'kaggle_id'
competition_id_col = 'competition_id'
def group_by_notebooks(data:pd.DataFrame, vertex_col:str='vertex_l1') -> pd.DataFrame:
    notebook_cols = [[notebook_id_col, vertex_col, competition_id_col] + TASK_FEATURES]
    df = pd.DataFrame(columns=notebook_cols)
    for i, notebook_id in enumerate(data[notebook_id_col].unique()):
        notebook = data[data[notebook_id_col] == notebook_id].reset_index(drop=True)
        vertices_seq = " ".join(notebook[vertex_col])
        task_features = notebook[TASK_FEATURES].loc[0]
        competition_id = notebook[competition_id_col].unique()[0]
        row = [notebook_id, vertices_seq, competition_id] + task_features.tolist()
        df.loc[i] = row
        print('notebook #{} done'.format(notebook_id))
    return df

### Taking Train Features

In [479]:
# nl2ml = group_by_notebooks(nl2ml, TARGET_COLUMN)
# X, y = train[TASK_FEATURES], train[TARGET_COLUMN]
prepared_data = group_by_notebooks(nl2ml, TARGET_COLUMN)
prepared_data.shape

notebook #8591010 done
notebook #8592598 done
notebook #8596735 done
notebook #8606894 done
notebook #8609050 done
notebook #8611767 done
notebook #8630977 done
notebook #8634286 done
notebook #8640194 done
notebook #8660923 done
notebook #8667455 done
notebook #8668446 done
notebook #8678201 done
notebook #8687334 done
notebook #8689318 done
notebook #8699382 done
notebook #8705213 done
notebook #8706858 done
notebook #8708118 done
notebook #8710137 done
notebook #8710362 done
notebook #8604602 done
notebook #8617043 done
notebook #8620454 done
notebook #8625834 done
notebook #8628909 done
notebook #8658083 done
notebook #8663175 done
notebook #8671133 done
notebook #8679319 done
notebook #8682800 done
notebook #8687249 done
notebook #8693806 done
notebook #8701862 done
notebook #8702904 done
notebook #8706295 done
notebook #8711165 done
notebook #9326374 done
notebook #9349764 done
notebook #9463384 done
notebook #138832 done
notebook #2637869 done
notebook #5466844 done
notebook #57

(80, 10)

### Converting Dtypes

In [481]:
cat_encodings = {}
for i, col in enumerate(prepared_data):
    if col[0] != TARGET_COLUMN:
        print(col)
        try:
            prepared_data[col] =  prepared_data[col].astype('float32')
        except:
            prepared_data[col] = pd.Categorical(prepared_data[col])
            cat_encodings.update({i:dict(enumerate(prepared_data[col].cat.categories))})
            prepared_data[col] = prepared_data[col].cat.codes

('kaggle_id',)
('competition_id',)
('comp_name',)
('comp_type',)
('description',)
('metric',)
('datatype',)
('subject',)
('problemtype',)


In [525]:
competitions = prepared_data[competition_id_col].iloc[:, 0].unique()
test_size = 0.25
n_test_competitions = round(test_size * len(competitions))
test_competitions, train_competitions = competitions[:n_test_competitions], competitions[n_test_competitions:]
train = prepared_data[prepared_data['competition_id'].iloc[:, 0].isin(train_competitions)]
test = prepared_data[prepared_data['competition_id'].iloc[:, 0].isin(test_competitions)]
X_train, y_train = train[TASK_FEATURES], train[TARGET_COLUMN]
X_test, y_test = test[TASK_FEATURES], test[TARGET_COLUMN]
X_train.shape, X_test.shape

((40, 7), (40, 7))

In [438]:
# X_train, X_test, y_train, y_test = train_test_split(prepared_data[TASK_FEATURES], prepared_data[TARGET_COLUMN]
#                                                     , test_size=0.25, shuffle=True, random_state=123)
# X_train.shape, X_test.shape

((60, 7), (20, 7))

### Target Preprocessing: Encoding Vertices

In [527]:
lang = {vertice:i+2 for i, vertice in enumerate(nl2ml[TARGET_COLUMN].unique())} #TODO: save the dict as a local file
lang.update({'<start>':1, '<end>':max(lang.values())+1})
def encode_vertices(vertices_seq, lang:dict=lang):
    try:
        encoded = np.append(np.flip(np.array([lang[vertex] for vertex in vertices_seq[0].split(' ')] + [lang['<end>']])), lang['<start>'])
        # encoded = np.append(lang['<start>'], np.array([lang[vertex] for vertex in vertices_seq[0].split(' ')] + [lang['<end>']]))
    except:
        print(vertices_seq[0].split(' '))
        raise Exception("Can't encode vertices")
    return encoded

In [528]:
# y_train.apply(encode_vertices, axis=1)

In [529]:
# X[TARGET_COLUMN] = y.apply(encode_vertices, axis=1)
# X.to_csv('../data/nl2ml_train_example.csv', index=False)

### Target Preprocessing: Padding Sequences

In [530]:
max_length_targ, max_length_feat = prepared_data[TARGET_COLUMN].squeeze().str.split(' ').str.len().max() + 2, X_train.values.shape[1]

In [531]:
Y_train = tf.keras.preprocessing.sequence.pad_sequences(y_train.apply(encode_vertices, axis=1), maxlen=max_length_targ)
Y_test = tf.keras.preprocessing.sequence.pad_sequences(y_test.apply(encode_vertices, axis=1), maxlen=max_length_targ)

### Defining Constants

In [532]:
# BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 1
LR = 0.001
steps_per_epoch = len(X_train)//BATCH_SIZE
embedding_dim = 512
gru_units = 1024
# vocab_inp_size = len(inp_lang.word_index) + 1
# vocab_tar_size = len(targ_lang.word_index) + 1

### Creating tf.Dataset

In [533]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train.values, Y_train))
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)

### Building the Model

In [534]:
# https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/
# https://www.tensorflow.org/tutorials/text/text_generation
# https://www.tensorflow.org/guide/keras/rnn

In [535]:
##TODO: try
# (did not worked) Less epochs (25 -> 5)
# (did not worked) Add dropout
# (did not worked) Try with both flipped sequences and regular
# (did not worked) Activation function
# (did not worked) Split over competitions

In [536]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # self.hidden_embedding = tf.keras.layers.Embedding(vocab_size, 1)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.dropout = tf.keras.layers.Dropout(0.2)
    self.fc = tf.keras.layers.Dense(vocab_size, activation='sigmoid')

    # used for attention
    # self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden):#, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    # context_vector, attention_weights = self.attention(hidden, enc_output)
    attention_weights = tf.ones(x.shape)
    # context_vector = tf.ones(x.shape)
    # print("X Vector has {} type and {} shape".format(type(x), x.shape))
    # print("Context Vector has {} type and {} shape".format(type(context_vector), context_vector.shape))
    # print("Attention Vector has {} type and {} shape".format(type(context_vector), context_vector.shape))
    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)
    # x = tf.squeeze(self.hidden_embedding(x), axis=-1)
    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    # x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x, initial_state=hidden)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))
    
    output = self.dropout(output)
    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

In [537]:
decoder = Decoder(len(lang)+2, embedding_dim, gru_units, BATCH_SIZE)
sample_hidden = tf.zeros((BATCH_SIZE, gru_units))
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1))
                                      , sample_hidden
                                    #   , sample_output
                                    )
print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))
decoder.summary()

Decoder output shape: (batch_size, vocab size) (1, 58)
Model: "decoder_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      multiple                  29696     
_________________________________________________________________
gru_8 (GRU)                  multiple                  4724736   
_________________________________________________________________
dropout_8 (Dropout)          multiple                  0         
_________________________________________________________________
dense_8 (Dense)              multiple                  59450     
Total params: 4,813,882
Trainable params: 4,813,882
Non-trainable params: 0
_________________________________________________________________


In [538]:
optimizer = tf.keras.optimizers.Adam(LR)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
                  from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [539]:
# def compute_bleu(reference_corpus, translation_corpus, max_order=4,
#                  smooth=False):
#   """Computes BLEU score of translated segments against one or more references.
#   Args:
#     reference_corpus: list of lists of references for each translation. Each
#         reference should be tokenized into a list of tokens.
#     translation_corpus: list of translations to score. Each translation
#         should be tokenized into a list of tokens.
#     max_order: Maximum n-gram order to use when computing BLEU score.
#     smooth: Whether or not to apply Lin et al. 2004 smoothing.
#   Returns:
#     3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
#     precisions and brevity penalty.
#   """
#   matches_by_order = [0] * max_order
#   possible_matches_by_order = [0] * max_order
#   reference_length = 0
#   translation_length = 0
#   for (references, translation) in zip(reference_corpus,
#                                        translation_corpus):
#     reference_length += min(len(r) for r in references)
#     translation_length += len(translation)

#     merged_ref_ngram_counts = collections.Counter()
#     for reference in references:
#       merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
#     translation_ngram_counts = _get_ngrams(translation, max_order)
#     overlap = translation_ngram_counts & merged_ref_ngram_counts
#     for ngram in overlap:
#       matches_by_order[len(ngram)-1] += overlap[ngram]
#     for order in range(1, max_order+1):
#       possible_matches = len(translation) - order + 1
#       if possible_matches > 0:
#         possible_matches_by_order[order-1] += possible_matches

#   precisions = [0] * max_order
#   for i in range(0, max_order):
#     if smooth:
#       precisions[i] = ((matches_by_order[i] + 1.) /
#                        (possible_matches_by_order[i] + 1.))
#     else:
#       if possible_matches_by_order[i] > 0:
#         precisions[i] = (float(matches_by_order[i]) /
#                          possible_matches_by_order[i])
#       else:
#         precisions[i] = 0.0

#   if min(precisions) > 0:
#     p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
#     geo_mean = math.exp(p_log_sum)
#   else:
#     geo_mean = 0

#   ratio = float(translation_length) / reference_length

#   if ratio > 1.0:
#     bp = 1.
#   else:
#     bp = math.exp(1 - 1. / ratio)

#   bleu = geo_mean * bp

#   return (bleu, precisions, bp, ratio, translation_length, reference_length)

In [540]:
# class PerplexityMetric(tf.keras.metrics.Metric):
#     ##TODO: calculate perplexity for one example
#     # average for batch
#     # average for epoch
#     """
#     USAGE NOTICE: this metric accepts only logits for now (i.e. expect the same behaviour as from tf.keras.losses.SparseCategoricalCrossentropy with the a provided argument "from_logits=True", 
# 		here the same loss is used with "from_logits=True" enforced so you need to provide it in such a format)
#     METRIC DESCRIPTION:
#     Popular metric for evaluating language modelling architectures.
#     More info: http://cs224d.stanford.edu/lecture_notes/LectureNotes4.pdf.
#     DISCLAIMER: Original function created by Kirill Mavreshko in https://github.com/kpot/keras-transformer/blob/b9d4e76c535c0c62cadc73e37416e4dc18b635ca/example/run_gpt.py#L106. 
#     My "contribution": I converted Kirill method's logic (and added a padding masking to to it) into this new Tensorflow 2.0 way of doing things via a stateful "Metric" object. This required making the metric a fully-fledged object by subclassing      the Metric class. 
#     """
#     def __init__(self, name='perplexity', **kwargs):
#       super(PerplexityMetric, self).__init__(name=name, **kwargs)
#       self.cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')
#       # self.cross_entropy = tf.keras.losses.CategoricalCrossentropy(from_logits=False, reduction='none')
#       self.perplexity = self.add_weight(name='tp', initializer='ones') #tf.math.multiply(1, 1)
# 		# Consider uncommenting the decorator for a performance boost (?)  		
#     # @tf.function
#     def _calculate_perplexity(self, real, pred):
# 			# The next 4 lines zero-out the padding from loss calculations, 
# 			# this follows the logic from: https://www.tensorflow.org/beta/tutorials/text/transformer#loss_and_metrics 			
#       mask = tf.math.logical_not(tf.math.equal(real, 0))
#       loss_ = self.cross_entropy(real, pred)
#       mask = tf.cast(mask, dtype=loss_.dtype)
#       loss_ *= mask
# 			# Calculating the perplexity steps:
#       step1 = K.mean(loss_, axis=0)#axis=-1)
#       step2 = K.exp(step1)
#       perplexity = K.mean(step2)
#       return perplexity

#     def update_state(self, y_true, y_pred, sample_weight=None):
#       # TODO:FIXME: handle sample_weight !
#       if sample_weight is not None:
#           print("WARNING! Provided 'sample_weight' argument to the perplexity metric. Currently this is not handled and won't do anything differently..")
#       cur_perplexity = self._calculate_perplexity(y_true, y_pred)
# 			# Remember self.perplexity is a tensor (tf.Variable), so using simply "self.perplexity = perplexity" will result in error because of mixing EagerTensor and Graph operations 
#       # self.perplexity.assign_add(cur_perplexity)
#       # print('cur_perplexity: {}'.format(cur_perplexity))
#       # print('self.perplexity: {}'.format(self.perplexity))
#       # print('mul : {}'.format(tf.math.multiply(self.perplexity, cur_perplexity)))
#       self.perplexity.assign(tf.math.multiply(self.perplexity, cur_perplexity))
#       # self.perplexity = tf.math.multiply(self.perplexity, cur_perplexity) ##TODO
#       # print('current perplexity is: {}'.format(self.perplexity))

#     def result(self):
#       return self.perplexity

#     def reset_states(self):
#       # The state of the metric will be reset at the start of each epoch.
#       self.perplexity.assign(1.0) # = tf.math.multiply(1, 1)

### Model Training or Loading Pre-Trained

In [541]:
@tf.function
def train_step(inp, targ):#, enc_hidden):
  loss = 0
  batch_perplexity = 1
  with tf.GradientTape() as tape:
    # enc_output, enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = tf.zeros((BATCH_SIZE, gru_units)) #enc_hidden

    dec_input = tf.expand_dims([1] * BATCH_SIZE, 1)
    
    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]): # for each vertex (token) from solution (sequence)
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden)#, enc_output)
      loss += loss_function(targ[:, t], predictions)
      batch_perplexity *= tf.exp(loss)      

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))
  variables = decoder.trainable_variables # + encoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))
  del inp, targ, gradients, variables
  gc.collect()
  return batch_loss

In [542]:
checkpoint_dir = './checkpoints/'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer
                                # , metrics=perplexity_metric
                                #  , encoder=encoder
                                 , decoder=decoder)

In [543]:
EPOCHS = 25
for epoch in range(EPOCHS):
    gc.collect()
    start = time.time()
    # enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    total_batch_perplexity = 0
    for (batch, (feat, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
        # print ('Features: {}, Target: {}'.format(feat, targ))
        batch_loss = train_step(feat, targ)#, enc_hidden)
        batch_perplexity = tf.exp(batch_loss)

        total_loss += batch_loss
        total_batch_perplexity += batch_perplexity #perplexity_metric.result()

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                            batch,
                                                            batch_loss.numpy()), end=' ')
            print('Perplexity {:.4f}'.format(batch_perplexity))
if (epoch + 1) % 2 == 0:
    print('saving')
    checkpoint.write(file_prefix=checkpoint_prefix)
    print('saved')

print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.3506 Perplexity 1.4199
Epoch 2 Batch 0 Loss 0.2247 Perplexity 1.2519
Epoch 3 Batch 0 Loss 0.2125 Perplexity 1.2367
Epoch 4 Batch 0 Loss 0.1951 Perplexity 1.2155
Epoch 5 Batch 0 Loss 0.1957 Perplexity 1.2161
Epoch 6 Batch 0 Loss 0.2009 Perplexity 1.2225
Epoch 7 Batch 0 Loss 0.1599 Perplexity 1.1734
Epoch 8 Batch 0 Loss 0.1356 Perplexity 1.1452
Epoch 9 Batch 0 Loss 0.1103 Perplexity 1.1166
Epoch 10 Batch 0 Loss 0.1390 Perplexity 1.1492
Epoch 11 Batch 0 Loss 0.0657 Perplexity 1.0679
Epoch 12 Batch 0 Loss 0.0643 Perplexity 1.0664
Epoch 13 Batch 0 Loss 0.0445 Perplexity 1.0455
Epoch 14 Batch 0 Loss 0.0549 Perplexity 1.0565
Epoch 15 Batch 0 Loss 0.0476 Perplexity 1.0488
Epoch 16 Batch 0 Loss 0.0430 Perplexity 1.0440
Epoch 17 Batch 0 Loss 0.0500 Perplexity 1.0513
Epoch 18 Batch 0 Loss 0.0413 Perplexity 1.0422
Epoch 19 Batch 0 Loss 0.0404 Perplexity 1.0412
Epoch 20 Batch 0 Loss 0.0411 Perplexity 1.0420
Epoch 21 Batch 0 Loss 0.0399 Perplexity 1.0407
Epoch 22 Batch 0 Loss 

In [544]:
# checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

### Sequence Generation

In [545]:
def preprocess_task(task_vector):
    # either convert to float32 or encode to categoricals
    for i, el in enumerate(task_vector):
        try:
            task_vector[i] = float(task_vector[i])
        except:
            task_vector[i] = get_key(cat_encodings[i], task_vector[i])
    return task_vector.astype('float32')

In [546]:
def generate_solution(task_vector, save_outputs:bool=False):
  attention_plot = np.zeros((max_length_targ, max_length_feat))
  task_vector = preprocess_task(task_vector)
  inputs = tf.convert_to_tensor(task_vector)
  result = ''

  dec_hidden = tf.zeros((BATCH_SIZE, gru_units))
  dec_input = tf.expand_dims([1], 0)
  loss = 0
  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input
                                                         , dec_hidden
                                                         )
    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()
    predicted_id = tf.argmax(predictions[0]).numpy()
    predicted_vertice = get_key(lang, predicted_id)
    if (predicted_vertice != ' ')&(predicted_vertice != ''):
      # loss += loss_function(true_vector, predictions)
      # print(loss)
      result = predicted_vertice + ' ' + result #targ_lang.index_word[predicted_id] + ' '
    elif (predicted_vertice == '<start>')&(predicted_vertice == '<end>'):
      print('Evaluation: found start/end, ending')
      return result, task_vector, attention_plot
    
    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)
  if save_outputs:
    OUTPUT_FILE = './task2seq/outputs/output.py'
    with open(OUTPUT_FILE, 'w') as f:
        last_vertice = ''
        for vertice in result.split(' '):
            if vertice:
                if (vertice!='<start>')&(vertice!='<end>')&(vertice!=last_vertice):
                    line = '#@ {} \n\n'.format(vertice)
                    f.write(line)
                    last_vertice = vertice
  return result, task_vector, attention_plot

In [547]:
i = 0
example_task_vector = X_test.reset_index().loc[i]
example_true_vector = Y_test[i]
example_task_vector, example_true_vector

(index          0
 comp_name      0
 comp_type      1
 description    0
 metric         1
 datatype       0
 subject        1
 problemtype    0
 Name: 0, dtype: int64,
 array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0, 56,  4,  3, 16, 15, 13, 15, 15, 11,
        12, 15, 13, 15, 15, 11, 14, 13,  7, 12, 11,  4,  9, 10,  9,  4,  8,
         6,  6,  4,  4,  4,  4,  4,  4,  3,  3,  3,  2,  2,  1]))

In [548]:
result, task_vector, attention_plot = generate_solution(example_task_vector)
print('The number of vertices is: {} \n'.format(len(result.split(' '))))
print(result)

The number of vertices is: 117 

import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> <start> <start> import_modules <start> import_modules import_modules show_table show_table show_table show_table_attributes distribution show_table drop_column distribution distribution distribution distribution distribution count_unique_values count_unique_values drop_column show_shape count_unique_values show_shape set_options distribution drop_column show_table_attributes feature_engineering prepare_x_and_y distribution prepare_x_and_y import_modules set_options define_search_space define_search_space define_search_space def

### Model Evaluation

In [549]:
def generate_solution_with_evaluation(task_vector, true_vector, save_outputs:bool=False):
  attention_plot = np.zeros((max_length_targ, max_length_feat))
  task_vector = preprocess_task(task_vector)
  inputs = tf.convert_to_tensor(task_vector)
  result = ''

  dec_hidden = tf.zeros((BATCH_SIZE, gru_units))
  dec_input = tf.expand_dims([1], 0)
  loss = 0
  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input
                                                         , dec_hidden
                                                         )
    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()
    predicted_id = tf.argmax(predictions[0]).numpy()
    predicted_vertice = get_key(lang, predicted_id)
    if (predicted_vertice != ' ')&(predicted_vertice != ''):
        loss += loss_function(true_vector[t], predictions)
        result = predicted_vertice + ' ' + result #targ_lang.index_word[predicted_id] + ' '
    elif (predicted_vertice == '<start>')&(predicted_vertice == '<end>'):
        print('Evaluation: found start/end, ending')
        return result, task_vector, attention_plot
    
    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)
  if save_outputs:
    OUTPUT_FILE = './task2seq/outputs/output.py'
    with open(OUTPUT_FILE, 'w') as f:
        last_vertice = ''
        for vertice in result.split(' '):
            if vertice:
                if (vertice!='<start>')&(vertice!='<end>')&(vertice!=last_vertice):
                    line = '#@ {} \n\n'.format(vertice)
                    f.write(line)
                    last_ve.rtice = vertice
  return result, loss

In [550]:
i = 0
example_task_vector = X_test.reset_index().loc[i]
example_true_vector = Y_test[i]
example_task_vector, example_true_vector

(index          0
 comp_name      0
 comp_type      1
 description    0
 metric         1
 datatype       0
 subject        1
 problemtype    0
 Name: 0, dtype: int64,
 array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0, 56,  4,  3, 16, 15, 13, 15, 15, 11,
        12, 15, 13, 15, 15, 11, 14, 13,  7, 12, 11,  4,  9, 10,  9,  4,  8,
         6,  6,  4,  4,  4,  4,  4,  4,  3,  3,  3,  2,  2,  1]))

In [551]:
result, loss = generate_solution_with_evaluation(example_task_vector, example_true_vector)
print('The number of vertices is: {} \n'.format(len(result.split(' '))))
print(result, '\n')
print('Cross-Entropy:', loss.numpy())

The number of vertices is: 117 

import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> import_modules <start> <start> <start> <start> import_modules <start> import_modules import_modules show_table show_table show_table show_table_attributes distribution show_table drop_column distribution distribution distribution distribution distribution count_unique_values count_unique_values drop_column show_shape count_unique_values show_shape set_options distribution drop_column show_table_attributes feature_engineering prepare_x_and_y distribution prepare_x_and_y import_modules set_options define_search_space define_search_space define_search_space def

In [552]:
def predict_on_test(X_test, Y_test):
    y_pred = []
    losses = []
    print('predicting..', end=' ')
    for i, task_vector in X_test.reset_index().iterrows():
        print('{:.2%}'.format(i/X_test.shape[0]), end=' ')
        true_vector = Y_test[i]
        result, loss = generate_solution_with_evaluation(task_vector, true_vector)
        # print(loss.numpy())
        y_pred.append(result[:-1])
        losses.append(loss)
    print()
    y_pred = pd.DataFrame(y_pred, columns=[TARGET_COLUMN])
    return y_pred, losses

In [558]:
## Predict on Train
y_pred, losses = predict_on_test(X_train[TASK_FEATURES], Y_train)
print('Cross-Entropy: {}'.format(np.mean(losses)))
# print('Perplexity: {}'.format(np.mean(np.exp(losses))))
print('Unique answers: {}'.format(y_pred[TARGET_COLUMN].nunique()))

predicting.. 0.00% 2.50% 5.00% 7.50% 10.00% 12.50% 15.00% 17.50% 20.00% 22.50% 25.00% 27.50% 30.00% 32.50% 35.00% 37.50% 40.00% 42.50% 45.00% 47.50% 50.00% 52.50% 55.00% 57.50% 60.00% 62.50% 65.00% 67.50% 70.00% 72.50% 75.00% 77.50% 80.00% 82.50% 85.00% 87.50% 90.00% 92.50% 95.00% 97.50% 
Cross-Entropy: 182.16726684570312
Unique answers: 1


In [554]:
## Predict on Test
y_pred, losses = predict_on_test(X_test[TASK_FEATURES], Y_test)
print('Cross-Entropy: {}'.format(np.mean(losses)))
# print('Perplexity: {}'.format(np.mean(np.exp(losses))))
print('Unique answers: {}'.format(y_pred[TARGET_COLUMN].nunique()))

predicting.. 0.00% 2.50% 5.00% 7.50% 10.00% 12.50% 15.00% 17.50% 20.00% 22.50% 25.00% 27.50% 30.00% 32.50% 35.00% 37.50% 40.00% 42.50% 45.00% 47.50% 50.00% 52.50% 55.00% 57.50% 60.00% 62.50% 65.00% 67.50% 70.00% 72.50% 75.00% 77.50% 80.00% 82.50% 85.00% 87.50% 90.00% 92.50% 95.00% 97.50% 
Cross-Entropy: 251.97317504882812
Unique answers: 1


In [555]:
##TODO: BLEU
##TODO: To py and argparse

### To DAGsHub

In [None]:
##TODO: Export to DAGsHub
# experiment_params = {}
# experiment_results = {}

### Export Sequences

In [556]:
# ## save vertices to file
# OUTPUT_FILE = './task2seq/outputs/example_output.py'
# with open(OUTPUT_FILE, 'w') as f:
#     last_vertice = ''
#     for vertice in result.split(' '):
#         if vertice:
#             if (vertice!='<start>')&(vertice!='<end>')&(vertice!=last_vertice):
#                 line = '#@ {} \n\n'.format(vertice)
#                 f.write(line)
#                 last_vertice = vertice