## Install the library and download the pretrained models

In [1]:
print("Installing dependencies...")
%tensorflow_version 2.x
!pip install -q t5==0.6.4

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

!wget "https://www.dropbox.com/sh/kjoqdpj7e16dny9/AADdvjWVFckCgNQN-AqMKhiDa?dl=1" -O vocabulary.zip
!unzip vocabulary.zip
!rm vocabulary.zip
!wget "https://www.dropbox.com/sh/udp8trmj2needph/AABzPDjZPRRZwIhCCwFsTW66a?dl=1" -O program_synthesis.zip
!unzip program_synthesis.zip
!rm program_synthesis.zip

Installing dependencies...
[K     |████████████████████████████████| 163kB 34.5MB/s 
[?25hINFO:tensorflow:tokens_length=568 inputs_length=512 targets_length=114 noise_density=0.15 mean_noise_span_length=3.0 
--2021-05-17 14:58:18--  https://www.dropbox.com/sh/kjoqdpj7e16dny9/AADdvjWVFckCgNQN-AqMKhiDa?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.80.18, 2620:100:6019:18::a27d:412
Connecting to www.dropbox.com (www.dropbox.com)|162.125.80.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /sh/dl/kjoqdpj7e16dny9/AADdvjWVFckCgNQN-AqMKhiDa [following]
--2021-05-17 14:58:18--  https://www.dropbox.com/sh/dl/kjoqdpj7e16dny9/AADdvjWVFckCgNQN-AqMKhiDa
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucc3658ba09e7fe3addf95bfcbcc.dl.dropboxusercontent.com/zip_download_get/AyC7saMXNU6TABr5sX9JbRje20ejmFsqV5gw0Ffx5pf7yOZyyMm4E5duwcwjJ-x2QufLdzw6irucc3NKAkiAfNQ-EOK8RLIJYP4

## Set sentencepiece model

In [2]:
import t5.data
# .sentencepiece_vocabulary import n
# import t5.data.sentencepiece_vocabulary
vocab_model_path = 'code_spm_unigram_40M.model'
vocab = t5.data.SentencePieceVocabulary(vocab_model_path, extra_ids=100)

print("Vocab has a size of %d\n" % vocab.vocab_size)

Vocab has a size of 32100



In [3]:
import numpy as np
import pandas as pd
pd.options.display.max_colwidth = 150 #extend width

In [10]:
def read_from_gdrive(filename):
  from google.colab import drive
  drive.mount('drive')
  return pd.read_csv('drive/My Drive/rp/' + filename)

In [None]:
df = read_from_gdrive('04-19-2021-train.csv').set_index('id')
pdf = read_from_gdrive('04-19-2021-prefix.csv').set_index('0')

## Set the preprocessors and the task registry for the t5 model

In [4]:
def program_synt_dataset_fn(split, shuffle_files=False):
    del shuffle_files

    ds = tf.data.TextLineDataset(program_synt_path[split])
    ds = ds.map(
        functools.partial(tf.io.decode_csv, record_defaults=["", ""], field_delim="\t", use_quote_delim=False),
        num_parallel_calls=tf.data.experimental.AUTOTUNE
    )
    ds = ds.map(lambda *ex: dict(zip(["text", "program"], ex)))
    return ds

In [15]:
import t5.data.utils

In [5]:
def program_synt_preprocessor(ds):
    def normalize_text(text):
        return text

    def to_inputs_and_targets(ex):
        return {
            "inputs": tf.strings.join(["program synthesis: ", normalize_text(ex["text"])]),
            "targets": normalize_text(ex["program"])
        }

    return ds.map(to_inputs_and_targets, num_parallel_calls=tf.data.experimental.AUTOTUNE)


t5.data.TaskRegistry.remove('program_synt')
t5.data.TaskRegistry.add(
    "program_synt",
    dataset_fn=program_synt_dataset_fn,
    output_features={
        "inputs": t5.data.utils.Feature(vocabulary=vocab),
        "targets": t5.data.utils.Feature(vocabulary=vocab),
    },
    splits=["train", "validation"],
    text_preprocessor=[program_synt_preprocessor],
    postprocess_fn=t5.data.postprocessors.lower_text,
    metric_fns=[t5.evaluation.metrics.bleu, t5.evaluation.metrics.accuracy, t5.evaluation.metrics.rouge],
)

## Set t5 small model

In [6]:
MODEL_DIR = "small"
model_parallelism = 1
train_batch_size = 256

tf.io.gfile.makedirs(MODEL_DIR)
model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=None,
    tpu_topology=None,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    sequence_length={"inputs": 512, "targets": 512},
    mesh_shape="model:1,batch:1",
    mesh_devices=["GPU:0"],
    learning_rate_schedule=0.003,
    save_checkpoints_steps=5000,
    keep_checkpoint_max=None,
    iterations_per_loop=100,
)

## Program synthesis

### Give the question

In [7]:
question = "you are given an array of numbers a and a number b, compute the difference of elements in a and b" #@param {type:"raw"}


### Parsing and Tokenization

In [8]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
def englishTokenizer(sentence):
    result = []
    tokens = word_tokenize(sentence)
    for t in tokens:
        if( not len(t)>50):
            result.append(t)
    return ' '.join(result)

tokenized_question = englishTokenizer(question)
print("tokenized description: " + tokenized_question)

tokenized description: you are given an array of numbers a and a number b , compute the difference of elements in a and b


### Record the question with the prefix to a txt file

In [10]:
questions = [tokenized_question]

inputs_path = 'input.txt'
with tf.io.gfile.GFile(inputs_path, "w") as f:
  for c in questions:
    f.write("program synthesis: %s\n" % c)

predict_outputs_path = 'MtfModel-output.txt'


### Running the model with the best checkpoint to generating code for the given question

In [14]:
print("All devices: ", tf.config.list_logical_devices('GPU'))

All devices:  [LogicalDevice(name='/device:GPU:0', device_type='GPU')]


In [12]:
import t5.data.mixtures

In [16]:
model.batch_size = 8  
model.predict(
    input_file="input.txt",
    output_file=predict_outputs_path,
    checkpoint_steps=840000,
    beam_size=4,
    vocabulary=vocab, 
    # Select the most probable output token at each step.
    temperature=0,
)

INFO:tensorflow:Using config: {'_model_dir': 'small', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=100, num_shards=None, num_cores_per_replica=1, per_host_input_for_training=4, tpu_j

TypeError: ignored

In [None]:
!ls

### Program synthesis Result

In [15]:
prediction_file = "MtfModel-output.txt-6000"
print("\nPredictions using checkpoint 6000:\n" )
with tf.io.gfile.GFile(prediction_file) as f:
  for c, d in zip(questions, f):
    if c:
      print("Question: " + c + '\n')
      print("Code: " + d)



Predictions using checkpoint 6000:



NotFoundError: ignored