In [1]:
!pip install t5

Collecting t5
  Using cached t5-0.9.3-py3-none-any.whl (153 kB)
Collecting tfds-nightly
  Downloading tfds_nightly-4.4.0.dev202110270108-py3-none-any.whl (4.0 MB)
     |████████████████████████████████| 4.0 MB 7.4 MB/s            
[?25hCollecting torch
  Using cached torch-1.10.0-cp37-none-macosx_10_9_x86_64.whl (147.1 MB)
Collecting scikit-learn
  Using cached scikit_learn-1.0.1-cp37-cp37m-macosx_10_13_x86_64.whl (7.9 MB)
Collecting pandas
  Using cached pandas-1.3.4-cp37-cp37m-macosx_10_9_x86_64.whl (11.3 MB)
Collecting babel
  Using cached Babel-2.9.1-py2.py3-none-any.whl (8.8 MB)
Collecting nltk
  Using cached nltk-3.6.5-py3-none-any.whl (1.5 MB)
Collecting mesh-tensorflow[transformer]>=0.1.13
  Using cached mesh_tensorflow-0.1.19-py3-none-any.whl (366 kB)
Collecting seqio
  Using cached seqio-0.0.7-py3-none-any.whl (286 kB)
Collecting numpy
  Using cached numpy-1.21.3-cp37-cp37m-macosx_10_9_x86_64.whl (16.9 MB)
Collecting transformers>=2.7.0
  Using cached t

In [1]:
import functools
from functools import partial
import tensorflow._api.v2.compat.v1 as tf
import pandas as pd
import os
import t5
import t5.models
from t5.models import MtfModel
import seqio

tf.disable_v2_behavior()

ON_TPU = False      #Change in case if TPU is present
tf.test.gpu_device_name()

Instructions for updating:
non-resource variables are not supported in the long term


2021-10-28 09:53:18.426495: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


''

In [2]:
PATH_TO_DATA = "data/"
train_df = pd.read_csv(PATH_TO_DATA+"xnli.test.tsv", sep="\t")
test_df = pd.read_csv(PATH_TO_DATA+"xnli.dev.tsv",sep="\t")
print(train_df.shape)
print(test_df.shape)

(75150, 19)
(37350, 19)


In [3]:
def create_data(old,new):
    df = pd.read_csv(old,sep='\t')
    df = df[['language','sentence1','sentence2']]
    sent1 = df[['language','sentence1']].rename(columns={"sentence1":"input"})
    sent2 = df[['language','sentence2']].rename(columns={"sentence2":"input"})
    final = pd.concat([sent1,sent2],ignore_index=True)
    final['input'] = 'input: '+final.input
    final = final.drop_duplicates()
    final = final.sample(frac=1)
    final.to_csv(new,index=False,header=False)
    print(f'Shape: {final.shape}')

In [4]:
MAIN_DIR = ""         #Add path to a gcs bucket if using TPU

xnli_csv_path = {
    "train":"train.csv",
    "test": "test.csv"
}

def xnli_dataset_fn(split, shuffle_files=False):
  if MAIN_DIR=="":
    ds = tf.data.TextLineDataset(xnli_csv_path[split])
  else:
    ds = tf.data.TextLineDataset(MAIN_DIR+xnli_csv_path[split])
  ds = ds.map(
      functools.partial(tf.io.decode_csv, record_defaults=["", ""],
                        field_delim=","),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
  ds = ds.map(lambda *ex: dict(zip(["language", "input"], ex)))
  return ds

def lang_preprocessor(data):
        return data.map(lambda ex:{"inputs":ex["input"],"targets": ex["language"]}, num_parallel_calls=tf.data.experimental.AUTOTUNE)

DEFAULT_VOCAB = t5.data.SentencePieceVocabulary("gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model")

DEFAULT_OUTPUT_FEATURES = {
    "inputs":
        seqio.Feature(
            vocabulary=DEFAULT_VOCAB, add_eos=True,required=False),
    "targets":
        seqio.Feature(
            vocabulary=DEFAULT_VOCAB, add_eos=True)
}

task = "lang_classify"

seqio.TaskRegistry.remove(task)
seqio.TaskRegistry.add(
    task,
    source=seqio.FunctionDataSource(
        dataset_fn=xnli_dataset_fn,
        splits=["train", "test"],
        ),
    preprocessors=[
        lang_preprocessor,
        seqio.preprocessors.tokenize_and_append_eos,
    ],
    postprocess_fn=t5.data.postprocessors.lower_text,
    metric_fns=[t5.evaluation.metrics.accuracy],
    output_features=DEFAULT_OUTPUT_FEATURES,
  )

<seqio.dataset_providers.Task at 0x7f8a926a7250>

In [5]:
create_data(PATH_TO_DATA+'xnli.test.tsv',"train.csv")
create_data(PATH_TO_DATA+'xnli.dev.tsv',"test.csv")
print(pd.read_csv("train.csv").shape)


Shape: (100161, 2)
Shape: (49797, 2)
(100160, 2)


In [6]:
PRE_TRAINED_MODEL = "gs://t5-data/pretrained_models/mt5/small"
LR = 0.003
BATCH_SIZE = 32

TPU_TOPOLOGY = "v2-8"
TPU_ADDRESS = None

if ON_TPU:
    BATCH_SIZE = 128
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
        TPU_ADDRESS = tpu.get_master()
        print('Running on TPU:', TPU_ADDRESS)
    except ValueError as e:
        raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

    tf.enable_eager_execution()
    tf.config.experimental_connect_to_host(TPU_ADDRESS)

n = pd.read_csv("train.csv").shape[0]
EPOCH = 5
ft_steps = int(n/BATCH_SIZE)*EPOCH

if MAIN_DIR=="":
  MODEL_DIR = "models/"
else:
  MODEL_DIR = MAIN_DIR+"models/"

model = MtfModel(MODEL_DIR,
                   tpu=TPU_ADDRESS,
                 tpu_topology=TPU_TOPOLOGY,
                   model_parallelism=1,
                   batch_size=BATCH_SIZE,
                   sequence_length={"inputs": 64, "targets": 15},
                   learning_rate_schedule=LR,
                   save_checkpoints_steps=5000,
                    keep_checkpoint_max= 16 if ON_TPU else None,
                   iterations_per_loop=300 if ON_TPU else 100)

model.finetune(
      mixture_or_task_name=task,
      pretrained_model_dir=PRE_TRAINED_MODEL,
      finetune_steps=ft_steps,
      split="train")



ERROR:root:Path not found: gs://t5-data/pretrained_models/mt5/small/operative_config.gin
INFO:tensorflow:Using config: {'_model_dir': 'models/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_p

INFO:tensorflow:Using config: {'_model_dir': 'models/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=100, num_shards=None, num_cores_per_replica=1, per_host_input_for_training=4, tpu

KeyboardInterrupt: 

In [9]:
model.batch_size = BATCH_SIZE*4
SUMM_DIR = "output/" if MAIN_DIR=="" else MAIN_DIR+"output/"
model.eval(
    "lang_classify",
    summary_dir=SUMM_DIR,
    checkpoint_steps=-1,
    split="test"
)

INFO:tensorflow:Using config: {'_model_dir': 'models/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=100, num_shards=None, num_cores_per_replica=1, per_host_input_for_training=4, tpu

INFO:tensorflow:Using config: {'_model_dir': 'models/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=100, num_shards=None, num_cores_per_replica=1, per_host_input_for_training=4, tpu

KeyboardInterrupt: 

In [11]:
from sklearn.metrics import precision_recall_fscore_support, classification_report

def get_prediction(output_dir,task_name):
    def _prediction_file_to_ckpt(path):
        return int(path.split("_")[-2])
    prediction_files = tf.io.gfile.glob(os.path.join(output_dir,"%s_*_predictions"%task_name))
    if len(prediction_files) == 0: return None
    return sorted(prediction_files, key=_prediction_file_to_ckpt)[-1]

def evaluation(output_dir,task_name):
    pred_fn = get_prediction(output_dir,task_name)
    if not pred_fn: return None,None,None
    with tf.io.gfile.GFile(pred_fn) as p:
        preds = [line.strip() for line in p]

    with tf.io.gfile.GFile(os.path.join(output_dir,"%s_targets" % task_name)) as t:
        targets = [line.strip() for line in t]

    with tf.io.gfile.GFile(os.path.join(output_dir,"%s_inputs" % task_name)) as i:
        inputs = [eval(line).decode('utf-8') for line in i]

    p,r,f1,_ = precision_recall_fscore_support(targets, preds,average='micro')
    print(f'precison: {p} \nrecall: {r} \nf1: {f1}\n')
    print()
    print(classification_report(targets,preds))

evaluation(SUMM_DIR, "lang_classify")


precison: 0.9972889933128501 
recall: 0.9972889933128501 
f1: 0.9972889933128501


              precision    recall  f1-score   support

          ar       1.00      1.00      1.00      3320
          bg       1.00      1.00      1.00      3320
          de       1.00      1.00      1.00      3320
          el       1.00      1.00      1.00      3320
          en       0.99      1.00      0.99      3320
          es       1.00      1.00      1.00      3320
          fr       1.00      1.00      1.00      3320
          hi       1.00      0.98      0.99      3320
          ru       1.00      1.00      1.00      3320
          sw       1.00      1.00      1.00      3319
          th       1.00      1.00      1.00      3320
          tr       1.00      1.00      1.00      3320
          ur       0.98      1.00      0.99      3319
          vi       1.00      1.00      1.00      3320
          zh       1.00      1.00      1.00      3319

    accuracy                           1.00     497

In [None]:
if ON_TPU:
    %reload_ext tensorboard
%load_ext tensorboard
%tensorboard --logdir="$MODEL_DIR" --port=0

In [None]:
def load_best_model():
    return MtfModel(MODEL_DIR,
                   tpu=None,
                   model_parallelism=1,
                   sequence_length={"inputs": 64, "targets": 15})

def get_predictions(inputs,model=None):
    if type(inputs) == str:
        inputs = [inputs]

    with open('inputs.txt', "w") as f:
        for inp in inputs:
            f.write("input: %s\n" % inp.lower())

    model.predict(
          input_file='inputs.txt',
          output_file='predd.txt',
          temperature=0,
      )


model = load_best_model()
inputs = [
    "Let go to park"
    "चलो पार्क चलते हैं",
    "Hãy đến công viên",
    "Vamos a aparcar",
    "Пойдем в парк",
          ]
get_predictions(inputs,model)