In [2]:
# Following https://github.com/huggingface/notebooks/blob/master/examples/translation.ipynb

In [3]:
!pip install wandb datasets transformers sacrebleu==1.5.1  bert-score -qqq

In [4]:
!nvidia-smi

Tue Oct 19 23:02:07 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.74       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8    28W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
import datetime
from tqdm import tqdm

# --- Run configuration ---
# Base checkpoint label; in the cells shown here it only feeds the run name.
model_checkpoint = 't5-base'
# Forwarded to Seq2SeqTrainingArguments(fp16=...) further down.
fp16 = True
# Directory that load_from_disk() reads the dataset from.
ds_path = 'qald-text-to-sparql'

# Minute-resolution timestamp that makes every run name unique.
todaydate = f"{datetime.datetime.now():%Y-%m-%d_%H-%M}"
model_name = '-'.join(['sparql-qald9', model_checkpoint, todaydate])
model_path = f'./models/{model_name}'
gdir = 'drive/My Drive/Colab Notebooks/' + todaydate + '/'
print(model_name)

sparql-qald9-t5-base-2021-10-19_23-02


In [7]:
# Flexible integration for any Python script
import wandb

# 1. Start a W&B run named after this training run
wandb.init(project='text-to-sparql', entity='shahriar', name= model_name)

# 2. Save model inputs and hyperparameters
config = wandb.config
# Must mirror the `learning_rate` given to Seq2SeqTrainingArguments (3e-4);
# the previous value of 0.01 did not match the rate actually used for training.
config.learning_rate = 3e-4

[34m[1mwandb[0m: Currently logged in as: [33mshahriar[0m (use `wandb login --relogin` to force relogin)


In [8]:
model_name

'sparql-qald9-t5-base-2021-10-19_23-02'

In [10]:
!mkdir models

mkdir: cannot create directory ‘models’: File exists


In [9]:
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [11]:
!apt install git-lfs -qqq
!git config --global credential.helper store

In [12]:
!unzip qald-text-to-sparql.zip

Archive:  qald-text-to-sparql.zip
replace qald-text-to-sparql/dataset_dict.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: nb
replace qald-text-to-sparql/test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [13]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, T5TokenizerFast

# Hub repository that both the tokenizer and the model weights are loaded from.
x = 'yazdipour/text-to-sparql-t5-base-2021-10-19_15-35_lastDS'

tokenizer = AutoTokenizer.from_pretrained(x)
# Alternative kept from the original cell:
# tokenizer = T5TokenizerFast.from_pretrained(model_name)

model = AutoModelForSeq2SeqLM.from_pretrained(x)
# Move the weights to the GPU.
model = model.to("cuda")

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

In [14]:
from datasets import load_dataset, load_metric, Dataset, load_from_disk
raw_datasets = load_from_disk(ds_path)
# raw_datasets = load_dataset("yazdipour/text-to-sparql-t5-lc-quad-v2", data_files={"train": "train.csv", "test": "test.csv"})

In [13]:
# Preprocessing

In [15]:
print(raw_datasets['test']['translation'][0]['sparql'],'\n',
      raw_datasets['test']['translation'][0]['en'],'\n',
      tokenizer(raw_datasets['test']['translation'][0]['en']))

select distinct ?uri where [ res:Salt_Lake_City onto:timeZone ?uri ] 
 what is the time zone of salt lake city? 
 {'input_ids': [125, 19, 8, 97, 2901, 13, 3136, 6957, 690, 58, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [17]:
# Find the longest English question and the longest SPARQL query in the
# training split, in a single pass.
# NOTE(review): these are character counts (len of the raw strings), yet they are
# later passed to the tokenizer as `max_length`, which counts tokens - confirm
# this is intended.
max_input_length = 0
max_target_length = 0
for pair in tqdm(raw_datasets['train']['translation']):
    max_input_length = max(max_input_length, len(pair['en']))
    max_target_length = max(max_target_length, len(pair['sparql']))
print('\n', max_input_length, max_target_length)

100%|██████████| 408/408 [00:00<00:00, 120972.43it/s]


 85 299





In [18]:
# Keys of each `translation` record, and the task prefix prepended to every input.
source_lang = "en"
target_lang = "sparql"
prefix = "translate english to sparql2: "


def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Args:
        examples: Batch mapping with a "translation" entry holding records
            that contain the `source_lang` and `target_lang` keys.

    Returns:
        The tokenizer output for the prefixed source texts, with an extra
        "labels" entry holding the input ids of the tokenized targets.
    """
    pairs = examples["translation"]
    inputs = [prefix + pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Targets are tokenized inside the target-tokenizer context.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [19]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [20]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'translation'],
        num_rows: 408
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'translation'],
        num_rows: 150
    })
})

In [21]:
# Fine-tuning the model

In [22]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer


In [23]:
# Hyperparameters for the Seq2SeqTrainer run.
batch_size = 8
args = Seq2SeqTrainingArguments(
    output_dir=model_name,
    # optimisation
    num_train_epochs=1,
    learning_rate=3e-4,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=fp16,
    # evaluation: run once per epoch, scoring generated sequences
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # checkpointing / publishing
    save_total_limit=3,
    push_to_hub=True,
)

In [24]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [25]:
import numpy as np
def postprocess_text(preds, labels):
    """Normalise decoded predictions and references before scoring.

    Every '?' and '.' gets a space inserted in front of it and the result is
    stripped of surrounding whitespace.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A pair `(preds, labels)`: a list of cleaned prediction strings, and a
        list in which each cleaned reference is wrapped in its own
        single-element list.
    """
    def _space_out(text):
        return text.replace('?', ' ?').replace('.', ' .').strip()

    cleaned_preds = list(map(_space_out, preds))
    wrapped_labels = [[_space_out(text)] for text in labels]
    return cleaned_preds, wrapped_labels

In [26]:
# The last thing to define for our Seq2SeqTrainer is how to compute
# the metrics from the predictions. We define a function for this
# that decodes the predictions and labels back into text and then scores
# them with the BERTScore scorer and the sacreBLEU metric loaded in the next cell.

In [27]:
from bert_score import BERTScorer
scorer = BERTScorer(lang="en", rescale_with_baseline=True)
metric = load_metric("sacrebleu")

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [28]:
def compute_metrics(eval_preds):
    """Decode predictions and labels, then score them with BERTScore and sacreBLEU.

    Args:
        eval_preds: Pair `(preds, labels)` of token-id arrays. If `preds` is a
            tuple, only its first element is used.

    Returns:
        Dict with the mean generated length ("gen_len"), the mean BERTScore
        precision/recall/F1 ('P', 'R', 'F1') as plain Python floats, and the
        sacreBLEU "score", "precisions" and "bp" values.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 cannot be decoded; swap it for the pad id in both arrays so that
    # decoding works and padding is excluded from the length count below.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    P, R, F1 = scorer.score(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    gen_len = np.mean(prediction_lens)

    # Convert the averaged scores to plain floats so every scalar metric has the same type.
    return {
        "gen_len": float(gen_len),
        'P': float(P.mean()),
        'R': float(R.mean()),
        'F1': float(F1.mean()),
        "bleu-score": result["score"],
        "bleu-precisions": result["precisions"],
        "bleu-bp": result["bp"],
    }

In [29]:
# Assemble the trainer: model + arguments, the tokenized train/test splits,
# dynamic padding via the collator, and the metric callback used during evaluation.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

Cloning https://huggingface.co/yazdipour/sparql-qald9-t5-base-2021-10-19_23-02 into local empty directory.
Using amp fp16 backend


In [30]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running training *****
  Num examples = 408
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 51
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Gen Len,P,R,F1,Bleu-score,Bleu-precisions,Bleu-bp
1,No log,1.830026,19.0,0.364001,0.034577,0.194338,10.035799,"[72.88988261598658, 50.27455765710799, 35.93015446608462, 28.454070201643017]",0.228112


The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: translation.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=51, training_loss=1.2968808342428768, metrics={'train_runtime': 94.6069, 'train_samples_per_second': 4.313, 'train_steps_per_second': 0.539, 'total_flos': 52370378588160.0, 'train_loss': 1.2968808342428768, 'epoch': 1.0})

In [31]:
trainer.save_model(model_path)

Saving model checkpoint to ./models/sparql-qald9-t5-base-2021-10-19_23-02
Configuration saved in ./models/sparql-qald9-t5-base-2021-10-19_23-02/config.json
Model weights saved in ./models/sparql-qald9-t5-base-2021-10-19_23-02/pytorch_model.bin
tokenizer config file saved in ./models/sparql-qald9-t5-base-2021-10-19_23-02/tokenizer_config.json
Special tokens file saved in ./models/sparql-qald9-t5-base-2021-10-19_23-02/special_tokens_map.json


In [32]:
model_name

'sparql-qald9-t5-base-2021-10-19_23-02'

In [None]:
trainer.push_to_hub()

Saving model checkpoint to sparql-qald9-t5-base-2021-10-19_23-02
Configuration saved in sparql-qald9-t5-base-2021-10-19_23-02/config.json
Model weights saved in sparql-qald9-t5-base-2021-10-19_23-02/pytorch_model.bin
tokenizer config file saved in sparql-qald9-t5-base-2021-10-19_23-02/tokenizer_config.json
Special tokens file saved in sparql-qald9-t5-base-2021-10-19_23-02/special_tokens_map.json


Upload file pytorch_model.bin:   0%|          | 3.36k/850M [00:00<?, ?B/s]

Upload file runs/Oct19_23-08-26_4e9c45cbc667/1634684953.6451986/events.out.tfevents.1634684953.4e9c45cbc667.82…

Upload file runs/Oct19_23-08-26_4e9c45cbc667/events.out.tfevents.1634684953.4e9c45cbc667.828.0:  68%|######8  …

Upload file training_args.bin: 100%|##########| 2.92k/2.92k [00:00<?, ?B/s]

In [None]:
# Display the loaded dataset (the cell previously ended in a dangling '.', a SyntaxError).
raw_datasets

In [None]:
# Wrap the model and tokenizer from the cells above in a translation pipeline
# for manual inference.
translator = pipeline(
    "translation_xx_to_yy",
    model=model,
    tokenizer=tokenizer,
    device=0 #0 for cuda, -1 for cpu
)

In [None]:
def translate(q, max_length=100):
    """Translate an English question to SPARQL with the `translator` pipeline.

    Args:
        q: Question text; the task `prefix` is prepended before it is sent
            to the pipeline.
        max_length: Forwarded to the pipeline as `max_length`. Defaults to
            100, the value that was previously hard-coded.

    Returns:
        The 'translation_text' field of the pipeline's first result.
    """
    return translator(prefix + q, max_length=max_length)[0]['translation_text']

In [None]:
translate('How old is Bill Gates?')

In [None]:
translate('Are Taiko some kind of Japanese musical instrument?  ')