In [1]:
#!/usr/bin/env python3
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet).
# This notebook only wires up GPT ("openai-gpt") and GPT-2 ("gpt2").

In [2]:
import torch
import transformers

import logging
import numpy as np

import argparse

from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    OpenAIGPTLMHeadModel,
    OpenAIGPTTokenizer,
)

## Build Text Generation Model

In [3]:
# Emit INFO-level records with a timestamp, the level and the logger name.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Hard upper bound on the generation length, used when neither the caller nor
# the model supplies a positive limit (avoids an infinite loop).
MAX_LENGTH = 10000

# Maps a ``model_type`` string to its (language-model class, tokenizer class) pair.
MODEL_CLASSES = {
    "gpt2": (GPT2LMHeadModel, GPT2Tokenizer),
    "openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
}

In [4]:
def adjust_length_to_model(length, max_sequence_length):
    """Clamp the requested generation length to what the model can handle.

    Args:
        length: Requested number of tokens; a negative value means
            "no explicit request".
        max_sequence_length: The model's maximum sequence length; a value
            that is not positive means the model imposes no limit.

    Returns:
        ``max_sequence_length`` when the model has a limit and ``length`` is
        negative or exceeds it; ``MAX_LENGTH`` when ``length`` is negative and
        the model has no limit; otherwise ``length`` unchanged.
    """
    model_has_limit = max_sequence_length > 0

    if length < 0:
        # No explicit request: fall back to the model limit, or to the
        # hard-coded cap so generation cannot run forever.
        return max_sequence_length if model_has_limit else MAX_LENGTH

    if model_has_limit and length > max_sequence_length:
        # No generation bigger than model size.
        return max_sequence_length

    return length

In [5]:
def _truncate_at_stop_token(text, stop_token):
    """Return ``text`` cut just before the first ``stop_token``.

    The text is returned unchanged when ``stop_token`` is falsy or does not
    occur in ``text`` (``str.find`` returns -1 in that case, which must not be
    used as a slice bound or the last character would be dropped).
    """
    if not stop_token:
        return text
    stop_index = text.find(stop_token)
    return text if stop_index == -1 else text[:stop_index]


def main(**kwarg):
    """Sample text continuations of a prompt from a pretrained language model.

    Keyword Args:
        model_type: Key into ``MODEL_CLASSES`` selecting the model/tokenizer classes.
        model_name_or_path: Passed to ``from_pretrained`` for both tokenizer and model.
        length: Requested number of new tokens; clamped by ``adjust_length_to_model``.
        prompt: Text to continue. When missing or empty, it is read from ``input()``.
        stop_token: Optional marker; each decoded sequence is cut at its first
            occurrence. Sequences without the marker are kept whole.
        k: ``top_k`` value forwarded to ``model.generate``.
        num_return_sequences: Number of sequences to sample (defaults to 2,
            the value that was previously hard-coded).

    Returns:
        list[str]: One string per generated sequence, each starting with the prompt.

    Raises:
        KeyError: If ``model_type`` is missing or not present in ``MODEL_CLASSES``.
    """
    # CUDA is deliberately disabled here, so ``device`` always resolves to CPU.
    no_cuda = True
    device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")

    # Initialize the model and tokenizer
    try:
        model_class, tokenizer_class = MODEL_CLASSES[kwarg['model_type']]
    except KeyError:
        raise KeyError(
            "the model {} you specified is not supported. You are welcome to add it and open a PR :)".format(
                kwarg.get('model_type')
            )
        )

    tokenizer = tokenizer_class.from_pretrained(kwarg['model_name_or_path'])
    model = model_class.from_pretrained(kwarg['model_name_or_path'])
    # Keep the model on the same device as the encoded prompt.
    model.to(device)

    length = adjust_length_to_model(kwarg['length'], max_sequence_length=model.config.max_position_embeddings)
    logger.info(kwarg)

    prompt_text = kwarg.get('prompt') or input("Model prompt >>> ")

    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=True, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(device)

    # An empty prompt is passed to ``generate`` as ``None``.
    input_ids = None if encoded_prompt.size()[-1] == 0 else encoded_prompt

    output_sequences = model.generate(
        input_ids=input_ids,
        # ``length`` counts new tokens only, so add the prompt length.
        max_length=length + len(encoded_prompt[0]),
        temperature=1.0,
        top_k=kwarg['k'],
        top_p=0.9,
        repetition_penalty=1.0,
        do_sample=True,
        num_return_sequences=kwarg.get('num_return_sequences', 2),
    )

    # Remove the batch dimension when returning multiple sequences
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    stop_token = kwarg.get('stop_token')
    # Length of the prompt as it appears after a decode round-trip; identical
    # for every sequence, so compute it once outside the loop.
    decoded_prompt_length = len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True))

    generated_sequences = []

    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))

        # Decode text
        text = tokenizer.decode(generated_sequence.tolist(), clean_up_tokenization_spaces=True)

        # Remove all text after the stop token (if it is present at all)
        text = _truncate_at_stop_token(text, stop_token)

        # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing
        total_sequence = prompt_text + text[decoded_prompt_length:]

        generated_sequences.append(total_sequence)
        print(total_sequence)

    return generated_sequences

In [6]:
# Directory used both as the fine-tuning output dir and as the model path for generation.
OUTPUT_DIR = './output'

# Training and validation text files passed to the fine-tuning script.
TRAIN_FILE = './dataset/prepared-data/text-generation/train_text-generation.txt'
VALID_FILE = './dataset/prepared-data/text-generation/dev_text-generation.txt'

# Prompt text the generation cell asks the model to continue.
PROMPT = "certain genetic conditions increase the risk of childhood cns embryonal tumors ."

In [9]:
%%time
# Generate continuations of PROMPT with the GPT-2 model stored in OUTPUT_DIR.
output = main(
    model_type="gpt2",
    model_name_or_path=OUTPUT_DIR,
    length=300,
    prompt=PROMPT,
    stop_token="<EOS>",
    k=30,
    num_return_sequences=2,
)

11/26/2020 15:04:27 - INFO - __main__ -   {'model_type': 'gpt2', 'model_name_or_path': './output', 'length': 300, 'prompt': 'certain genetic conditions increase the risk of childhood cns embryonal tumors .', 'stop_token': '<EOS>', 'k': 30, 'num_return_sequences': 2}
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


=== GENERATED SEQUENCE 1 ===
certain genetic conditions increase the risk of childhood cns embryonal tumors . A childhood carcinoma may cause signs or symptoms that begin before the tumor is diagnosed and continue for months or years.   Childhood cancer is usually treated as childhood cns embryonal tumors:           - Tissue is removed and the tumor is removed by surgery or chemotherapy.    - The tumor is raised under an artificial light (a fluorescent light that emits light) so that it is more visible to the outside world.    - The tumor is placed in a sealed bag or plastic bag with wires to carry radioactive material <BOS>  during surgery or chemotherapy.     - A special pathologist views the tumor, checks it, and removes any tumor cells that are not cancer.       Tests that examine the body and bones are used to detect (find) and diagnose childhood carcinomas.      Check the list of NCI-supported cancer clinical trials that are now accepting patients with childhood cns embryonal tum

---

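## Fine-tune the GPT-2 language model on the dataset

Run this section (after the cell that defines `OUTPUT_DIR`, `TRAIN_FILE` and `VALID_FILE`) before the generation cells above: they load the fine-tuned model from `OUTPUT_DIR`.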

In [3]:
%%time
# Fine-tune the pretrained `gpt2` checkpoint for one epoch on TRAIN_FILE,
# evaluate on VALID_FILE, and write the result to OUTPUT_DIR.
# Every continuation backslash is preceded by a space so the arguments stay
# separate under a plain POSIX shell as well as under IPython's line joining.
!python3 ./transformers/examples/language-modeling/run_language_modeling.py \
    --output_dir $OUTPUT_DIR \
    --model_type gpt2 \
    --model_name_or_path gpt2 \
    --do_train \
    --train_data_file $TRAIN_FILE \
    --do_eval \
    --eval_data_file $VALID_FILE \
    --per_device_train_batch_size=2 \
    --per_device_eval_batch_size=2 \
    --line_by_line \
    --overwrite_output_dir \
    --learning_rate=5e-5 \
    --num_train_epochs=1

2020-11-23 16:41:02.134525: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/shared/apps/cuda10.0/toolkit/10.0.130/extras/CUPTI/lib64:/cm/local/apps/cuda/libs/current/lib64:/cm/shared/apps/cuda10.0/toolkit/10.0.130/targets/x86_64-linux/lib:/cm/shared/apps/slurm/18.08.9/lib64/slurm:/cm/shared/apps/slurm/18.08.9/lib64
2020-11-23 16:41:02.134594: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
11/23/2020 16:41:23 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='./output/', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=False, evaluation_strategy=<EvaluationStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=2, per_gpu_train_batch_