In [1]:
# Colab-only: mount Google Drive at /content/drive. Later cells save the
# fine-tuned model under drive/MyDrive/ and load it back from there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 25.2 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 57.1 MB/s 
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 81.3 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 80.3 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 78.7 MB/s 
Installing coll

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer
import os
import tensorflow as tf
from transformers import TFAutoModelForCausalLM
from transformers import create_optimizer, AdamWeightDecay

## Load Data

In [4]:
!curl -O http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 52.2M  100 52.2M    0     0  79.8M      0 --:--:-- --:--:-- --:--:-- 79.8M


In [5]:
# Read the downloaded gzipped NDJSON file with the generic "json" loader.
dataset = load_dataset("json", data_files="gutenberg-poetry-v001.ndjson.gz")



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-963ef7a487b677c1/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-963ef7a487b677c1/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Display the available splits, their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['s', 'gid'],
        num_rows: 3085117
    })
})

In [7]:
# Peek at the first training record to see the raw fields.
dataset['train'][0]

{'s': 'The Song of Hiawatha is based on the legends and stories of',
 'gid': '19'}

## Train Model

In [8]:
def preprocess(data, tokenizer):
    """Tokenize a batch of text lines, tagging the end of each with ``<LINE>``.

    Args:
        data: Batch mapping whose ``'s'`` entry is an iterable of text lines.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True)``.

    Returns:
        Whatever ``tokenizer`` returns for the list of tagged lines.
    """
    tagged_lines = []
    for line in data['s']:
        # Append the literal ' <LINE> ' marker (spaces included) to every line.
        tagged_lines.append(f'{line} <LINE> ')
    return tokenizer(tagged_lines, truncation=True)

In [9]:
base_model = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Tokenize every line in batches across 4 worker processes and drop the raw
# columns so that only the tokenizer's outputs remain.
dataset_enc = dataset.map(
    preprocess,
    batched=True,
    num_proc=4,
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=['s', 'gid'],
)

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

     

#0:   0%|          | 0/772 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/772 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/772 [00:00<?, ?ba/s]

#2:   0%|          | 0/772 [00:00<?, ?ba/s]

In [10]:
# Display the columns and row count of the tokenized dataset.
dataset_enc

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3085117
    })
})

In [12]:
block_size = 128


def concat_texts(data):
    """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

    Args:
        data: Batch mapping in which every value is a list of per-example
            sequences (e.g. ``input_ids`` and ``attention_mask``). It must
            contain an ``"input_ids"`` entry.

    Returns:
        dict with the same keys as ``data``, where each value is a list of
        chunks of exactly ``block_size`` items, plus a ``"labels"`` entry that
        is a copy of the ``"input_ids"`` chunk list. Trailing items that do not
        fill a complete block are dropped (per batch).
    """
    # Flatten each column with a comprehension. `sum(lists, [])` re-copies the
    # growing accumulator on every addition, which is quadratic in batch size.
    joined = {
        key: [item for example in data[key] for item in example]
        for key in data.keys()
    }

    # Every column is flattened the same way, so the first column's length is
    # used to compute how many items fit into whole blocks.
    first_key = list(joined)[0]
    total_length = (len(joined[first_key]) // block_size) * block_size

    result = {
        key: [
            flat[start : start + block_size]
            for start in range(0, total_length, block_size)
        ]
        for key, flat in joined.items()
    }
    # Labels are a copy of the inputs, as in the original cell.
    result["labels"] = result["input_ids"].copy()
    return result

In [13]:
# Regroup the tokenized lines into fixed-size blocks with `concat_texts`,
# processing 1000 examples per batch across 4 worker processes.
lm_datasets = dataset_enc.map(
    concat_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

        

#0:   0%|          | 0/772 [00:00<?, ?ba/s]

#2:   0%|          | 0/772 [00:00<?, ?ba/s]

#1:   0%|          | 0/772 [00:00<?, ?ba/s]

#3:   0%|          | 0/772 [00:00<?, ?ba/s]

In [14]:
# Sanity check: decode the first block back to text to inspect how the
# lines and <LINE> markers were concatenated.
tokenizer.decode(lm_datasets["train"][0]["input_ids"])

'The Song of Hiawatha is based on the legends and stories of <LINE> many North American Indian tribes, but especially those of the <LINE> Ojibway Indians of northern Michigan, Wisconsin, and Minnesota. <LINE> They were collected by Henry Rowe Schoolcraft, the reknowned <LINE> Schoolcraft married Jane, O-bah-bahm-wawa-ge-zhe-go-qua (The <LINE> fur trader, and O-shau-gus-coday-way-qua (The Woman of the Green <LINE> Pra'

In [15]:
# Start from the pretrained base checkpoint (`base_model`).
model = TFAutoModelForCausalLM.from_pretrained(base_model)

# Build the training input from the blocked dataset: shuffled, 64 blocks per batch.
train_set = model.prepare_tf_dataset(
    lm_datasets["train"],
    shuffle=True,
    batch_size=64,
)

Downloading:   0%|          | 0.00/328M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at distilgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [16]:
# Pass the step size as `learning_rate`; `lr` is a deprecated alias that
# triggers a Keras deprecation warning.
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
# No loss is passed to compile(), so the model's internal loss computation is used.
model.compile(optimizer=optimizer, jit_compile=True)

  super(Adam, self).__init__(name, **kwargs)
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
# Fine-tune for a single pass over the training set.
# NOTE(review): no callbacks are passed here, so this cell does not write
# checkpoints itself; the model is persisted only by the explicit save below.
num_epochs = 1
model.fit(
    train_set,
    epochs=num_epochs
)

Epoch 1: saving model to training_poem_gn/cp.ckpt


<keras.callbacks.History at 0x7f3cca445610>

In [None]:
# Persist the trained model to Google Drive. The path is relative, so it is
# resolved against the current working directory
# (assumes that is /content, the parent of the Drive mount point — TODO confirm).
model.save_pretrained("drive/MyDrive/FIRE_3rd Sem/peom_gn/")

## Inference

In [18]:
# Load the fine-tuned model back from the Google Drive directory it was saved to.
new_model = TFAutoModelForCausalLM.from_pretrained('drive/MyDrive/FIRE_3rd Sem/peom_gn/')

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at drive/MyDrive/FIRE_3rd Sem/peom_gn/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [19]:
# Prompt text that the generation cell below continues.
test_sentence = "Is sung in the air,"

In [22]:
# Encode the prompt and generate a continuation of at most 20 tokens
# (prompt included).
tokenized = tokenizer(test_sentence, return_tensors="np")

# Generate with `new_model`, the fine-tuned checkpoint loaded from Drive.
# `model` is the in-memory object built from the base checkpoint and only
# carries fine-tuned weights if the training cell was run in this session.
outputs = new_model.generate(**tokenized, max_length=20)

print(outputs)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


tf.Tensor(
[[ 3792 23568   287   262  1633    11   290   262  3809   286   262  4453
     11   290   262  3809   286   262  4453    11]], shape=(1, 20), dtype=int32)


In [23]:
# Convert the generated token ids of the first (only) sequence back to text.
tokenizer.decode(outputs[0])

'Is sung in the air, and the voice of the Lord, and the voice of the Lord,'