### Imports

In [None]:
!pip install datasets
! pip install -U accelerate
! pip install -U transformers

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any.

In [52]:
import json
import torch as pt
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling

### Dataset

#### eliwill public dataset

In [44]:
# Alternative, publicly hosted dataset; it already comes split (see the printed summary).
ds = load_dataset(path="eliwill/Watts")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 17390
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 399
    })
})


#### scraped talks

In [43]:
# Alan Watts scraped talks
def merge_json_datapoints(x):
  """
  Collapse one transcript record into a single "text" field.

  The record's "tag", "title" and "body" are concatenated in that order,
  separated by ".\n"; each pair of consecutive newlines in the body is
  replaced by a single newline first.
  """
  compact_body = x["body"].replace("\n\n", "\n")
  pieces = [x["tag"], x["title"], compact_body]
  return {"text": ".\n".join(pieces)}

!mkdir data && curl https://raw.githubusercontent.com/Can-Sahin/alanwatts-transcripts/master/transcripts.json -o data/transcripts.json

from datasets import concatenate_datasets

ds = load_dataset("json", data_files="./data/transcripts.json", split="train")
ds = ds.train_test_split(test_size=0.1)
ds = ds.map(merge_json_datapoints, remove_columns=['body', 'title', 'tag'])
print(ds)

mkdir: cannot create directory ‘data’: File exists


Map:   0%|          | 0/105 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 105
    })
    test: Dataset({
        features: ['text'],
        num_rows: 12
    })
})


#### utils

In [41]:
def tokenize_function(tokenizer, example):
  """
  Tokenize a batch of texts for causal-LM fine-tuning.

  Args:
    tokenizer: callable tokenizer exposing `model_max_length`; called with
      `return_tensors="pt"` and expected to return "input_ids" and "attention_mask".
    example: batched mapping whose "text" entry is a list of strings.

  Returns:
    dict of numpy arrays with keys "input_ids", "labels" and "attention_mask",
    one row per non-blank input line, each padded/truncated to
    `tokenizer.model_max_length`.

  Notes:
    * input_ids, labels and attention_mask are kept position-aligned. The labels are
      NOT shifted here: Hugging Face causal-LM heads shift them by one internally, so
      pre-shifting would train the model to predict two tokens ahead.
    * Padding positions get the label -100 so they are ignored by the loss.
    * The incoming batch is left unmodified.
  """
  # Drop empty / whitespace-only lines (batched input, so "text" is a list).
  texts = [line for line in example["text"] if len(line) > 0 and not line.isspace()]
  tokens = tokenizer(texts, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")

  ids = tokens['input_ids']
  mask = tokens['attention_mask']
  # Copy of the inputs with padded positions excluded from the loss.
  labels = ids.clone().masked_fill(mask == 0, -100)

  return {
      'input_ids': ids.numpy(),
      'labels': labels.numpy(),
      'attention_mask': mask.numpy()
      }

### Configs

In [45]:
# Prefer the GPU when one is visible to torch; fall back to the CPU otherwise.
device = "cpu"
if pt.cuda.is_available():
    device = "cuda"
# Seed torch's default generator for repeatable runs.
pt.manual_seed(0)

<torch._C.Generator at 0x7b159986f670>

In [54]:
# Training hyper-parameters.
EPOCHS = 2
BATCH_SIZE = 4      # per-device batch size
LR = 5e-5           # kept very small because this is fine-tuning
GRAD_ACC = 8        # accumulation steps: effective batch size = 8 * 4 = 32

### Training

In [47]:
# Checkpoint to fine-tune. Alternative kept for reference: "openai-community/gpt2".
model_name = "distilbert/distilgpt2"

# The model and its tokenizer are loaded from the same checkpoint name.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [48]:
# The tokenizer is later asked to pad to a fixed length, which needs a pad token.
# When none is defined, reuse the end-of-sequence token and resync the embedding size.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token
    model.resize_token_embeddings(len(tokenizer))

In [49]:
# Tokenize every split in batches using 5 worker processes; the raw "text" column is
# dropped so only the tokenizer outputs remain.
tokenized_dataset = ds.map(
    lambda batch: tokenize_function(tokenizer, batch),
    batched=True,
    num_proc=5,
    remove_columns=["text"],
)

In [15]:
# Display the tokenized splits (column names and row counts) as the cell output.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 17390
    })
    validation: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 399
    })
})

In [51]:
def generate_text(prompt, model, tokenizer, max_output_length=50):
  """
  Generate a continuation of `prompt` with `model` and return it as text.

  Args:
    prompt: text to continue.
    model: causal LM exposing `.device` and `.generate(...)`.
    tokenizer: tokenizer exposing `.encode`, `.decode` and `.pad_token_id`.
    max_output_length: maximum number of newly generated tokens.

  Returns:
    The decoded first sequence (prompt plus continuation) with special tokens removed.
  """
  # Put the prompt on the model's device: after training the model may live on the GPU
  # while freshly encoded tensors are created on the CPU.
  input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
  # A single unpadded prompt: every position is a real token. Passing the mask explicitly
  # avoids generate() having to guess it when the pad token equals the EOS token.
  attention_mask = input_ids.new_ones(input_ids.shape)
  generated_output = model.generate(
      input_ids,
      attention_mask=attention_mask,
      pad_token_id=tokenizer.pad_token_id,
      max_new_tokens=max_output_length,
      no_repeat_ngram_size=2,
  )
  return tokenizer.decode(generated_output[0], skip_special_tokens=True)


In [None]:
# Training configuration: evaluate and checkpoint once per epoch, then reload the
# checkpoint with the lowest evaluation loss when training ends.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to `eval_strategy`;
# confirm against the installed version.
training_args = TrainingArguments(
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy='epoch',
    save_strategy='epoch',  # per-epoch checkpoints; a step-based `save_steps` would not apply here
    output_dir='./output',
    overwrite_output_dir=True,
    remove_unused_columns=False,
    load_best_model_at_end=True,
    metric_for_best_model='loss',  # the evaluation loss the Trainer reports on its own
    greater_is_better=False,
)
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# The public dataset calls its held-out split "validation"; a plain train_test_split
# calls it "test". Accept either so both data sources work with this cell.
eval_split = "validation" if "validation" in tokenized_dataset else "test"

# No compute_metrics: the prediction object handed to it has no `.loss` attribute, and
# supplying the callback makes the Trainer keep every logit of the eval set in memory.
# The evaluation loss needed for best-model selection is computed without it.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset[eval_split],
)
trainer.train()

In [None]:
# Sample from the fine-tuned model (up to 500 new tokens).
prompt = "Write a short story in the style of Alan Watts: "
story = generate_text(prompt, model, tokenizer, max_output_length=500)
print(story)