### Imports

In [1]:
!pip install datasets
! pip install -U accelerate
! pip install -U transformers



In [2]:
import json
import torch as pt
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

### Dataset

#### eliwill public dataset

In [None]:
ds = load_dataset("eliwill/Watts") # alternative public dataset

#### scrapped talks

In [None]:
# Alan Watts scrapped talks
def merge_json_datapoints(x):
  """
  merge json datapoints
  """
  return {"text": x["tag"] + ".\n" + x["title"] + ".\n" + x["body"].replace("\n\n", "\n") }

!mkdir data && curl https://raw.githubusercontent.com/Can-Sahin/alanwatts-transcripts/master/transcripts.json -o data/transcripts.json
ds = load_dataset("json", data_files="./data/transcripts.json")["train"]
ds = ds.map(merge_json_datapoints)
ds = ds.remove_columns(['body', 'title', 'tag'])


In [4]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 17390
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 399
    })
})


#### utils

In [6]:
def tokenize_function(tokenizer, example):
    tokens = tokenizer(example["text"], padding="max_length", truncation=True, return_tensors="pt")

    ids = tokens['input_ids']

    return {
        'input_ids': ids[:,:-1].numpy(),
        'labels': ids[:,1:].numpy(),
        'attention_mask': tokens['attention_mask'][:,1:].numpy()
        }

### Configs

In [7]:
device = "cuda" if pt.cuda.is_available() else "cpu"

In [None]:
EPOCHS=2
BATCH_SIZE=4
LR=5e-5

In [9]:
pt.manual_seed(0)

<torch._C.Generator at 0x7efe8c15f330>

### Training

In [10]:
# model_name = "openai-community/gpt2"
model_name = "distilbert/distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)



In [11]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.resize_token_embeddings(len(tokenizer))

In [12]:
tokenized_dataset = ds.map(lambda x: tokenize_function(tokenizer, x))

Map:   0%|          | 0/17390 [00:00<?, ? examples/s]

Map:   0%|          | 0/399 [00:00<?, ? examples/s]

In [15]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'labels', 'attention_mask'],
        num_rows: 17390
    })
    validation: Dataset({
        features: ['text', 'input_ids', 'labels', 'attention_mask'],
        num_rows: 399
    })
})

In [16]:
def generate_text(prompt, model, tokenizer, max_length=50):
  prompt = tokenizer.encode(prompt, return_tensors="pt")
  generated_output = model.generate(prompt, pad_token_id=tokenizer.pad_token_id, max_new_tokens=max_length, no_repeat_ngram_size=2)
  return tokenizer.decode(generated_output[0], skip_special_tokens=True)


In [None]:
# no fine-tuning
prompt = "Write a short story in the style of Alan Watts: "
print(generate_text(prompt, model, tokenizer, 500))

Write a short story in the style of Alan Watts:  "I'm a writer, and I'm not a novelist. I write a story. And I don't write it in a way that's going to be a good story, because I think it's a bad story."
I've been writing for a while now, but I've never been a fan of the genre. It's not like I was a big fan.
The first time I read a novel, I thought, "Oh, this is a great novel. This is great. But I can't read it. So I'll just read the next one." And then I started reading the first two books, which were really good. The first book was about a guy who's trying to get a job. He's got a girlfriend, he's getting married, so he has to go to college. Then he goes to a college, gets a degree, then he gets married. That's the story of his life. There's no way I could have read that book. If I had read this book, it would have been the best novel I'd read. Because I didn't know what I wanted to read, or what the hell I needed to do. My first novel was The Man Who Lived, about the man who died in 

In [None]:
training_args = TrainingArguments(
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_steps=1000,
    output_dir='./output',
    overwrite_output_dir=True,
    remove_unused_columns=False,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,
  )
trainer = Trainer(
    model=model,
    args = training_args,
    # data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    # train_dataset=AuthorDataset(tokenized_dataset[17:]), # incase of scrapped dataset
    # eval_dataset=AuthorDataset(tokenized_dataset[:17]), # incase of scrapped dataset
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=lambda pred: {'loss': pred.loss}
)
trainer.train()

Epoch,Training Loss,Validation Loss


In [17]:
# with fine tuning
prompt = "Write a short story in the style of Alan Watts: "
print(generate_text(prompt, model, tokenizer, 500))

Write a short story in the style of Alan Watts:  

The first time I read this, I was in awe of the way the story was written. I had never seen a story written by a writer who had written a novel. It was a very different story than the one I'd read in a book.
I was so excited to read it. The first thing I noticed was that the first story I saw was the title. This was an old story. A story about a man who was killed by an assassin. He was dead. And he was alive. So I thought, "Oh, this is a good story."
It was like a dream. You know, it's a fantasy. But it was not. There was no real story to it, and it wasn't. That was just a fiction. No real thing. What was it like?
There was nothing. Nothing. Everything. All the things. Every single thing that I could think of. Even the most mundane things that were not possible. They were impossible. We were all living in this world. These were the worlds we lived in. Those were worlds that we were living. Things that weren't possible, but they were r