# Love letter reply

### Fine-tune a GPT model

import sys
!{sys.executable} -m pip install datasets

In [31]:
import datasets
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

In [32]:
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
model = AutoModelForCausalLM.from_pretrained('distilgpt2')

In [33]:
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

In [34]:
generator("Such as in evening silence come")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Such as in evening silence come the first part of the night, so the next morning (as always it is on Sundays) we return on the way home.\nI am on the last-minute of every waking call. I'm not doing anything"}]

# Build the training corpus from the first 20,000 characters of the source
# text. Both files are managed by the `with` statement so the read handle is
# closed deterministically, and the source is opened first so a missing source
# file does not leave a truncated, empty "combined.txt" behind.
with open("combined-20000.txt") as src, open("combined.txt", "w") as fh:
    fh.write(src.read()[:20000])

In [35]:
training_data = datasets.load_dataset('text', data_files="combined.txt")

Found cached dataset text (/Users/voruin/.cache/huggingface/datasets/text/default-649a8d2ae5e0a169/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
# Reuse the end-of-sequence token as the padding token for this tokenizer.
tokenizer.pad_token = tokenizer.eos_token
# Tokenize the 'text' field of every row and drop the raw text column, so only
# the tokenizer's output columns remain in the mapped dataset.
tokenized_training_data = training_data.map(
    lambda x: tokenizer(x['text']),
    remove_columns=["text"]
)

Map:   0%|          | 0/2784 [00:00<?, ? examples/s]

In [15]:
block_size = 64
# magic from https://github.com/huggingface/notebooks/blob/master/examples/language_modeling.ipynb
def group_texts(examples):
    """Concatenate a batch of tokenized rows and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of per-row sequences
            (for example the batch handed over by ``map(..., batched=True)``).
            It must contain an ``"input_ids"`` column.

    Returns:
        A dict with the same keys, where every value is a list of chunks of
        exactly ``block_size`` items, plus a ``"labels"`` entry that is a
        shallow copy of the ``"input_ids"`` list. Items left over after the
        last full block are dropped.
    """
    # Flatten each column in a single linear pass. The previous
    # ``sum(rows, [])`` rebuilt the accumulated list on every addition, which
    # is quadratic in the number of rows in the batch.
    concatenated_examples = {
        k: [item for row in examples[k] for item in row]
        for k in examples.keys()
    }
    # Length of the first column, rounded down to a multiple of block_size.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # The labels are the inputs themselves, stored as a separate outer list.
    result["labels"] = result["input_ids"].copy()
    return result
# Regroup the tokenized rows into fixed-length blocks, handing group_texts
# 200 rows at a time.
lm_training_data = tokenized_training_data.map(group_texts, batched=True, batch_size=200)

Map:   0%|          | 0/2784 [00:00<?, ? examples/s]

In [16]:
from transformers import Trainer, TrainingArguments


In [17]:
# Configure a Trainer that trains `model` on the 'train' split of the grouped
# dataset for a single epoch, with do_train=True and do_eval=False.
trainer = Trainer(model=model,
                  train_dataset=lm_training_data['train'],
                  args=TrainingArguments(
                      # Directory passed as output_dir; later cells load the
                      # model and tokenizer back from this same path.
                      output_dir='distilgpt2-finetune-victorianLovePoems',
                      num_train_epochs=1,
                      do_train=True,
                      do_eval=False
                  ),
                  tokenizer=tokenizer)

In [18]:
trainer.train()

***** Running training *****
  Num examples = 366
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 46


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=46, training_loss=4.881471716839334, metrics={'train_runtime': 21.6837, 'train_samples_per_second': 16.879, 'train_steps_per_second': 2.121, 'total_flos': 5977163169792.0, 'train_loss': 4.881471716839334, 'epoch': 1.0})

In [19]:
trainer.save_model()

Saving model checkpoint to distilgpt2-finetune-victorianLovePoems
Configuration saved in distilgpt2-finetune-victorianLovePoems/config.json
Model weights saved in distilgpt2-finetune-victorianLovePoems/pytorch_model.bin
tokenizer config file saved in distilgpt2-finetune-victorianLovePoems/tokenizer_config.json
Special tokens file saved in distilgpt2-finetune-victorianLovePoems/special_tokens_map.json


In [36]:
generator("And days may pass in gay confusion,", max_length=100)[0]['generated_text']

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'And days may pass in gay confusion, in our time. And on the eve of the 2015 presidential election, we must take care of the LGBT community. This year has had a profound and profound, multifaceted, and deeply profound impact on gay politics.\n\n\n\n\nThis year marks the last time a LGBT-themed ad campaign has been launched that has gone on-the-air against the LGBT community.'

In [37]:
my_tokenizer = AutoTokenizer.from_pretrained('distilgpt2-finetune-victorianLovePoems')
my_model = AutoModelForCausalLM.from_pretrained('distilgpt2-finetune-victorianLovePoems')

In [38]:
my_generator = pipeline("text-generation", model=my_model, tokenizer=my_tokenizer)

my_generator("And days may pass in gay confusion,")[0]['generated_text']


my_generator("And days may pass in gay confusion,")[0]['generated_text']

In [41]:
import re

In [42]:
def poem_generator(prompt, min_length=50, max_length=200, max_attempts=None):
    """Sample from ``my_generator`` until a candidate passes every check.

    A candidate is accepted when its character length lies within
    ``[min_length, max_length]`` and it matches ``[.!?]$`` (it ends with
    ``.``, ``!`` or ``?``; the ``$`` anchor also tolerates one trailing
    newline). Every rejected candidate is printed before the next attempt.

    Args:
        prompt: Text passed to ``my_generator``.
        min_length: Minimum accepted length, in characters.
        max_length: Maximum accepted length, in characters.
        max_attempts: Optional cap on the number of generations. ``None``
            (the default) retries without limit.

    Returns:
        The first generated text that passes the checks.

    Raises:
        ValueError: If ``min_length > max_length``; no text could ever be
            accepted, so retrying would never terminate.
        RuntimeError: If ``max_attempts`` generations were all rejected.
    """
    if min_length > max_length:
        raise ValueError(
            f"min_length ({min_length}) must not exceed max_length ({max_length})"
        )

    attempts = 0
    while True:
        generated_text = my_generator(prompt)[0]['generated_text']
        attempts += 1
        if (min_length <= len(generated_text) <= max_length
                and re.search(r'[.!?]$', generated_text)):
            return generated_text
        # Show the rejected candidate before trying again.
        print(generated_text)
        if max_attempts is not None and attempts >= max_attempts:
            raise RuntimeError(
                f"no acceptable text generated after {attempts} attempts"
            )

poem_generator("Did you miss me?", min_length=80, max_length=300)

In [44]:
def poem_generator2(prompt, min_length=50, max_length=200, max_attempts=None):
    """Sample from ``my_generator`` until the text length is in range.

    Unlike a strict punctuation check, a text that does not match ``[.!?]$``
    is still accepted and gets ``" ..."`` appended. The length test is applied
    before that suffix is added, so the returned string may be up to four
    characters longer than ``max_length``.

    Args:
        prompt: Text passed to ``my_generator``.
        min_length: Minimum accepted length, in characters.
        max_length: Maximum accepted length, in characters.
        max_attempts: Optional cap on the number of generations. ``None``
            (the default) retries without limit.

    Returns:
        The accepted generated text, with ``" ..."`` appended when it does not
        already end in ``.``, ``!`` or ``?``.

    Raises:
        ValueError: If ``min_length > max_length``; no text could ever be
            accepted, so retrying would never terminate.
        RuntimeError: If ``max_attempts`` generations were all rejected.
    """
    if min_length > max_length:
        raise ValueError(
            f"min_length ({min_length}) must not exceed max_length ({max_length})"
        )

    attempts = 0
    while True:
        generated_text = my_generator(prompt)[0]['generated_text']
        attempts += 1
        if min_length <= len(generated_text) <= max_length:
            break
        if max_attempts is not None and attempts >= max_attempts:
            raise RuntimeError(
                f"no text of acceptable length generated after {attempts} attempts"
            )

    # Mark texts that stop mid-sentence with a trailing ellipsis.
    if not re.search(r'[.!?]$', generated_text):
        generated_text = generated_text + " ..."

    return generated_text


poem_generator2("None would miss me in all the world,", min_length=80, max_length=300)

poem_generator2("I miss you,", min_length=100, max_length=400)

### Test with Markovify (not used)

In [48]:
import markovify

In [49]:
# `text_lovePoems` is otherwise only assigned further down in the notebook, so
# on a fresh kernel this cell would fail with NameError. Load the corpus here
# so the cell runs top to bottom.
# NOTE(review): assumes "combined.txt" is the intended corpus, matching the
# later assignment of text_lovePoems — confirm.
with open("combined.txt") as fh:
    text_lovePoems = fh.read()
generator_mark = markovify.Text(text_lovePoems)

In [50]:
print(generator_mark.make_sentence())

On all her loveliness.


In [51]:
print(generator_mark.make_short_sentence(50, test_output=False))

Thus am I mine own prison.


In [52]:
print(generator_mark.make_short_sentence(400, test_output=False))

Though I tarry, wait for another life, for more suffering, To give them birth; another life and many more tears And love, to make her eat.


In [53]:
poems_model = markovify.NewlineText(text_lovePoems, state_size=1)

In [54]:
for i in range(8):
    print(poems_model.make_sentence())

Smiles, tears, hoped for evermore
Where the goblins cry:
Laura spoke in pain;
No later light with haste
It lurks like mine be
Many in love:
One call’d her resistance,
It was in secret steps, thou showest me in rosy morn,


In [55]:
print(poems_model.make_sentence(test_output=False))

The heart


### Giving a prompt

In [None]:
# Read the whole corpus into memory; the context manager closes the file
# handle as soon as the read finishes.
with open("combined.txt") as fh:
    text_lovePoems = fh.read()

In [57]:
poems_lines = text_lovePoems.split("\n")

In [58]:
from simpleneighbors import SimpleNeighbors

In [59]:
import numpy as np
import spacy
import random

In [60]:
nlp = spacy.load('en_core_web_md')

In [61]:
# Keep every non-empty corpus line, removing curly double quotes and trimming
# surrounding whitespace.
lines = [
    raw_line.replace('“', '').replace('”', '').strip()
    for raw_line in poems_lines
    if raw_line != ""
]

len(lines)

In [63]:
sampled_lines = random.sample(lines,len(lines))

In [64]:
def summary(sent):
    """Return the ``vector`` of ``sent`` as processed by the global ``nlp``.

    The text is passed to ``nlp`` with the 'parser', 'tagger' and 'ner'
    components listed in ``disable``; the ``vector`` attribute of the
    resulting object is returned.
    """
    skipped_components = ['parser', 'tagger', 'ner']
    doc = nlp(sent, disable=skipped_components)
    return doc.vector

In [65]:
embeddings = [summary(line) for line in sampled_lines]



In [66]:
# Index each sampled line under its embedding, then build the index so it can
# be queried.
# NOTE(review): 300 is presumably the dimensionality of the vectors returned
# by summary() — confirm against the loaded spaCy model.
lookup = SimpleNeighbors(300)
for text_line, vector in zip(sampled_lines, embeddings):
    lookup.add_one(text_line, vector)
lookup.build()

lookup.nearest(summary("miss"), 20)

poem_generator2("And wonder what you’ve missed.", min_length=100, max_length=400)

In [69]:
import random
import spacy

nlp = spacy.load('en_core_web_md')
input_prompt="I sit here, an arch-villain of romance, thinking about you." ## change here

In [77]:
doc = nlp(text_lovePoems)

In [78]:
words = [w for w in list(doc) if w.is_alpha]

In [80]:
nouns = [w for w in words if w.pos_ == "NOUN"]
verbs = [w for w in words if w.pos_ == "VERB"]

In [112]:
nouns_string = [str(w) for w in nouns]

In [70]:
# Parse the prompt and bucket its alphabetic tokens by part of speech.
doc_prompt = nlp(input_prompt)
words_in_prompt = [tok for tok in doc_prompt if tok.is_alpha]
nouns_in_prompt = [tok for tok in words_in_prompt if tok.pos_ == "NOUN"]
verbs_in_prompt = [tok for tok in words_in_prompt if tok.pos_ == "VERB"]

# Prefer a verb from the prompt, then a noun; if the prompt has neither, fall
# back to a random word from the corpus. An empty list is falsy, so `or`
# selects the first non-empty pool.
prompt_word = random.choice(verbs_in_prompt or nouns_in_prompt or words)

print(prompt_word)

thinking


In [88]:
# Ask the index for 10 corpus lines nearest to the embedding of the chosen
# prompt word (converted to a plain string before embedding).
nearst_corpus_lists = lookup.nearest(summary(str(prompt_word)), 10)
# Pick one of those lines at random and use it as the prompt for
# poem_generator2, requesting a text of 100 to 400 characters.
content_generated = poem_generator2(random.choice(nearst_corpus_lists), min_length=100, max_length=400)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [89]:
print(content_generated)

'And all the world and I seemed much less cold,I sat all day,Even half my night,I had no more time to think.You shall not think I love you.But you shall not think I love you,Nor did you think ...'