In [1]:
import xml.etree.ElementTree as ET

tree = ET.parse('elden_ring.xml')
with open("dark_souls3.txt", "r") as f:
    ds = f.readlines()

elden_ring = [child.text for child in tree.find("entries")]
dark_souls = [t for t in ds if t.strip() != ""]
texts = elden_ring + dark_souls
texts[90:100]

['The conqueror of the stars, General Radahn.',
 'And the Blade of Miquella, Malenia the Severed.',
 'These two were the mightiest to remain, and locked horns in combat.',
 'But there would be no victor.',
 'And so, we inhabit a fractured world.',
 'Awaiting the arrival of the Elden Lord.',
 'Unless of course, thou shouldst take the crown?',
 'It happened an age ago.',
 'But when I recall, I see it true.',
 "On a night of wint'ry fog."]

In [2]:
len(texts)

10346

In [3]:
from transformers import AutoConfig
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

MODEL_NAME = "gpt2"
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME)

# the eos and bos tokens are defined
bos = '<|endoftext|>'
eos = '<|EOS|>'
pad = '<|pad|>'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad}

# the new token is added to the tokenizer
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# the model config to which we add the special tokens
config = AutoConfig.from_pretrained('gpt2', 
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    output_hidden_states=False)

# the pre-trained model is loaded with the custom configuration
model = GPT2LMHeadModel.from_pretrained('gpt2', config=config)

# the model embedding is resized
model.resize_token_embeddings(len(tokenizer))

  from .autonotebook import tqdm as notebook_tqdm
2023-05-24 13:07:19.575932: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.7/lib64
2023-05-24 13:07:19.576011: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.7/lib64


Embedding(50259, 768)

In [4]:
texts

['(dummyText)',
 '(dummyText)',
 'The Tarnished will soon return.',
 'Guided by Grace once lost.',
 'The Golden Order is broken to its core.',
 'They will fight. And they will die. In an unending curse.',
 'For how else is a champion, or a lord, to be born?',
 'Brandish the Elden Ring.',
 '...for all of us.',
 'The fallen leaves have spoken.',
 'Of the return of the Tarnished. To the Lands Between.',
 'Elden Ring, O, Elden Ring.',
 'Shaper of life, arbiter of fate.',
 'Lord of the very stars above.',
 'Tarnished warrior, spurned by the grace of gold.',
 'Seek that which hath been lost to thee.',
 'Across the Sea of Fog, to the Lands Between.',
 'Seek the Elden Ring!',
 'Brave Tarnished. Mighty warriors, who fought at my side.',
 'Await the summons. It will call to thee one day.',
 'Heed the fading grace. Listen to the Fingers.',
 'Gold commandeth the very stars,',
 'giving life its fullest brilliance.',
 'Elden Ring, O, Elden Ring.',
 'Be ready, once it is shattered.',
 'Seek the Elden

In [5]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

texts = [tokenizer.bos_token + " " + t + " " + tokenizer.eos_token for t in texts]
train, validation = train_test_split(texts, test_size=0.05, random_state=7)
train = Dataset.from_dict({"text": train})
validation = Dataset.from_dict({"text": validation})
train, validation

(Dataset({
     features: ['text'],
     num_rows: 9828
 }),
 Dataset({
     features: ['text'],
     num_rows: 518
 }))

In [6]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding=True)

tokenized_train_dataset = train.map(
    tokenize_function,
    batched=True,
    num_proc=5,
    remove_columns=['text'],
)
tokenized_val_dataset = validation.map(
    tokenize_function,
    batched=True,
    num_proc=5,
    remove_columns=['text'],
)

tokenized_train_dataset, tokenized_val_dataset

                                                                              

(Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 9828
 }),
 Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 518
 }))

In [9]:
tokenizer.decode(tokenized_train_dataset['input_ids'][50])

'<|endoftext|> Should you become Elden Lord. <|EOS|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|>'

In [10]:
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer

OUTPUT_DIR = "./results"

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,           # output directory
    num_train_epochs=6,              # total # of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=200,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=OUTPUT_DIR,          # directory for storing logs
    prediction_loss_only=True,
    save_steps=10000 
)

data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

In [8]:
trainer = Trainer(
    model=model,    # the instantiated  Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,         # training dataset
    eval_dataset=tokenized_val_dataset            # evaluation dataset
)

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33memrecncelik[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,8.3144
1000,2.6694


TrainOutput(global_step=1848, training_loss=4.054680332993016, metrics={'train_runtime': 507.2287, 'train_samples_per_second': 116.255, 'train_steps_per_second': 3.643, 'total_flos': 1444487675904000.0, 'train_loss': 4.054680332993016, 'epoch': 6.0})

In [9]:
trainer.save_model()
tokenizer.save_pretrained(OUTPUT_DIR)
model.save_pretrained(OUTPUT_DIR)

In [10]:
trainer.evaluate()

{'eval_loss': 2.648031711578369,
 'eval_runtime': 1.4969,
 'eval_samples_per_second': 346.057,
 'eval_steps_per_second': 22.046,
 'epoch': 6.0}

In [11]:
def generate_n_text_samples(model, tokenizer, input_text, device, n_samples=5):
    text_ids = tokenizer.encode(input_text, return_tensors='pt')
    text_ids = text_ids.to(device)
    model = model.to(device)

    generated_text_samples = model.generate(
        text_ids, 
        max_length=300,
        num_return_sequences=n_samples,
        no_repeat_ngram_size=2,
        repetition_penalty=1.5,
        top_p=0.92,
        temperature=.85,
        do_sample=True,
        top_k=125,
        early_stopping= True
    )
    
    gen_text = []
    for t in generated_text_samples:
        text = tokenizer.decode(t, skip_special_tokens=True)
        gen_text.append(text)

    return gen_text


In [16]:
# trained model loading
souls_model = GPT2LMHeadModel.from_pretrained(OUTPUT_DIR)
souls_tokenizer = GPT2TokenizerFast.from_pretrained(OUTPUT_DIR)

device = "cuda:0"

input_text = souls_tokenizer.bos_token + " Fire keeper"
quotes = generate_n_text_samples(souls_model, souls_tokenizer, 
                                 input_text, device, n_samples=20)
for q in quotes:
    print(q)
    print()


 Fire keeper, a fitting name for you. 

 Fire keeper... 

 Fire keeper... 

 Fire keeper, what have I done? 

 Fire keeper, allow me to speak with you in person. 

 Fire keeper! 

 Fire keeper... 

 Fire keeper. You might as well join me, for a moment longer...
 

 Fire keeper, mayhaps the fire of vengeance guides your way. 

 Fire keeper, thou'rt a true knight of valour.
 

 Fire keeper! Stop this, please. 

 Fire keeper, don't you dare try and kill me. 

 Fire keeper. 

 Fire keeper, this must be the greatest task I ever had. 

 Fire keeper... 

 Fire keeper. I could've sworn a fireman like you had my back! 

 Fire keeper, what's happened to you? 

 Fire keeper, welcome home. 

 Fire keeper, be sure to greet her.
 

 Fire keeper, you've no business in this castle.
 

