In [None]:
# Directory that is scanned for the raw ".txt" input files.
folder_path = './data'
import os
import json
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
# NOTE(review): GPT2LMHeadModel and GPT2Tokenizer are already imported two lines
# above; only GPT2Config is new here. Harmless, but redundant.
from transformers import  GPT2LMHeadModel, GPT2Tokenizer, GPT2Config

import nltk
from nltk.corpus import stopwords
# Make the NLTK stopword corpus available for stopwords.words() in the
# preprocessing cell.
nltk.download('stopwords')
# Regex word tokenizer: keeps runs of \w characters, so punctuation and
# whitespace never appear in the tokens.
# NOTE: train() and generate_text() each bind a local variable that is also
# called `tokenizer` (a GPT-2 tokenizer); this module-level NLTK tokenizer is a
# different object.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

In [None]:
# Read every ".txt" file in `folder_path` and split it into a title and a body.
# Expected file layout: the title, one blank line, then the article text.
if not os.path.isdir(folder_path):
    # Fail here with a clear message instead of leaving `data` undefined and
    # hitting a NameError in a later cell.
    raise FileNotFoundError(f"Data folder not found: {folder_path!r}")

# os.listdir() returns entries in arbitrary order; sort for reproducible runs.
file_list = sorted(os.listdir(folder_path))
data = []
for file_name in file_list:
    if not file_name.endswith(".txt"):
        continue
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # partition() never raises; `separator` is empty when the blank line that
    # divides title from text is missing.
    title, separator, text = content.partition('\n\n')
    if not separator:
        print(f"Skipping {file_path}: no blank line separating title from text")
        continue
    data.append({"[TEXT]": text, "[TITLE]": title})

In [None]:
# Preprocessing: reduce each article body to a space-separated string of
# alphanumeric, non-stopword tokens. Titles are left untouched.

# Build the stopword set once; it does not change between samples.
stopwords_set = set(stopwords.words('english'))

for data_sample in data:
    # Turn newlines into spaces so the last word of one line is not glued to
    # the first word of the next (deleting them outright would merge the two).
    text = data_sample["[TEXT]"].replace('\n', ' ')

    tokenized_text = tokenizer.tokenize(text)
    # Apply both filters in one pass so the stopword filter is not discarded.
    # The NLTK English stopword list is lowercase, so compare case-insensitively
    # to also drop capitalised stopwords such as "The".
    kept_tokens = [
        token for token in tokenized_text
        if token.isalnum() and token.lower() not in stopwords_set
    ]
    data_sample["[TEXT]"] = ' '.join(kept_tokens)

In [None]:
# Save the preprocessed samples as a single JSON document.
# ensure_ascii=False keeps non-ASCII characters as real characters instead of
# \uXXXX escape sequences, so the text the model is trained on contains the
# actual characters.
data_to_save = json.dumps(data, ensure_ascii=False)
file_path = 'preprocessed_data_dict.txt'
# Write UTF-8 explicitly (the input files are read as UTF-8 as well) rather
# than relying on the platform's default encoding.
with open(file_path, 'w', encoding='utf-8') as file:
    file.write(data_to_save)

In [None]:
# Path of the preprocessed JSON file; passed to train() as `train_file_path`.
data_dict_path = './preprocessed_data_dict.txt'

In [None]:
# Dataset construction
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` over the file at ``file_path``.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Forwarded as ``TextDataset``'s ``block_size`` (default 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    dataset_kwargs = dict(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )
    return TextDataset(**dataset_kwargs)

In [None]:
def load_data_collator(tokenizer, mlm = False):
    """Create a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Forwarded as the collator's ``mlm`` flag (default ``False``).

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [None]:
# Fine-tuning entry point: loads the tokenizer and model, then runs the Trainer
def train(
    train_file_path,
    model_name,
    output_dir,
    overwrite_output_dir,
    per_device_train_batch_size,
    num_train_epochs,
):
    """Train a pretrained GPT-2 LM-head model on a text file with ``Trainer``.

    Args:
        train_file_path: Text file passed to ``load_dataset``.
        model_name: Name or path given to the ``from_pretrained`` calls for the
            tokenizer, the config and the model.
        output_dir: Directory that receives the tokenizer, the initial model
            and, via ``TrainingArguments``, the Trainer output.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.

    Returns:
        None. The results are written to ``output_dir``.
    """
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    gpt2_config = GPT2Config.from_pretrained(model_name)
    dataset = load_dataset(train_file_path, gpt2_tokenizer)
    collator = load_data_collator(gpt2_tokenizer)

    # Store the tokenizer files in the same directory as the model.
    gpt2_tokenizer.save_pretrained(output_dir)

    lm_model = GPT2LMHeadModel.from_pretrained(
        pretrained_model_name_or_path=model_name,
        config=gpt2_config,
    )
    # The model is saved once before training; trainer.save_model() saves it
    # again after training.
    lm_model.save_pretrained(output_dir)

    trainer = Trainer(
        model=lm_model,
        args=TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=overwrite_output_dir,
            per_device_train_batch_size=per_device_train_batch_size,
            num_train_epochs=num_train_epochs,
        ),
        data_collator=collator,
        train_dataset=dataset,
    )

    trainer.train()
    trainer.save_model()

In [None]:
# Training configuration, consumed by the train() call.
# Checkpoint name given to the from_pretrained() calls inside train().
model_name = 'gpt2'
# Directory where train() saves the tokenizer, the model and the Trainer output.
output_dir = './training_out'
# Forwarded unchanged to TrainingArguments by train().
per_device_train_batch_size = 32
num_train_epochs = 10

In [None]:
# Launch fine-tuning with the configuration defined in the cells above.
train(
    train_file_path=data_dict_path,
    model_name=model_name,
    output_dir=output_dir,
    # train() declares overwrite_output_dir as a required parameter; leaving it
    # out raises TypeError before any training starts. False is the
    # TrainingArguments default.
    overwrite_output_dir=False,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
)

In [None]:
# Inference helpers
def load_model(model_path):
    """Return the ``GPT2LMHeadModel`` loaded from ``model_path``."""
    return GPT2LMHeadModel.from_pretrained(model_path)

def load_tokenizer(tokenizer_path):
    """Return the ``GPT2Tokenizer`` loaded from ``tokenizer_path``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

def generate_text(model_path, sequence, max_new_tokens=32, num_return_sequences=6):
    """Sample continuations of ``sequence`` from a saved model and print them.

    Args:
        model_path: Directory holding both the model and the tokenizer files.
        sequence: Prompt text; it is formatted into a string before encoding.
        max_new_tokens: Number of tokens to generate after the prompt. Without
            an explicit length, ``generate`` falls back to ``max_length=20``
            counted over prompt plus continuation, which a long prompt exceeds
            before anything is generated.
        num_return_sequences: Number of sampled continuations to generate.

    Returns:
        None. Every generated sequence (prompt included) is printed, one
        ``print`` call per sequence.
    """
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_new_tokens=max_new_tokens,
        # The EOS id is used as the padding id when calling generate().
        pad_token_id=model.config.eos_token_id,
        top_k=20,
        top_p=0.5,
        num_return_sequences=num_return_sequences,
    )
    # Show every sampled candidate instead of generating several and
    # discarding all but the first.
    for output in final_outputs:
        print(tokenizer.decode(output, skip_special_tokens=False))

In [None]:
# Inference sample: prompt the fine-tuned model with an article and let it
# continue after the "[TITLE]" key.
# Same directory string as `output_dir`, i.e. where train() saved the model and
# tokenizer.
trained_model_path = './training_out'
# The prompt uses the same "[TEXT]" / "[TITLE]" keys as the saved samples and
# ends right after the "[TITLE]" key.
# NOTE(review): unlike the saved training samples, this text has not gone
# through the preprocessing cell (punctuation and stopwords are still present)
# and the values are not JSON-quoted. Confirm that this mismatch is intended.
sequence_to_test = '{"[TEXT]": German airline Lufthansa has returned to profit in 2004 after posting huge losses in 2003. In a preliminary report, the airline announced net profits of 400m euros ($527.61m; £274.73m), compared with a loss of 984m euros in 2003. Operating profits were at 380m euros, ten times more than in 2003. Lufthansa was hit in 2003 by tough competition and a dip in demand following the Iraq war and the killer SARS virus. It was also hit by troubles at its US catering business. Last year, Lufthansa showed signs of recovery even as some European and US airlines were teetering on the brink of bankruptcy. The board of Lufthansa has recommended paying a 2004 dividend of 0.30 euros per share. In 2003, shareholders did not get a dividend. The company said that it will give all the details of its 2004 results on 23 March, "[TITLE]":   '
generate_text(trained_model_path, sequence_to_test) 