In [3]:
import pandas as pd

In [4]:
# Load the dataframe stored in bhf.feather and preview it before filtering.
bhf_raw = pd.read_feather("./bhf.feather")
display(bhf_raw.head())

BHF_ROOT = "https://www.bhf.org.uk"
INFO_ROOT = "https://www.bhf.org.uk/informationsupport"

# Site sections whose pages are discarded wholesale.
excluded_sections = ["/heart-matters-magazine", "/healthy-eating", "/publications"]

# Explicit overrides of the path-depth rule implemented in `is_article_url`.
keep = ["https://www.bhf.org.uk/informationsupport/how-a-healthy-heart-works"]
remove = ["https://www.bhf.org.uk/informationsupport/conditions",
          "https://www.bhf.org.uk/informationsupport/conditions/az-of-heart-and-circulatory-diseases",
          "https://www.bhf.org.uk/informationsupport/support/cardiac-rehabilitation-at-home",
          "https://www.bhf.org.uk/informationsupport/support/children-and-young-people"
]


def is_article_url(url):
    """Return True when `url` should stay in the dataset.

    A URL is kept when it is listed in `keep`, or when it lives under
    `INFO_ROOT`, has more than one "/" after that prefix, and is not listed
    in `remove`. URLs outside `INFO_ROOT` are rejected instead of raising
    IndexError, which the previous `split(...)[1]` lookup did.
    """
    if url in keep:
        return True
    if url in remove or not url.startswith(INFO_ROOT):
        return False
    return url[len(INFO_ROOT):].count("/") > 1


# Drop the links that are not from the BHF website; na=False also drops rows
# whose URL is missing so the boolean mask never contains NaN.
df = bhf_raw[bhf_raw.url.str.startswith(BHF_ROOT, na=False)]

# Remove the unwanted sections (literal substring match, not a regex).
for section in excluded_sections:
    df = df[~df.url.str.contains(section, regex=False, na=False)]

urls = [u for u in df.url.to_list() if is_article_url(u)]
df = df[df.url.isin(urls)]
df.shape

ImportError: Missing optional dependency 'pyarrow'.  Use pip or conda to install pyarrow.

In [None]:
# only keep the text from the web page
from bs4 import BeautifulSoup

def _text_or_none(node):
    """Return `node.text`, or None when the lookup found no matching tag."""
    return node.text if node is not None else None


# Parse every stored page; `df.content` is decoded as UTF-8 before parsing.
soups = [BeautifulSoup(i.decode("utf-8"), "html.parser") for i in df.content]

# Headline and body are looked up the same way: a page missing either one
# yields None (instead of raising AttributeError on `.text`) and is then
# removed by `dropna()` below.
name = [_text_or_none(soup.find("h1", {"itemprop": "headline"})) for soup in soups]
content = [_text_or_none(soup.find("section", {"class": "c-text-component"})) for soup in soups]

# `name` and `content` are positional lists with one entry per row of `df`.
ds = pd.DataFrame({"name": name, "content": content, "url": df.url})
ds = ds.dropna()

In [None]:
# Prefix every article body with its title. When the body does not already
# begin with a newline, one is inserted so the title sits on its own line.
titled_articles = [
    "Article title: " + title + (body if body.startswith("\n") else "\n" + body)
    for body, title in zip(ds.content, ds.name)
]

ds["full"] = titled_articles

In [None]:
import re
import json
from sklearn.model_selection import train_test_split


# Fixed seed so the train/test split (and therefore train.txt / test.txt)
# is identical on every Restart & Run All.
SPLIT_SEED = 42

# Hold out 15% of the articles for evaluation.
train, test = train_test_split(ds["full"], test_size=0.15, random_state=SPLIT_SEED)

# Every article must land in exactly one of the two splits.
assert len(train) + len(test) == len(ds)

print("Train dataset length: " + str(len(train)))
print("Test dataset length: " + str(len(test)))


In [None]:
def _write_articles(path, articles):
    """Write `articles` to `path`, each wrapped in START/END ARTICLE markers."""
    opening = "#### START ARTICLE ####\n\n"
    between = "\n#### END ARTICLE ####\n\n#### START ARTICLE ####\n\n"
    closing = "\n#### END ARTICLE ####\n"
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(opening + between.join(articles) + closing)


# One plain-text corpus file per split.
_write_articles("train.txt", train.to_list())
_write_articles("test.txt", test.to_list())

In [None]:
from transformers import AutoTokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the GPT-2 tokenizer published under the name 'gpt2'; the same object is
# later passed to `load_dataset` to tokenize train.txt and test.txt.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/test datasets and the collator for causal LM training.

    Args:
        train_path: Path of the text file holding the training corpus.
        test_path: Path of the text file holding the evaluation corpus.
        tokenizer: Tokenizer handed to both datasets and to the collator.
        block_size: Value forwarded as `block_size` to each `TextDataset`.
            Defaults to 128, the value that was previously hard-coded.

    Returns:
        A `(train_dataset, test_dataset, data_collator)` tuple.
    """
    def _text_dataset(path):
        # Both splits are built with identical settings; only the file differs.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=path,
            block_size=block_size)

    train_dataset = _text_dataset(train_path)
    test_dataset = _text_dataset(test_path)

    # mlm=False: the collator is configured without masked-language-model masking.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

# Tokenize the two corpus files written earlier and build the matching collator.
train_dataset, test_dataset, data_collator = load_dataset(
    train_path="./train.txt",
    test_path="./test.txt",
    tokenizer=tokenizer,
)

# Initialize `Trainer` with `TrainingArguments` and GPT-2 model

The [Trainer](https://huggingface.co/transformers/main_classes/trainer.html#transformers.Trainer) class provides an API for feature-complete training. It is used in most of the [example scripts](https://huggingface.co/transformers/examples.html) from Huggingface. Before we can instantiate our `Trainer`, we need to download our GPT-2 model and create a [TrainingArguments](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments) object to access all the points of customization during training. In the `TrainingArguments`, we can define the hyperparameters we are going to use in the training process, such as `learning_rate`, `num_train_epochs`, or `per_device_train_batch_size`. You can find a complete list [here](https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments).

In [None]:
from transformers import Trainer, TrainingArguments

# Start from the pretrained 'gpt2' weights; fine-tuning updates this model.
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Hyperparameters collected in one place, then unpacked into TrainingArguments.
# NOTE(review): no evaluation strategy is set here, so `eval_steps` may have no
# effect in some transformers versions — confirm evaluation actually runs.
training_config = dict(
    output_dir="./gpt2-medical",       # where checkpoints and the final model go
    overwrite_output_dir=True,         # reuse the directory if it already exists
    num_train_epochs=3,                # passes over the training set
    per_device_train_batch_size=16,    # training batch size per device
    per_device_eval_batch_size=64,     # evaluation batch size per device
    eval_steps=400,                    # update steps between two evaluations
    save_steps=800,                    # update steps between two checkpoints
    warmup_steps=500,                  # warmup steps for the learning-rate scheduler
    prediction_loss_only=True,
)
training_args = TrainingArguments(**training_config)

# Wire the model, arguments, collator and both datasets together.
trainer_config = {
    "model": model,
    "args": training_args,
    "data_collator": data_collator,
    "train_dataset": train_dataset,
    "eval_dataset": test_dataset,
}
trainer = Trainer(**trainer_config)

In [None]:
# Run fine-tuning with the model, arguments and datasets given to `trainer`.
trainer.train()

After training is done you can save the model by calling `save_model()`. This will save the trained model to our `output_dir` from our `TrainingArguments`.

In [None]:
# Persist the trained model; with no argument it is written to the
# `output_dir` configured in `TrainingArguments`.
trainer.save_model()

# Test the model

To test the model, we are going to use another [highlight of the transformers library](https://huggingface.co/transformers/main_classes/pipelines.html?highlight=pipelines) called `pipeline`. [Pipelines](https://huggingface.co/transformers/main_classes/pipelines.html?highlight=pipelines) are objects that offer a simple API dedicated to several tasks, including `text-generation`.

In [None]:
from transformers import pipeline

# Load the model from the directory the training cell uses as `output_dir`
# ("./gpt2-medical") together with the same 'gpt2' tokenizer used to build the
# training data. The previous path and tokenizer pointed at an unrelated
# German checkpoint that this notebook never produces.
# NOTE(review): `config` is passed as a plain dict, as before; confirm the
# installed transformers version accepts that, otherwise pass `max_length`
# when calling the pipeline instead.
# The name `chef` is kept because the following cells call it.
chef = pipeline('text-generation', model='./gpt2-medical', tokenizer='gpt2', config={'max_length':800})


In [None]:
# Prompt in the same "Article title: ..." layout used to build the training
# text, so the model continues with an article body.
chef('Article title: Heart attack\n')

In [None]:
# Second sample prompt, again in the "Article title: ..." training layout.
chef('Article title: High blood pressure\n')

In [None]:
# Title plus the opening words of the body, to steer the continuation.
chef('Article title: Angina\nAngina is')


In [None]:
# Show only the generated string of the first returned candidate.
chef('Article title: High blood pressure\n')[0]['generated_text']