In [1]:
import pandas as pd
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, TrainingArguments, Trainer, pipeline
from datasets import Dataset
from pynvml import *

In [2]:
from transformers import set_seed

set_seed(42)

In [3]:
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")
    
def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()
    
print_gpu_utilization()

GPU memory occupied: 756 MB.


### Load dataset

In [4]:
df = pd.read_csv('cleaning/dataset.csv')
df = df.rename(columns={'0': 'text'})

In [5]:
df = df.replace(r'\n',' ', regex=True) 
print(df.iloc[0]['text'])

Discours sur la politique de rigueur, 28 décembre 1958     Le 21 décembre, le général de Gaulle a été élu Président de la République française et de  la  Communauté.  En  annonçant  qu'il  accepte  le  mandat  qui  lui  a  été  confié,  il  expose  comment il l'accomplira et donne les raisons de la politique financière de rigueur qui va  être mise en œuvre.    Avant tout, Françaises, Français, je veux vous dire que j'accepte le mandat que vous m'avez  confié. Votre décision fut marquée lors de la crise nationale du mois de mai, affirmée par le  référendum,  répétée  par  les  élections,  précisée  par  le  vote  des  élus  dimanche  dernier.  La  tâche nationale qui m'incombe depuis dix-huit ans se trouve, de ce fait, confirmée. Guide de  la France et Chef de l'État républicain, j'exercerai le pouvoir suprême dans toute l'étendue  qu'il comporte désormais et suivant l'esprit nouveau qui me l'a fait attribuer.  L'appel  qui  m'est  adressé  par  le  pays  exprime  son  instinct  du  sal

In [6]:
dataset = Dataset.from_pandas(df)

In [7]:
print(dataset)

Dataset({
    features: ['text'],
    num_rows: 89
})


In [8]:
dataset = dataset.train_test_split(test_size=0.2)

In [9]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 71
    })
    test: Dataset({
        features: ['text'],
        num_rows: 18
    })
})


In [10]:
tokenizer = AutoTokenizer.from_pretrained('antoinelouis/belgpt2')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
def preprocess_function(examples):
    
    return tokenizer([x for x in examples['text']], truncation=False)

In [12]:
tokenized_dataset = dataset.map(

    preprocess_function,

    batched=True,

    num_proc=12,

    remove_columns = dataset["train"].column_names,

)

                        

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

                        

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

In [13]:
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 71
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 18
    })
})


In [14]:
block_size = 1024

def group_texts(examples):

    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}

    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size

    result = {

        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]

        for k, t in concatenated_examples.items()

    }

    result["labels"] = result["input_ids"].copy()

    return result

In [15]:
lm_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=12)

                        

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

                        

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

In [16]:
print(lm_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 102
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 25
    })
})


In [17]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training

In [18]:
model = AutoModelForCausalLM.from_pretrained('antoinelouis/belgpt2')

In [19]:
training_args = TrainingArguments(

    output_dir="CharlesDeGaulle-GPT",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
    num_train_epochs=8,
    per_device_eval_batch_size=4,
    per_device_train_batch_size=4
)

In [20]:
trainer = Trainer(

    model=model,

    args=training_args,

    train_dataset=lm_dataset["train"],

    eval_dataset=lm_dataset["test"],

    data_collator=data_collator

)

result = trainer.train()
print_summary(result)

/home/thomaslemenestrel/Documents/charles_de_gaulle_speeches/charles_de_gaulle_speeches/CharlesDeGaulle-GPT is already a clone of https://huggingface.co/tlemenestrel/CharlesDeGaulle-GPT. Make sure you pull the latest changes with `repo.git_pull()`.
***** Running training *****
  Num examples = 102
  Num Epochs = 8
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 208


Epoch,Training Loss,Validation Loss
1,No log,2.993861
2,No log,2.764109
3,No log,2.662133
4,No log,2.612947
5,No log,2.589722
6,No log,2.572188
7,No log,2.563035
8,No log,2.561931


***** Running Evaluation *****
  Num examples = 25
  Batch size = 4
***** Running Evaluation *****
  Num examples = 25
  Batch size = 4
***** Running Evaluation *****
  Num examples = 25
  Batch size = 4
***** Running Evaluation *****
  Num examples = 25
  Batch size = 4
***** Running Evaluation *****
  Num examples = 25
  Batch size = 4
***** Running Evaluation *****
  Num examples = 25
  Batch size = 4
***** Running Evaluation *****
  Num examples = 25
  Batch size = 4
***** Running Evaluation *****
  Num examples = 25
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)




Time: 78.25
Samples/second: 10.43
GPU memory occupied: 15995 MB.


### Save to the hub

In [None]:
trainer.push_to_hub()
tokenizer.push_to_hub('https://huggingface.co/tlemenestrel/CharlesDeGaulle-GPT')

Saving model checkpoint to CharlesDeGaulle-GPT
Configuration saved in CharlesDeGaulle-GPT/config.json
Model weights saved in CharlesDeGaulle-GPT/pytorch_model.bin
Several commits (4) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 32.0k/487M [00:00<?, ?B/s]

Upload file runs/Jan16_17-22-21_pop-os/events.out.tfevents.1673889746.pop-os: 100%|##########| 6.96k/6.96k [00…

Upload file runs/Jan16_17-24-31_pop-os/events.out.tfevents.1673889875.pop-os: 100%|##########| 6.96k/6.96k [00…

Upload file runs/Jan16_17-41-39_pop-os/events.out.tfevents.1673890903.pop-os: 100%|##########| 6.96k/6.96k [00…

Upload file runs/Jan16_17-01-18_pop-os/events.out.tfevents.1673888482.pop-os: 100%|##########| 14.2k/14.2k [00…

Upload file runs/Jan16_17-38-42_pop-os/events.out.tfevents.1673890726.pop-os: 100%|##########| 6.96k/6.96k [00…

Upload file runs/Jan16_17-09-37_pop-os/events.out.tfevents.1673888981.pop-os: 100%|##########| 6.96k/6.96k [00…

Upload file runs/Jan16_17-45-44_pop-os/events.out.tfevents.1673891149.pop-os: 100%|##########| 6.96k/6.96k [00…

Upload file runs/Jan16_17-26-59_pop-os/events.out.tfevents.1673890024.pop-os: 100%|##########| 6.96k/6.96k [00…

Upload file runs/Jan16_17-06-43_pop-os/events.out.tfevents.1673888807.pop-os: 100%|##########| 6.96k/6.96k [00…

Upload file runs/Jan16_17-34-26_pop-os/events.out.tfevents.1673890471.pop-os: 100%|##########| 6.94k/6.94k [00…

Upload file runs/Jan16_17-51-39_pop-os/events.out.tfevents.1673891503.pop-os: 100%|##########| 6.64k/6.64k [00…

Upload file runs/Jan16_17-53-52_pop-os/events.out.tfevents.1673891636.pop-os: 100%|##########| 6.19k/6.19k [00…

Upload file runs/Jan16_17-51-39_pop-os/1673891503.643478/events.out.tfevents.1673891503.pop-os: 100%|#########…

Upload file runs/Jan16_17-06-43_pop-os/1673888807.6749291/events.out.tfevents.1673888807.pop-os: 100%|########…

Upload file runs/Jan16_17-09-37_pop-os/1673888981.895866/events.out.tfevents.1673888981.pop-os: 100%|#########…

Upload file runs/Jan16_17-16-46_pop-os/1673889411.1423275/events.out.tfevents.1673889411.pop-os: 100%|########…

Upload file runs/Jan16_17-22-21_pop-os/1673889746.486646/events.out.tfevents.1673889746.pop-os: 100%|#########…

Upload file runs/Jan16_17-24-31_pop-os/1673889875.849308/events.out.tfevents.1673889875.pop-os: 100%|#########…

Upload file runs/Jan16_17-26-59_pop-os/1673890024.1327515/events.out.tfevents.1673890024.pop-os: 100%|########…

Upload file runs/Jan16_17-31-13_pop-os/1673890277.697291/events.out.tfevents.1673890277.pop-os: 100%|#########…

Upload file runs/Jan16_17-33-15_pop-os/1673890400.2220488/events.out.tfevents.1673890400.pop-os: 100%|########…

Upload file runs/Jan16_17-53-52_pop-os/1673891636.989779/events.out.tfevents.1673891636.pop-os: 100%|#########…

Upload file runs/Jan16_17-34-26_pop-os/1673890471.4536057/events.out.tfevents.1673890471.pop-os: 100%|########…

Upload file runs/Jan16_17-38-42_pop-os/1673890726.8715713/events.out.tfevents.1673890726.pop-os: 100%|########…

Upload file runs/Jan16_17-41-39_pop-os/1673890903.8444293/events.out.tfevents.1673890903.pop-os: 100%|########…

Upload file runs/Jan16_17-44-28_pop-os/1673891073.429549/events.out.tfevents.1673891073.pop-os: 100%|#########…

Upload file runs/Jan16_17-45-44_pop-os/1673891149.0631733/events.out.tfevents.1673891149.pop-os: 100%|########…

Upload file runs/Jan16_17-48-50_pop-os/1673891334.6777077/events.out.tfevents.1673891334.pop-os: 100%|########…

Upload file runs/Jan16_17-50-06_pop-os/1673891410.9653618/events.out.tfevents.1673891410.pop-os: 100%|########…

Upload file runs/Jan16_17-33-15_pop-os/events.out.tfevents.1673890400.pop-os: 100%|##########| 3.75k/3.75k [00…

Upload file runs/Jan16_17-31-13_pop-os/events.out.tfevents.1673890277.pop-os: 100%|##########| 3.75k/3.75k [00…

Upload file runs/Jan16_17-16-46_pop-os/events.out.tfevents.1673889411.pop-os: 100%|##########| 3.75k/3.75k [00…

Upload file runs/Jan16_17-44-28_pop-os/events.out.tfevents.1673891073.pop-os: 100%|##########| 3.75k/3.75k [00…

Upload file runs/Jan16_17-48-50_pop-os/events.out.tfevents.1673891334.pop-os: 100%|##########| 3.75k/3.75k [00…

Upload file runs/Jan16_17-50-06_pop-os/events.out.tfevents.1673891410.pop-os: 100%|##########| 3.74k/3.74k [00…

Upload file training_args.bin: 100%|##########| 3.23k/3.23k [00:00<?, ?B/s]

# Inference

In [26]:
prompt = "Le peuple de France"

In [27]:
inputs = tokenizer(prompt, return_tensors="pt").input_ids

In [28]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("CharlesDeGaulle-GPT")

outputs = model.generate(inputs, max_new_tokens=500, do_sample=True, top_k=50, top_p=0.95, min_length=250)

loading configuration file CharlesDeGaulle-GPT/config.json
Model config GPT2Config {
  "_name_or_path": "CharlesDeGaulle-GPT",
  "_num_labels": 2,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.21.0",
  "use_cache": true,
  "vocab_size": 50257
}

loading weights file CharlesDeGaulle-GPT/pytorch_model.bin
All 

In [29]:
tokenizer.batch_decode(outputs, skip_special_tokens=True)

["Le peuple de France ne cesse  de rappeler l'édifice de Notre-Dame-des-Dames, il y a cent ans, en commémoration de  la délivrance  de  la  Provence  et  de  la  Renaissance  par  l'Alsace.  Que  l'Histoire,  depuis  les  temps  anciens,  a  toujours  su  distinguer  entre  la  belle-mère  de  la  France  et  la  belle-sœur,  entre  la  dame  de  la  famille  royale  et  la  dame  de  l'impérialisme.  Que,  pendant  l'on  passe  quatre  ans,  il  est  venu  des  nations  d'Europe  qui  ont  eu  d'un  génie  français  celui  de  faire  triompher  leur  idéal  dans  le  monde  lointain.  Qu'y eut-il de plus magnifique que la victoire de la France? Que s'est-il  passé  dans  les  veines  de  la  France  qui  ait  permis  à  cette  France,  cette  France,  cette  France,  de  se  redresser !  Qu'il  ait  été  là  pour  nous  apporter  ce  qu'elle  voulait  de  mieux  dans  l'ordre  du  renseignement  -  nous  savons le  dire  -  c'est-à-dire qu'en  moins de deux ans, elle a acquis toutes l