In [101]:
import pandas as pd
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, TrainingArguments, Trainer, pipeline
from datasets import Dataset

### Load dataset

In [102]:
df = pd.read_csv('cleaning/dataset.csv')
df = df.rename(columns={'0': 'text'})

In [103]:
df = df.replace(r'\n',' ', regex=True) 
print(df.iloc[0]['text'])

Discours sur la politique de rigueur, 28 décembre 1958     Le 21 décembre, le général de Gaulle a été élu Président de la République française et de  la  Communauté.  En  annonçant  qu'il  accepte  le  mandat  qui  lui  a  été  confié,  il  expose  comment il l'accomplira et donne les raisons de la politique financière de rigueur qui va  être mise en œuvre.    Avant tout, Françaises, Français, je veux vous dire que j'accepte le mandat que vous m'avez  confié. Votre décision fut marquée lors de la crise nationale du mois de mai, affirmée par le  référendum,  répétée  par  les  élections,  précisée  par  le  vote  des  élus  dimanche  dernier.  La  tâche nationale qui m'incombe depuis dix-huit ans se trouve, de ce fait, confirmée. Guide de  la France et Chef de l'État républicain, j'exercerai le pouvoir suprême dans toute l'étendue  qu'il comporte désormais et suivant l'esprit nouveau qui me l'a fait attribuer.  L'appel  qui  m'est  adressé  par  le  pays  exprime  son  instinct  du  sal

In [104]:
dataset = Dataset.from_pandas(df)

In [105]:
print(dataset)

Dataset({
    features: ['text'],
    num_rows: 89
})


In [106]:
dataset = dataset.train_test_split(test_size=0.2)

In [107]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 71
    })
    test: Dataset({
        features: ['text'],
        num_rows: 18
    })
})


In [108]:
tokenizer = AutoTokenizer.from_pretrained('antoinelouis/belgpt2', model_max_length=1024)

loading configuration file https://huggingface.co/antoinelouis/belgpt2/resolve/main/config.json from cache at /home/thomaslemenestrel/.cache/huggingface/transformers/30c809ae2b56ecf39b7fb51fd4671b13975477a156c7582cef7d392feb0794c3.b541b90fcc4441544dff1e715cbeeb949327ab3c3af94ce222054edac5d578e9
Model config GPT2Config {
  "_name_or_path": "antoinelouis/belgpt2",
  "_num_labels": 2,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": 

In [109]:
def preprocess_function(examples):
    
    return tokenizer([x for x in examples['text']], truncation=True)

In [110]:
tokenized_dataset = dataset.map(

    preprocess_function,

    batched=True,

    num_proc=12,

    remove_columns = dataset["train"].column_names,

)

                        

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

                        

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

In [111]:
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 71
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 18
    })
})


In [112]:
block_size = 128

def group_texts(examples):

    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}

    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size

    result = {

        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]

        for k, t in concatenated_examples.items()

    }

    result["labels"] = result["input_ids"].copy()

    return result

In [113]:
lm_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=12)

                        

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

                        

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

In [114]:
print(lm_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 469
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 115
    })
})


In [115]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training

In [116]:
model = AutoModelForCausalLM.from_pretrained('antoinelouis/belgpt2')

loading configuration file https://huggingface.co/antoinelouis/belgpt2/resolve/main/config.json from cache at /home/thomaslemenestrel/.cache/huggingface/transformers/30c809ae2b56ecf39b7fb51fd4671b13975477a156c7582cef7d392feb0794c3.b541b90fcc4441544dff1e715cbeeb949327ab3c3af94ce222054edac5d578e9
Model config GPT2Config {
  "_name_or_path": "antoinelouis/belgpt2",
  "_num_labels": 2,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": 

In [120]:
training_args = TrainingArguments(

    output_dir="CharlesDeGaulle-GPT",

    evaluation_strategy="epoch",

    learning_rate=2e-5,

    weight_decay=0.01,

    push_to_hub=True,

)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [121]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [122]:
trainer = Trainer(

    model=model,

    args=training_args,

    train_dataset=lm_dataset["train"],

    eval_dataset=lm_dataset["test"],

    data_collator=data_collator

)

trainer.train()

Cloning https://huggingface.co/tlemenestrel/CharlesDeGaulle-GPT into local empty directory.
***** Running training *****
  Num examples = 469
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 177


Epoch,Training Loss,Validation Loss
1,No log,2.919655
2,No log,2.779793
3,No log,2.748848


***** Running Evaluation *****
  Num examples = 115
  Batch size = 8
***** Running Evaluation *****
  Num examples = 115
  Batch size = 8
***** Running Evaluation *****
  Num examples = 115
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=177, training_loss=2.8570887678760593, metrics={'train_runtime': 16.9065, 'train_samples_per_second': 83.222, 'train_steps_per_second': 10.469, 'total_flos': 91909472256000.0, 'train_loss': 2.8570887678760593, 'epoch': 3.0})

In [123]:
trainer.push_to_hub()

Saving model checkpoint to CharlesDeGaulle-GPT
Configuration saved in CharlesDeGaulle-GPT/config.json
Model weights saved in CharlesDeGaulle-GPT/pytorch_model.bin


Upload file pytorch_model.bin:   0%|          | 32.0k/487M [00:00<?, ?B/s]

Upload file runs/Jan16_16-05-23_pop-os/events.out.tfevents.1673885132.pop-os: 100%|##########| 4.88k/4.88k [00…

Upload file runs/Jan16_16-05-23_pop-os/1673885132.7970252/events.out.tfevents.1673885132.pop-os: 100%|########…

Upload file training_args.bin: 100%|##########| 3.23k/3.23k [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/tlemenestrel/CharlesDeGaulle-GPT
   d71a767..d6e413f  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
To https://huggingface.co/tlemenestrel/CharlesDeGaulle-GPT
   d6e413f..4e49950  main -> main



'https://huggingface.co/tlemenestrel/CharlesDeGaulle-GPT/commit/d6e413ffd296669d33dde489ff9339b2bd32fe38'

In [148]:
tokenizer.push_to_hub('https://huggingface.co/tlemenestrel/CharlesDeGaulle-GPT')

Cloning https://huggingface.co/tlemenestrel/CharlesDeGaulle-GPT into local empty directory.


Download file pytorch_model.bin:   0%|          | 17.4k/487M [00:00<?, ?B/s]

Download file runs/Jan16_16-05-23_pop-os/1673885132.7970252/events.out.tfevents.1673885132.pop-os: 100%|######…

Clean file runs/Jan16_16-05-23_pop-os/1673885132.7970252/events.out.tfevents.1673885132.pop-os:  20%|#9       …

Download file runs/Jan16_16-05-23_pop-os/events.out.tfevents.1673885132.pop-os: 100%|##########| 4.88k/4.88k […

Download file training_args.bin: 100%|##########| 3.23k/3.23k [00:00<?, ?B/s]

Clean file training_args.bin:  31%|###       | 1.00k/3.23k [00:00<?, ?B/s]

Clean file runs/Jan16_16-05-23_pop-os/events.out.tfevents.1673885132.pop-os:  21%|##        | 1.00k/4.88k [00:…

Clean file pytorch_model.bin:   0%|          | 1.00k/487M [00:00<?, ?B/s]

tokenizer config file saved in https://huggingface.co/tlemenestrel/CharlesDeGaulle-GPT/tokenizer_config.json
Special tokens file saved in https://huggingface.co/tlemenestrel/CharlesDeGaulle-GPT/special_tokens_map.json
To https://huggingface.co/tlemenestrel/CharlesDeGaulle-GPT
   4e49950..79822b9  main -> main



'https://huggingface.co/tlemenestrel/CharlesDeGaulle-GPT/commit/79822b9a8bd5d1ffc45ce5c0491da56ed3420895'

# Inference

In [149]:
prompt = "Les gens"

In [150]:
inputs = tokenizer(prompt, return_tensors="pt").input_ids

In [151]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("CharlesDeGaulle-GPT")

outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)

loading configuration file CharlesDeGaulle-GPT/config.json
Model config GPT2Config {
  "_name_or_path": "CharlesDeGaulle-GPT",
  "_num_labels": 2,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.21.0",
  "use_cache": true,
  "vocab_size": 50257
}

loading weights file CharlesDeGaulle-GPT/pytorch_model.bin
All 

In [152]:
tokenizer.batch_decode(outputs, skip_special_tokens=True)

["Les gens peuvent voir les mêmes choses quand ils sont dans la rue à un point donné. Et vous, que voulez-vous? Ce que vous dites? Il ne s'en a donc pas fallu, en effet, pour que la nation, l'Afrique et l'Europe nous envoient un million, une fois de plus, dans la  crise. C'est ainsi, sans doute, car il y a dans la France, de quoi entretenir une profonde sympathie pour la France, pour le progrès"]