In [1]:
import pandas as pd
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, TrainingArguments, Trainer, pipeline
from datasets import Dataset

In [2]:
from transformers import set_seed

set_seed(42)

### Load dataset

In [3]:
df = pd.read_csv('cleaning/dataset.csv')
df = df.rename(columns={'0': 'text'})

In [4]:
df = df.replace(r'\n',' ', regex=True) 
print(df.iloc[0]['text'])

Discours sur la politique de rigueur, 28 décembre 1958     Le 21 décembre, le général de Gaulle a été élu Président de la République française et de  la  Communauté.  En  annonçant  qu'il  accepte  le  mandat  qui  lui  a  été  confié,  il  expose  comment il l'accomplira et donne les raisons de la politique financière de rigueur qui va  être mise en œuvre.    Avant tout, Françaises, Français, je veux vous dire que j'accepte le mandat que vous m'avez  confié. Votre décision fut marquée lors de la crise nationale du mois de mai, affirmée par le  référendum,  répétée  par  les  élections,  précisée  par  le  vote  des  élus  dimanche  dernier.  La  tâche nationale qui m'incombe depuis dix-huit ans se trouve, de ce fait, confirmée. Guide de  la France et Chef de l'État républicain, j'exercerai le pouvoir suprême dans toute l'étendue  qu'il comporte désormais et suivant l'esprit nouveau qui me l'a fait attribuer.  L'appel  qui  m'est  adressé  par  le  pays  exprime  son  instinct  du  sal

In [5]:
dataset = Dataset.from_pandas(df)

In [6]:
print(dataset)

Dataset({
    features: ['text'],
    num_rows: 89
})


In [7]:
dataset = dataset.train_test_split(test_size=0.2)

In [8]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 71
    })
    test: Dataset({
        features: ['text'],
        num_rows: 18
    })
})


In [9]:
tokenizer = AutoTokenizer.from_pretrained('antoinelouis/belgpt2')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
def preprocess_function(examples):
    
    return tokenizer([x for x in examples['text']], truncation=False)

In [11]:
tokenized_dataset = dataset.map(

    preprocess_function,

    batched=True,

    num_proc=12,

    remove_columns = dataset["train"].column_names,

)

                        

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

                        

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 71
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 18
    })
})


In [13]:
block_size = 128

def group_texts(examples):

    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}

    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size

    result = {

        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]

        for k, t in concatenated_examples.items()

    }

    result["labels"] = result["input_ids"].copy()

    return result

In [14]:
lm_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=12)

                        

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

                        

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

In [15]:
print(lm_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 852
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 239
    })
})


In [16]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training

In [17]:
model = AutoModelForCausalLM.from_pretrained('antoinelouis/belgpt2')

In [18]:
training_args = TrainingArguments(

    output_dir="CharlesDeGaulle-GPT",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
    num_train_epochs=11,
    per_device_eval_batch_size=64,
    per_device_train_batch_size=64

)

In [19]:
print(training_args)

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_eval=False,
label_names=None,
label_s

In [None]:
trainer = Trainer(

    model=model,

    args=training_args,

    train_dataset=lm_dataset["train"],

    eval_dataset=lm_dataset["test"],

    data_collator=data_collator

)

trainer.train()

/home/thomaslemenestrel/Documents/charles_de_gaulle_speeches/charles_de_gaulle_speeches/CharlesDeGaulle-GPT is already a clone of https://huggingface.co/tlemenestrel/CharlesDeGaulle-GPT. Make sure you pull the latest changes with `repo.git_pull()`.
***** Running training *****
  Num examples = 852
  Num Epochs = 50
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 700


Epoch,Training Loss,Validation Loss
1,No log,3.33881
2,No log,2.988494
3,No log,2.836293
4,No log,2.750417
5,No log,2.702802
6,No log,2.671404
7,No log,2.647637
8,No log,2.638451
9,No log,2.633578
10,No log,2.625381


***** Running Evaluation *****
  Num examples = 239
  Batch size = 64
***** Running Evaluation *****
  Num examples = 239
  Batch size = 64
***** Running Evaluation *****
  Num examples = 239
  Batch size = 64
***** Running Evaluation *****
  Num examples = 239
  Batch size = 64
***** Running Evaluation *****
  Num examples = 239
  Batch size = 64
***** Running Evaluation *****
  Num examples = 239
  Batch size = 64
***** Running Evaluation *****
  Num examples = 239
  Batch size = 64
***** Running Evaluation *****
  Num examples = 239
  Batch size = 64
***** Running Evaluation *****
  Num examples = 239
  Batch size = 64
***** Running Evaluation *****
  Num examples = 239
  Batch size = 64
***** Running Evaluation *****
  Num examples = 239
  Batch size = 64
***** Running Evaluation *****
  Num examples = 239
  Batch size = 64
***** Running Evaluation *****
  Num examples = 239
  Batch size = 64
***** Running Evaluation *****
  Num examples = 239
  Batch size = 64
***** Running Evalua

### Save to the hub

In [None]:
#trainer.push_to_hub()
#tokenizer.push_to_hub('https://huggingface.co/tlemenestrel/CharlesDeGaulle-GPT')

# Inference

In [None]:
prompt = "Les gens"

In [None]:
inputs = tokenizer(prompt, return_tensors="pt").input_ids

In [None]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("CharlesDeGaulle-GPT")

outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)

In [None]:
tokenizer.batch_decode(outputs, skip_special_tokens=True)