In [None]:
from random import randint

import torch
from transformers import pipeline
import pandas as pd
from datasets import Dataset
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

In [None]:
fillmask = pipeline("fill-mask", model="almanach/camembertv2-base")
mask_token = fillmask.tokenizer.mask_token

In [None]:
def augment_data(examples):
    outputs = []
    for sentence in examples["Avis"]:
        words = sentence.split(' ')
        K = randint(1, len(words)-1)
        masked_sentence = " ".join(words[:K]  + [mask_token] + words[K+1:])
        predictions = fillmask(masked_sentence)
        augmented_sequences = [predictions[i]["sequence"] for i in range(2)]
        outputs += [sentence] + augmented_sequences
    return {"data": outputs}

In [12]:
df_train = pd.read_csv("../data/ftdataset_train.tsv", sep=' *\t *', encoding='utf-8', engine='python')
df_val = pd.read_csv("../data/ftdataset_val.tsv", sep=' *\t *', encoding='utf-8', engine='python')
df_test = pd.read_csv("../data/ftdataset_test.tsv", sep=' *\t *', encoding='utf-8', engine='python')

In [None]:
modified_data = df_train[df_train["Avis"].apply(len) <= 512][:9]
# Conversion en Dataset
dataset = Dataset.from_pandas(modified_data)
# Application de la fonction augment_data par lots
modified_data = dataset.map(augment_data, batched=True, remove_columns=dataset.column_names,batch_size=8)
modified_data

In [None]:
from transformers import AutoTokenizer

print(len(df_train))
print(len(modified_data["data"]))
[df_train.Ambiance.value_counts(),
df_train.Cuisine.value_counts(),
df_train.Prix.value_counts(),
df_train.Service.value_counts()]

In [None]:
modified_data_512 = df_train[df_train["Avis"].apply(len) > 1024][:9]

In [None]:
tokenizer = AutoTokenizer.from_pretrained("almanach/camembertv2-base")
dataset_512 = Dataset.from_pandas(modified_data_512)
sequences = [dataset_512["Avis"][0]]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
batch["labels"] = torch.tensor([1, 1])
dataset_512.features

In [13]:
from datasets import Dataset, ClassLabel
# Convertir le DataFrame en Dataset
dataset = {}
dataset["train"] = Dataset.from_pandas(df_train)
dataset["test"] = Dataset.from_pandas(df_test)

# Fonction de transformation pour convertir les colonnes en ClassLabel
# Définir les étiquettes de classe pour chaque colonne
labels = ["Négative", "Neutre", "Positive", "NE"]
class_label = ClassLabel(names=labels)

# Fonction de transformation pour convertir les colonnes en ClassLabel
def transform_labels(example):
    example["Prix"] = class_label.encode_example(str(example["Prix"]))
    example["Cuisine"] = class_label.encode_example(str(example["Cuisine"]))
    example["Service"] = class_label.encode_example(str(example["Service"]))
    example["Ambiance"] = class_label.encode_example(str(example["Ambiance"]))
    return example
# Appliquer la transformation au dataset test
dataset["train"] = dataset["train"].map(transform_labels)
dataset["test"] = dataset["test"].map(transform_labels)

Map: 100%|██████████| 4471/4471 [00:00<00:00, 5087.96 examples/s]
Map: 100%|██████████| 902/902 [00:00<00:00, 5100.72 examples/s]


In [152]:
# Afficher le dataset transformé
print(dataset["train"]["labels"][0])
print(dataset["train"]["Prix"][0], dataset["train"]["Cuisine"][0], dataset["train"]["Service"][0], dataset["train"]["Ambiance"][0])

[2, 2, 2, 2]
Positive Positive Positive Positive


In [154]:
classes = ["Négative", "Neutre", "Positive", "NE"]
class2id = {class_:id for id, class_ in enumerate(classes)}
id2class = {id:class_ for class_, id in class2id.items()}
print(classes)
print(class2id)
print(id2class)

['Négative', 'Neutre', 'Positive', 'NE']
{'Négative': 0, 'Neutre': 1, 'Positive': 2, 'NE': 3}
{0: 'Négative', 1: 'Neutre', 2: 'Positive', 3: 'NE'}


In [189]:
from transformers import DataCollatorWithPadding
def preprocess_function(example):
    return tokenizer(example["Avis"], truncation=True)

tokenized_dataset = {}
tokenized_dataset["train"] = dataset["train"].map(preprocess_function)
tokenized_dataset["test"] = dataset["test"].map(preprocess_function)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

Map: 100%|██████████| 4471/4471 [00:03<00:00, 1129.06 examples/s]
Map: 100%|██████████| 902/902 [00:00<00:00, 1137.98 examples/s]


DataCollatorWithPadding(tokenizer=RobertaTokenizerFast(name_or_path='almanach/camembertv2-base', vocab_size=32768, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), padding=True, max_length=None, pad_to_multip

In [186]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("almanach/camembertv2-base", num_labels=len(classes),id2label=id2class, label2id=class2id,problem_type = "multi_label_classification")

tokenized_dataset

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at almanach/camembertv2-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train': Dataset({
     features: ['NomDuGroupe', 'Restaurant', 'Note', 'Prix', 'Cuisine', 'Service', 'Ambiance', 'Avis', 'URL', 'labels', 'input_ids', 'attention_mask'],
     num_rows: 4471
 }),
 'test': Dataset({
     features: ['NomDuGroupe', 'Restaurant', 'Note', 'Prix', 'Cuisine', 'Service', 'Ambiance', 'Avis', 'URL', 'labels', 'input_ids', 'attention_mask'],
     num_rows: 902
 })}

In [190]:
tokenized_dataset["train"] = tokenized_dataset["train"].remove_columns(["Restaurant","Avis", "Prix", "Cuisine", "Service", "Ambiance","NomDuGroupe","Note","URL"])
tokenized_dataset["test"] = tokenized_dataset["test"].remove_columns(["Restaurant","Avis", "Prix", "Cuisine", "Service", "Ambiance","NomDuGroupe","Note","URL"])
tokenized_dataset["train"].set_format("torch")
tokenized_dataset["test"].set_format("torch")

In [44]:
tokenized_dataset

NameError: name 'tokenized_dataset' is not defined

In [192]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_dataset["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_dataset["test"], batch_size=8, collate_fn=data_collator
)

In [193]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([8, 4]),
 'input_ids': torch.Size([8, 113]),
 'attention_mask': torch.Size([8, 113])}

In [200]:

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

1677


In [201]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cpu')

In [202]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/1677 [00:00<?, ?it/s]

RuntimeError: result type Float can't be cast to the desired output type Long

In [48]:
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd


from datasets import Dataset, ClassLabel

# Charger le dataset
df_train = pd.read_csv("../data/ftdataset_train.tsv", sep=' *\t *', encoding='utf-8', engine='python')
df_test = pd.read_csv("../data/ftdataset_test.tsv", sep=' *\t *', encoding='utf-8', engine='python')

# Convertir le DataFrame en Dataset
dataset = {}
dataset["train"] = Dataset.from_pandas(df_train)
dataset["test"] = Dataset.from_pandas(df_test)



# Appliquer la transformation au dataset
dataset["train"] = dataset["train"].map(transform_labels)
dataset["test"] = dataset["test"].map(transform_labels)

# Afficher le dataset transformé
print(dataset["train"]["Prix"][0], dataset["train"]["Cuisine"][0], dataset["train"]["Service"][0],dataset["train"]["Ambiance"][0])

Map: 100%|██████████| 4471/4471 [00:00<00:00, 5244.56 examples/s]
Map: 100%|██████████| 902/902 [00:00<00:00, 5260.05 examples/s]

2 2 2 2





In [56]:
# Convertir le Dataset en DataFrame pour calculer les moyennes
df_transformed = dataset["train"].to_pandas()

# Calculer les moyennes des colonnes spécifiées
moyennes = df_transformed[['Prix', 'Cuisine', 'Ambiance', 'Service']].describe()
nombre_de_uns = (df_transformed[['Prix', 'Cuisine', 'Ambiance', 'Service']] == 1).sum()
nombre_de_deux = (df_transformed[['Prix', 'Cuisine', 'Ambiance', 'Service']] == 2).sum()
nombre_de_trois = (df_transformed[['Prix', 'Cuisine', 'Ambiance', 'Service']] == 3).sum()
nombre_de_zero = (df_transformed[['Prix', 'Cuisine', 'Ambiance', 'Service']] == 0).sum()
# Afficher les moyennes
print(moyennes)

              Prix      Cuisine     Ambiance      Service
count  4471.000000  4471.000000  4471.000000  4471.000000
mean      2.362335     1.739432     2.242899     1.828450
std       1.039598     0.798130     0.925915     0.927027
min       0.000000     0.000000     0.000000     0.000000
25%       2.000000     2.000000     2.000000     2.000000
50%       3.000000     2.000000     2.000000     2.000000
75%       3.000000     2.000000     3.000000     2.000000
max       3.000000     3.000000     3.000000     3.000000
