In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from parrot import Parrot
import warnings
from transformers import MarianMTModel, MarianTokenizer
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [None]:
# Read the CSV at `file_path` into a DataFrame and preview its first rows.
file_path = './data/cleaned_combined_dataset.csv'
df = pd.read_csv(file_path)
print(df.head())

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from one run to the next.
split_frames = train_test_split(df, test_size=0.2, random_state=42)
train_df, test_df = split_frames

# Report the shape of each split.
print(f"Ensemble de formation: {train_df.shape}")
print(f"Ensemble de test: {test_df.shape}")

In [None]:
# Instantiate the Parrot paraphraser from the T5-based checkpoint, requesting
# GPU usage. NOTE(review): whether `augment` also needs its own GPU flag
# depends on the installed Parrot version -- confirm against its docs.
parrot = Parrot(
  model_tag="prithivida/parrot_paraphraser_on_T5",
  use_gpu=True,
)

In [None]:
def _load_marian(checkpoint):
  """Return the (tokenizer, model) pair loaded from a Marian checkpoint name."""
  tokenizer = MarianTokenizer.from_pretrained(checkpoint)
  model = MarianMTModel.from_pretrained(checkpoint)
  return tokenizer, model


# French -> English translation pair.
model_name_fr_en = 'Helsinki-NLP/opus-mt-fr-en'
tokenizer_fr_en, model_fr_en = _load_marian(model_name_fr_en)

# English -> French translation pair (used to translate paraphrases back).
model_name_en_fr = 'Helsinki-NLP/opus-mt-en-fr'
tokenizer_en_fr, model_en_fr = _load_marian(model_name_en_fr)

In [None]:
def _translate(texts, tokenizer, model):
  """Translate each text with the given tokenizer/model pair.

  Args:
    texts: An iterable of strings. A single bare string is treated as a
      one-element list rather than being iterated character by character.
    tokenizer: Tokenizer used both to encode the input and decode the output.
    model: Seq2seq model whose `generate` produces the translated token ids.

  Returns:
    A list with one translated string per input text, in input order
    (an empty list for empty input).
  """
  if isinstance(texts, str):
    texts = [texts]

  translated_texts = []
  for text in texts:
    # Call the tokenizer directly: `prepare_seq2seq_batch` is deprecated in
    # Transformers and its warning is hidden by this notebook's blanket
    # warnings filter. Truncation keeps over-long inputs within the
    # tokenizer's maximum length.
    batch = tokenizer([text], return_tensors="pt", padding=True, truncation=True)
    generated_ids = model.generate(**batch)
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    translated_texts.append(decoded[0])
  return translated_texts


def translate_fr_to_en(texts):
  """Translate texts with the fr->en tokenizer/model pair.

  Args:
    texts: Iterable of strings (or a single string).

  Returns:
    List of translated strings, one per input text.
  """
  return _translate(texts, tokenizer_fr_en, model_fr_en)


def translate_en_to_fr(texts):
  """Translate texts with the en->fr tokenizer/model pair.

  Args:
    texts: Iterable of strings (or a single string).

  Returns:
    List of translated strings, one per input text.
  """
  return _translate(texts, tokenizer_en_fr, model_en_fr)

In [None]:
def paraphrase(text, num_return_sequences=3):
  """Return paraphrases of `text` produced by the module-level `parrot`.

  Args:
    text: Phrase passed to `parrot.augment` as `input_phrase`.
    num_return_sequences: Upper bound on how many paraphrases are kept; the
      result is a leading slice of whatever `augment` returned.

  Returns:
    A list holding the first element of each retained candidate, or an empty
    list when `augment` returns nothing.
  """
  candidates = parrot.augment(input_phrase=text)
  if not candidates:
    return []
  # Presumably each candidate is a (phrase, score) pair -- only item 0 is kept.
  kept = candidates[:num_return_sequences]
  return [candidate[0] for candidate in kept]

# Names of the DataFrame columns read by the augmentation step.
text_column, label_column = 'text', 'label'

In [None]:
# Augmentation pipeline: each training text is translated to English,
# paraphrased, and the paraphrases are translated back to French. The original
# row is always kept; every back-translated paraphrase is added with the same
# label. Runs only when the training split is non-empty and has both columns.
if not train_df.empty and text_column in train_df.columns and label_column in train_df.columns:
  augmented_texts = []
  augmented_labels = []

  # Walk the two columns in lockstep; this avoids iterrows(), which builds a
  # Series object for every row.
  text_label_pairs = zip(train_df[text_column], train_df[label_column])
  for original_text, original_label in tqdm(text_label_pairs, total=train_df.shape[0], desc="Processing"):
    augmented_texts.append(original_text)
    augmented_labels.append(original_label)

    # Missing or blank cells (e.g. NaN from read_csv) cannot be tokenized:
    # keep the row as-is but do not attempt to augment it.
    if not isinstance(original_text, str) or not original_text.strip():
      continue

    translated_text = translate_fr_to_en([original_text])[0]
    paraphrased_texts = paraphrase(translated_text)
    translated_back_texts = translate_en_to_fr(paraphrased_texts)

    augmented_texts.extend(translated_back_texts)
    augmented_labels.extend([original_label] * len(translated_back_texts))

  augmented_df = pd.DataFrame({
    text_column: augmented_texts,
    label_column: augmented_labels
  })

  # Persist the untouched splits alongside the augmented training set.
  train_df.to_csv('./data/train_dataset.csv', index=False)
  test_df.to_csv('./data/test_dataset.csv', index=False)
  augmented_df.to_csv('./data/augmented_train_dataset.csv', index=False)

  print("Fichiers sauvegardés avec succès.")
else:
  print("Les colonnes spécifiées ne sont pas présentes dans le DataFrame ou le DataFrame est vide.")