In [None]:
# installing huggingface libraries for dataset, models and metrics
!pip install datasets transformers[sentencepiece] sacrebleu

!pip install numpy==1.24.3
!pip install -U transformers
!pip install -U accelerate



In [None]:
import warnings

from datasets import load_dataset, load_metric, DatasetDict, Dataset
import transformers
import datasets
import random
import pandas as pd
from IPython.display import display, HTML
import zipfile
import io
import numpy as np

warnings.filterwarnings('ignore')

In [None]:
# Hub identifier of the pretrained checkpoint; the model and the tokenizer
# are both loaded from this name further down.
model_checkpoint = "microsoft/prophetnet-large-uncased"

In [None]:
FILEPATH = 'filtered_paranmt.zip'


def unzip_tsv(filepath=FILEPATH):
    """Load ``filtered.tsv`` from a zip archive into a pandas DataFrame.

    Args:
        filepath: Path of the zip archive. Defaults to ``FILEPATH``.

    Returns:
        The DataFrame parsed from the tab-separated member ``filtered.tsv``.
    """
    archive = zipfile.ZipFile(filepath, 'r')
    try:
        raw_bytes = archive.read("filtered.tsv")
    finally:
        # always release the archive handle, even if the member is missing
        archive.close()

    tsv_text = str(raw_bytes, "utf-8")
    return pd.read_csv(io.StringIO(tsv_text), delimiter="\t")


def determine_toxic(df):
    """Re-label each sentence pair as (toxic, neutral) by toxicity score.

    The input frame must contain the columns ``reference``, ``translation``,
    ``ref_tox`` and ``trn_tox``. For every row the text with the strictly
    higher score becomes ``toxic`` and the other one ``neutral``; on a tie the
    translation is treated as the toxic side and the reference as the neutral
    side.

    Args:
        df: DataFrame with the four columns listed above. It is NOT modified.

    Returns:
        A new DataFrame in which the four source columns are replaced by
        ``toxic``, ``toxic_tox``, ``neutral`` and ``neutral_tox`` (appended in
        that order after the remaining original columns).
    """
    reference, translation = df['reference'], df['translation']
    ref_tox, trn_tox = df['ref_tox'], df['trn_tox']
    scores = df[['ref_tox', 'trn_tox']]

    # Vectorised selection instead of a row-wise apply. `assign` works on a
    # copy, so the caller's frame does not gain the helper columns.
    # The two comparisons are kept separate (rather than negating one mask)
    # so rows with a missing score are handled exactly as before.
    relabelled = df.assign(
        toxic=reference.where(ref_tox > trn_tox, translation),
        toxic_tox=scores.max(axis=1),
        neutral=reference.where(ref_tox <= trn_tox, translation),
        neutral_tox=scores.min(axis=1),
    )

    # Drop the old columns
    return relabelled.drop(columns=['reference', 'translation', 'ref_tox', 'trn_tox'])


def formalize_dataset(df):
    """Turn a DataFrame into a train/validation/test ``DatasetDict``.

    20% of the rows are held out as the test split; 20% of the remainder is
    then held out as the validation split.

    Args:
        df: DataFrame to convert with ``Dataset.from_pandas``.

    Returns:
        ``DatasetDict`` with the keys ``train``, ``validation`` and ``test``.
    """
    full_dataset = Dataset.from_pandas(df)

    # first cut: (train + validation) vs. test
    outer_split = full_dataset.train_test_split(test_size=0.2)
    train_val_dataset = outer_split["train"]
    test_dataset = outer_split["test"]
    print(train_val_dataset, test_dataset)

    # second cut: train vs. validation
    inner_split = train_val_dataset.train_test_split(test_size=0.2)

    return DatasetDict({
        'train': inner_split["train"],
        'validation': inner_split["test"],
        'test': test_dataset,
    })


def get_dataset_dict(df=None):
    """Build the split ``DatasetDict``, loading the data first if needed.

    Args:
        df: Already prepared DataFrame. When ``None`` the default archive is
            read with ``unzip_tsv`` and relabelled with ``determine_toxic``.

    Returns:
        The result of ``formalize_dataset`` for the chosen DataFrame.
    """
    if df is None:
        df = determine_toxic(unzip_tsv())
    return formalize_dataset(df)

In [None]:
# Seed via the transformers helper before the dataset is split below.
transformers.set_seed(42)

# Load the ParaNMT pairs and relabel them as toxic/neutral; `df` is kept
# because later cells sample rows from it directly.
df = determine_toxic(unzip_tsv())
raw_datasets = get_dataset_dict(df)

# Load the BLEU metric (sacreBLEU implementation).
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm the installed version
# still provides it.
metric = load_metric("sacrebleu")

raw_datasets

Dataset({
    features: ['Unnamed: 0', 'similarity', 'lenght_diff', 'toxic', 'toxic_tox', 'neutral', 'neutral_tox'],
    num_rows: 462221
}) Dataset({
    features: ['Unnamed: 0', 'similarity', 'lenght_diff', 'toxic', 'toxic_tox', 'neutral', 'neutral_tox'],
    num_rows: 115556
})


DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'similarity', 'lenght_diff', 'toxic', 'toxic_tox', 'neutral', 'neutral_tox'],
        num_rows: 369776
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'similarity', 'lenght_diff', 'toxic', 'toxic_tox', 'neutral', 'neutral_tox'],
        num_rows: 92445
    })
    test: Dataset({
        features: ['Unnamed: 0', 'similarity', 'lenght_diff', 'toxic', 'toxic_tox', 'neutral', 'neutral_tox'],
        num_rows: 115556
    })
})

In [None]:
# peek at the first five rows of the training split
raw_datasets["train"][:5]

{'Unnamed: 0': [388388, 367847, 106189, 405734, 21830],
 'similarity': [0.825834810877,
  0.787850557714,
  0.8681826442899999,
  0.7335714624070001,
  0.839950002449],
 'lenght_diff': [0.1153846153846153,
  0.2105263157894736,
  0.05,
  0.16,
  0.1830985915492957],
 'toxic': ['how about a little urine?',
  "All right, Mr. Fuckin' Compassion, I will call somebody!",
  "what's wrong with collecting garbage?",
  "you're driving me crazy.",
  "I kicked Brad so I could be with you and I'd do it again."],
 'toxic_tox': [0.8562554717063904,
  0.99751615524292,
  0.764396071434021,
  0.9080155491828918,
  0.9336898922920228],
 'neutral': ['What about some urine?',
  "all right, Mr. Kurvasous, I'll call someone!",
  "So, what's wrong with picking up trash?",
  "I'm crazy about you.",
  "I kicked Brad out of the equation to be with you, and I'd do it again."],
 'neutral_tox': [0.0115808770060539,
  7.118633948266506e-05,
  0.003911392763257,
  0.0006224995595403,
  0.0002570915094111]}

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# load the pretrained seq2seq model for the selected checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# display the module tree
model

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.57G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

ProphetNetForConditionalGeneration(
  (prophetnet): ProphetNetModel(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (encoder): ProphetNetEncoder(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): ProphetNetPositionalEmbeddings(512, 1024, padding_idx=0)
      (embeddings_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x ProphetNetEncoderLayer(
          (self_attn): ProphetNetAttention(
            (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (feed_forward): ProphetNetFeedForward(
            (a

In [None]:
from transformers import AutoTokenizer

# load the tokenizer that belongs to the same checkpoint as the model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Downloading (…)prophetnet.tokenizer:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

ProphetNetTokenizer(name_or_path='microsoft/prophetnet-large-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
max_input_length = 128
max_target_length = 128
prefix = "detoxify the following text (make it less toxic while preserving the meaning):"
ref = "toxic"
trn = "neutral"

def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for seq2seq fine-tuning.

    Args:
        examples: Batch mapping that holds the source texts under ``ref`` and
            the target texts under ``trn``.

    Returns:
        The tokenizer output for the prefixed source texts, with the target
        token ids stored under ``"labels"``.
    """
    # every source sentence is prepended with the task prefix
    prompts = [prefix + source_text for source_text in examples[ref]]
    model_inputs = tokenizer(prompts, max_length=max_input_length, truncation=True)

    # targets go through the same tokenizer, truncated to their own limit
    target_encoding = tokenizer(list(examples[trn]), max_length=max_target_length, truncation=True)
    model_inputs["labels"] = target_encoding["input_ids"]

    return model_inputs

In [None]:
# sanity check: tokenize the first two training rows and inspect the result
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[20010, 11636, 8757, 1996, 2206, 3793, 1006, 2191, 2009, 2625, 11704, 2096, 15224, 1996, 3574, 1007, 1024, 2129, 2055, 1037, 2210, 17996, 1029, 102], [20010, 11636, 8757, 1996, 2206, 3793, 1006, 2191, 2009, 2625, 11704, 2096, 15224, 1996, 3574, 1007, 1024, 2035, 2157, 1010, 2720, 1012, 6616, 2378, 1005, 15398, 1010, 1045, 2097, 2655, 8307, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[2054, 2055, 2070, 17996, 1029, 102], [2035, 2157, 1010, 2720, 1012, 13970, 19146, 6499, 2271, 1010, 1045, 1005, 2222, 2655, 2619, 999, 102]]}

In [None]:
# Number of leading rows kept per split to keep fine-tuning time manageable.
SUBSET_SIZES = {'train': 30000, 'validation': 2500, 'test': 2500}

# Build a NEW DatasetDict instead of aliasing `raw_datasets`: assigning into an
# alias would silently truncate `raw_datasets` as well. The size is clamped so
# a split smaller than the requested subset is kept whole instead of raising.
cropped_datasets = DatasetDict({
    split: raw_datasets[split].select(range(min(size, raw_datasets[split].num_rows)))
    for split, size in SUBSET_SIZES.items()
})

# tokenize every split in batches, then show one processed training example
tokenized_datasets = cropped_datasets.map(preprocess_function, batched=True)
tokenized_datasets['train'][0]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

{'Unnamed: 0': 388388,
 'similarity': 0.825834810877,
 'lenght_diff': 0.1153846153846153,
 'toxic': 'how about a little urine?',
 'toxic_tox': 0.8562554717063904,
 'neutral': 'What about some urine?',
 'neutral_tox': 0.0115808770060539,
 'input_ids': [20010,
  11636,
  8757,
  1996,
  2206,
  3793,
  1006,
  2191,
  2009,
  2625,
  11704,
  2096,
  15224,
  1996,
  3574,
  1007,
  1024,
  2129,
  2055,
  1037,
  2210,
  17996,
  1029,
  102],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [2054, 2055, 2070, 17996, 1029, 102]}

In [None]:
# show which accelerate version is installed in this kernel
import accelerate
accelerate.__version__

'0.24.1'

In [None]:
# defining the parameters for training
batch_size = 32
# checkpoint name without the organisation part, used for the output directory
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-detox",
    evaluation_strategy = "epoch",
    # checkpoint on the same schedule as evaluation; this is required for
    # `load_best_model_at_end` to be able to pick an evaluated checkpoint
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=8,
    predict_with_generate=True,
    fp16=True,
    report_to='tensorboard',
    # Restore the checkpoint with the lowest validation loss once training
    # finishes, so that the model saved afterwards is the best-scoring one
    # and not simply the state after the final epoch.
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

In [None]:
# batch collator built from the tokenizer and the model; handed to the trainer below
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# simple postprocessing for text
def postprocess_text(preds, labels):
    """Strip whitespace from decoded predictions and references.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        Tuple ``(preds, labels)`` where ``preds`` is a list of stripped
        strings and ``labels`` is a list of single-element lists, each holding
        one stripped reference.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    # each reference is wrapped in its own list
    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

# compute metrics function from the Lab5
def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for an evaluation pass.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            itself be a tuple, in which case its first element is used.

    Returns:
        Dict with ``"bleu"`` (the ``score`` reported by the metric) and
        ``"gen_len"`` (mean number of non-padding tokens per prediction), both
        rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a masking placeholder, not a vocabulary id, so it cannot be
    # decoded. It was already replaced in the labels; do the same for the
    # predictions, which would otherwise break decoding and inflate `gen_len`.
    # NOTE(review): whether predictions actually contain -100 depends on the
    # installed transformers version — the replacement is a no-op otherwise.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # generated length = number of non-padding token ids per prediction
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Wire the model, training arguments, tokenized splits, collator and metric
# callback into a single trainer object.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# run fine-tuning
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,2.4518,2.069872,27.2468,15.352
2,1.9294,2.006852,26.7696,16.0568
3,1.683,1.975318,24.7231,16.6936
4,1.502,1.997321,24.4891,16.6216
5,1.3622,2.024966,24.1689,16.85
6,1.2517,2.064383,24.172,16.8564
7,1.1754,2.080637,23.7872,16.8384


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,2.4518,2.069872,27.2468,15.352
2,1.9294,2.006852,26.7696,16.0568
3,1.683,1.975318,24.7231,16.6936
4,1.502,1.997321,24.4891,16.6216
5,1.3622,2.024966,24.1689,16.85
6,1.2517,2.064383,24.172,16.8564
7,1.1754,2.080637,23.7872,16.8384
8,1.1109,2.103028,23.6867,16.8372


TrainOutput(global_step=7504, training_loss=1.5385969279925706, metrics={'train_runtime': 11277.0494, 'train_samples_per_second': 21.282, 'train_steps_per_second': 0.665, 'total_flos': 3.1007888725180416e+16, 'train_loss': 1.5385969279925706, 'epoch': 8.0})

In [None]:
# Persist the model currently held by the trainer to the directory 'best'.
# NOTE(review): unless the training arguments enable `load_best_model_at_end`,
# this is the state after the last epoch, not the best-scoring checkpoint.
trainer.save_model('best')

In [None]:
# reload the fine-tuned weights saved under 'best' and put the model in eval mode
model = AutoModelForSeq2SeqLM.from_pretrained('best')
model.eval()
# NOTE(review): turning the cache off presumably mirrors the training-time
# warning about gradient checkpointing; for plain generation it likely only
# costs speed — confirm it is still needed here.
model.config.use_cache = False

In [None]:
def detoxify(model, inference_request, tokenizer=tokenizer):
    """Generate a rewrite for ``inference_request`` and print it.

    Two lines are printed: the original sentence (the request without the
    leading task ``prefix``) and the decoded first generated sequence.

    Args:
        model: Object exposing ``generate(input_ids=...)``.
        inference_request: Prompt text, normally ``prefix`` followed by the
            sentence to rewrite.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
            Defaults to the notebook-level ``tokenizer``.

    Returns:
        None.
    """
    input_ids = tokenizer(inference_request, return_tensors="pt").input_ids
    outputs = model.generate(input_ids=input_ids)

    # `str.strip(prefix)` treats `prefix` as a SET of characters and removes
    # any of them from both ends, which chopped letters off the sentence
    # itself. Cut off the literal prefix instead.
    if inference_request.startswith(prefix):
        original_text = inference_request[len(prefix):]
    else:
        original_text = inference_request

    print(f'Initial message: {original_text}')
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))


# Sentences to run through the fine-tuned model: two hand-written ones, three
# rows taken from the end of `df`, and one row from the cropped test split.
sample_texts = [
    "Heck that shit is hittin' hard, what an assignment!",
    "Heck the night is so damn cold out here",
    df.iloc[-10]['toxic'],
    df.iloc[-2]['toxic'],
    df.iloc[-1]['toxic'],
    cropped_datasets['test'][3]['toxic'],
]

for sample_text in sample_texts:
    # the request must carry the same task prefix that was used in training
    inference_request = prefix + sample_text
    detoxify(model, inference_request, tokenizer)

Initial message: Heck that shit is hittin' hard, what an assignment!
hell, it's hitting him really hard, what an assignment!
Initial message: Heck the night is so damn cold ou
god, it's so damn cold out here.
Initial message: Who the hell is ringing my bell?
who's ringing the doorbell at my house?
Initial message: They called me a fucking hero. The truth is I didn't care anymore.
they called me a hero, but the truth is, i didn't care anymore.
Initial message: I didn't fuck him.
i didn't sleep with him either.
Initial message: ur heads in front of a live god.
lower your heads in front of a living god.
