# Install requirements

In [6]:
!pip install -r ../requirements.txt

In [None]:
# Install the individual libraries used by this notebook into the kernel's environment.
# `-q` keeps pip's output quiet; `-U` upgrades the package if it is already installed.
%pip install pandas -q
%pip install torch -q
%pip install transformers -q
%pip install ipywidgets -q
# NOTE(review): nltk is the only package installed with `--user` — confirm this is intentional.
%pip install --user -U nltk -q
%pip install datasets sacrebleu -q
%pip install evaluate -q
%pip install accelerate -U -q

# Prepare data

### Preprocess

### Filter and split into train and validation sets

!python ../src/models/filter_split_train_val.py

# Test simple models - hypothesis

## Test the baseline model that all other results are compared against

### Initial dataset

In [None]:
# Baseline on the initial data: run the project's metric script on ../data/raw/filtered.tsv,
# passing the `reference` column itself as the predictions column.
# The file is tab-separated, hence the explicit --separator.
!python ../src/models/metric/compute_metric.py --dataframe_file=../data/raw/filtered.tsv --predictions_column=reference --separator='\t'

2023-10-23 18:28:14.450036: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-23 18:28:14.450093: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-23 18:28:14.450153: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-23 18:28:14.457112: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  dataframe = pd.read_csv(dataframe_file, sep=sepa

### Preprocessed dataset

In [1]:
# Baseline on the preprocessed data: same metric script, with the `reference` column
# passed as the predictions column, on ../data/interim/preprocessed.csv.
!python ../src/models/metric/compute_metric.py --dataframe_file=../data/interim/preprocessed.csv --predictions_column=reference

2023-10-23 18:32:29.174844: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-23 18:32:29.174898: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-23 18:32:29.174960: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-23 18:32:29.181973: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  dataframe = pd.read_csv(dataframe_file, sep=sepa

### Train dataset

In [3]:
# Baseline on the train split: same metric script, with the `reference` column
# passed as the predictions column, on ../data/interim/train.csv.
!python ../src/models/metric/compute_metric.py --dataframe_file=../data/interim/train.csv --predictions_column=reference

2023-10-23 18:35:01.055900: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-23 18:35:01.055947: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-23 18:35:01.055991: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-23 18:35:01.063060: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  dataframe = pd.read_csv(dataframe_file, sep=sepa

### Validation dataset

In [4]:
# Baseline on the validation split: same metric script, with the `reference` column
# passed as the predictions column, on ../data/interim/val.csv.
!python ../src/models/metric/compute_metric.py --dataframe_file=../data/interim/val.csv --predictions_column=reference

2023-10-23 18:36:32.138354: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-23 18:36:32.138399: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-23 18:36:32.138458: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-23 18:36:32.148301: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  dataframe = pd.read_csv(dataframe_file, sep=sepa

## Result

### As you can see, the highest baseline score (37%) is obtained on the filtered data (the train and validation datasets), so the final solution will be compared against that score.

## Test hypothesis 1 model (simple removal of swearing words)

### Fit the train data

### Evaluate on validation dataset

## The result of the simple solution is quite good: it is 10% higher than the baseline.

# Final model - T5 fine-tuning

# Imports

In [25]:
import os
# Remember the notebook's starting directory only on the first run: the %cd calls below
# change the working directory, so re-running this cell must not overwrite `current_dir`
# with wherever the kernel currently is.
current_dir = os.getcwd() if 'current_dir' not in locals() else current_dir
print(f'Current directory: {current_dir}')

# Automatically reload edited project modules so changes are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Project modules are imported by first changing into the directory that contains them.
%cd {current_dir}/../src/data
from make_dataset import load_dataframe
from dataframe_preprocessing import preprocess
from text_preprocessing import simple_row_preprocessing
from analysis.analyze import get_toxic_words

# NOTE: the working directory stays at src/visualization after this cell, so relative
# paths used by later cells are resolved from there.
%cd {current_dir}/../src/visualization
from visualize import visualize

from datasets import Dataset
from transformers import DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoTokenizer
import evaluate
import numpy as np

# import warnings
# warnings.simplefilter(action='ignore', category=FutureWarning)

Current directory: /workspaces/PMLDL_Assignment1/notebooks
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/workspaces/PMLDL_Assignment1/src/data
/workspaces/PMLDL_Assignment1/src/visualization


In [2]:
# Load the raw dataframe, then run the two project preprocessing stages in one chain:
#   1. `preprocess` — dataframe-level step (per the original note: adds more columns and
#      switches ref and trn);
#   2. `simple_row_preprocessing` — simple text preprocessing applied row by row.
df = load_dataframe()
df_processed = preprocess(df).apply(simple_row_preprocessing, axis=1)

TODO: train on a subset of the dataset: take rows whose reference toxicity score is > 75% (~0.94), whose toxicity difference is > 0.5, and whose translation toxicity is < 25% (~0.04), or similar thresholds.
+ The metric should also be computed on the initial, the preprocessed and the filtered datasets to serve as the baseline.

In [39]:
# Filter. Take only the most valuable. Drastically reduce the size of a dataset
df_for_training = df_processed[df_processed['length_difference'] < 15]
df_for_training = df_for_training[df_for_training['trn_length'] < 70]
df_for_training = df_for_training[df_for_training['trn_tox'] < 0.002]
df_for_training = df_for_training[df_for_training['ref_tox'] > 0.95]
df_for_training = df_for_training[df_for_training['similarity'] > 0.8]
print('number of datapoints:',len(df_for_training))
df_for_training.sort_values(by='length_difference', ascending=False).head()

number of datapoints: 449100


Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox,tox_diff,ref_length,trn_length,length_difference
110388,they ll come scared shitless i ll get the trut...,i m sure they ll come scared shitless i m gonn...,0.93,0.18,0.923222,0.377313,0.546,65,79,14
546382,the chef is eager for a rabbit or a wimp thari...,cook has a hankering to cook some rabbit or gr...,0.62,0.2,0.907365,8e-05,0.907,56,70,14
358156,most people want sex and meat so it means that...,most people are squirrely about sex and flesh ...,0.81,0.16,0.992469,0.011296,0.981,73,87,14
37266,now now give it back to me you silly little goose,now give it back to my little goose,0.79,0.27,0.999074,0.001795,0.997,51,37,14
190651,who knows if this is some fool s errand or the...,who knows if this is a crazy message or do we all,0.7,0.21,0.909461,0.000904,0.909,65,51,14


In [17]:
# Pretrained checkpoint that is fine-tuned below.
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
# The collator expects the model *object* (not the checkpoint name) so that it can use
# the model's `prepare_decoder_input_ids_from_labels` when building batches.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
metric = evaluate.load("sacrebleu")

# Task prefix prepended to every source sentence, at training and at inference time.
prefix = "paraphrase from toxic to neutral: "

training_args = Seq2SeqTrainingArguments(
    output_dir="output_dir",
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,  # cap the number of checkpoints kept on disk
    num_train_epochs=10,
    predict_with_generate=True,  # evaluation uses generate() so the metric sees generated text
    # fp16=True,
    report_to='tensorboard',
)

In [38]:
def preprocess_function(examples):
    """Tokenize a batch of reference/translation pairs for seq2seq training.

    Every ``reference`` text gets the module-level task ``prefix`` prepended and is
    tokenized as the model input; the matching ``translation`` text is passed as
    ``text_target``. Both are truncated to at most 128 tokens.

    Args:
        examples: batch mapping with ``"reference"`` and ``"translation"`` columns.

    Returns:
        Whatever the tokenizer call returns for this batch.
    """
    prompted_sources = [prefix + source for source in examples["reference"]]
    return tokenizer(
        prompted_sources,
        text_target=examples["translation"],
        max_length=128,
        truncation=True,
    )

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded texts before scoring.

    Args:
        preds: decoded prediction strings.
        labels: decoded reference strings.

    Returns:
        Tuple ``(preds, labels)``: predictions as a flat list of stripped strings,
        and labels as a list of one-element lists, each holding the stripped
        reference for the prediction at the same position.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute the SacreBLEU score and mean generated length for an evaluation pass.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may be a
            tuple, in which case only its first element is used.

    Returns:
        dict with ``"bleu"`` (SacreBLEU score) and ``"gen_len"`` (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding sentinel, not a real token id, so it cannot be decoded.
    # It can show up in the generated predictions as well as in the labels, so
    # both arrays are mapped back to the tokenizer's pad id before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of non-padding tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Training

In [40]:
# Positional split of the filtered pairs: the first 400000 rows are used for training,
# the remaining rows for validation. Both parts are tokenized in batches.
text_pairs = df_for_training[['reference', 'translation']]
train_dataset = Dataset.from_pandas(text_pairs.iloc[:400000]).map(preprocess_function, batched=True)
val_dataset = Dataset.from_pandas(text_pairs.iloc[400000:]).map(preprocess_function, batched=True)

Map:   0%|          | 0/400000 [00:00<?, ? examples/s]

Map:   0%|          | 0/49100 [00:00<?, ? examples/s]

In [46]:
# Wire the model, the data pipeline and the metric callback into a seq2seq trainer.
trainer = Seq2SeqTrainer(
    # what to train and with which hyper-parameters
    model=model,
    args=training_args,
    # tokenization and batching
    tokenizer=tokenizer,
    data_collator=data_collator,
    # data splits
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # evaluation callback
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop configured in `training_args`.
trainer.train()

In [None]:
# Persist the fine-tuned model. NOTE: the directory name is spelled "saved_moodel";
# the inference cell below loads from the same string, so keep the two in sync.
trainer.save_model("saved_moodel")

In [None]:
# Reload the fine-tuned tokenizer and model from disk and tokenize one prefixed sample.
checkp = 'saved_moodel'
text = prefix + "you can t talk to these old ass ladies like that"

tokenizer = AutoTokenizer.from_pretrained(checkp)
model = AutoModelForSeq2SeqLM.from_pretrained(checkp)

# PyTorch tensor of input ids for the single prompt.
inputs = tokenizer(text, return_tensors="pt").input_ids

In [None]:
# Sample one candidate rewrite (top-k / top-p sampling, at most 40 new tokens).
# Sampling is stochastic, so the output can differ between runs.
outputs = model.generate(
    inputs,
    do_sample=True,
    top_k=30,
    top_p=0.95,
    max_new_tokens=40,
)

# Decode the first (and only) generated sequence back to text for display.
tokenizer.decode(outputs[0], skip_special_tokens=True)

# Visualization