In [None]:
# Mount Google Drive at /content/drive; the dataset paths defined later in this
# notebook all live below that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install wandb simpletransformers nlpaug

# Handle imports
import pandas as pd
from tqdm import tqdm
from simpletransformers.classification import ClassificationModel
from simpletransformers.config.model_args import T5Args, ClassificationArgs
from simpletransformers.t5 import T5Model
from sklearn import metrics
from sklearn.model_selection import train_test_split
import nlpaug.augmenter.word as naw
import wandb

# Enable tqdm's pandas integration (progress bars for pandas operations).
# NOTE(review): nothing in the visible cells appears to rely on it -- confirm
# before removing.
tqdm.pandas()

# Print the GPU assigned to this runtime (shell command run through IPython's `!`).
!nvidia-smi

# Define constants
# Folder layout on the mounted drive: project root -> data root -> dataset folders.
BASE_PATH_PROJECT = '/content/drive/MyDrive/Colab Notebooks'
BASE_PATH_DATA = BASE_PATH_PROJECT + '/data'
_DATASET_DIR = BASE_PATH_DATA + '/dataset'
_AUGMENTED_DIR = BASE_PATH_DATA + '/augmented dataset'

# CSV files in the "dataset" folder.
TRAINING_CSV = _DATASET_DIR + '/2022_hatespeech_dataset_train.csv'
TEST_CSV = _DATASET_DIR + '/2022_hatespeech_dataset_survey.csv'

# CSV files in the "augmented dataset" folder.
RT_TRANSLATED_CSV = _AUGMENTED_DIR + '/rt_translation.csv'
CONTEXTUAL_EMBEDDING_CSV = _AUGMENTED_DIR + '/contextual_embedding.csv'
TRAIN_CONTEXTUAL_EMBEDDING_CSV = _AUGMENTED_DIR + '/train_contextual_embedding.csv'
TRAIN_RT_TRANSLATION_TRAINING_CSV = _AUGMENTED_DIR + '/train_rt_translation.csv'
TRAIN_CONTEXTUAL_EMBEDDING_RT_TRANSLATION_CSV = (
    _AUGMENTED_DIR + '/train_contextual_embedding_rt_translation.csv'
)

# Task prefix used for the T5 "prefix" column and when building prediction inputs.
T5_PREFIX = "binary classification"

# Read functions
def read_train_CSV():
    """Load the training CSV and arrange it into the columns used for T5.

    Reads ``TRAINING_CSV`` as a ``;``-separated, ISO-8859-1 encoded file,
    keeping the columns at positions 1-3. ``header=0`` together with ``names``
    replaces the file's own header names with ``input_text``, ``target_text``
    and ``dataset``.

    Returns:
        pandas.DataFrame with the columns ``prefix`` (``T5_PREFIX`` on every
        row), ``input_text`` (newlines replaced by spaces), ``target_text``
        and ``dataset`` (cast to ``str``).
    """
    column_names = ["input_text", "target_text", "dataset"]
    raw = pd.read_csv(
        TRAINING_CSV,
        sep=";",
        encoding="ISO-8859-1",
        header=0,
        usecols=[1, 2, 3],
        names=column_names,
    )

    columns = {
        'prefix': [T5_PREFIX] * len(raw),
        'input_text': raw["input_text"].str.replace('\n', ' '),
        'target_text': raw["target_text"],
        'dataset': raw["dataset"].astype(str),
    }
    return pd.DataFrame(columns)


def read_survey_CSV():
    """Load the survey CSV and arrange it into the columns used for T5.

    Reads ``TEST_CSV`` as a ``;``-separated file with pandas' default
    encoding, keeping the columns at positions 2-4 and naming them
    ``input_text``, ``target_text`` and ``dataset``.

    NOTE(review): no ``header`` argument is passed, so with ``names`` given
    the first line of the file is read as data -- presumably the survey file
    has no header row; confirm.

    Returns:
        pandas.DataFrame with the columns ``prefix`` (``T5_PREFIX`` on every
        row), ``input_text`` (newlines replaced by spaces), ``target_text``
        and ``dataset`` (cast to ``str``).
    """
    column_names = ["input_text", "target_text", "dataset"]
    raw = pd.read_csv(TEST_CSV, sep=";", usecols=[2, 3, 4], names=column_names)

    columns = {
        'prefix': [T5_PREFIX] * len(raw),
        'input_text': raw["input_text"].str.replace('\n', ' '),
        'target_text': raw["target_text"],
        'dataset': raw["dataset"].astype(str),
    }
    return pd.DataFrame(columns)

def read_contextual_embedding_dataset():
    """Return the comma-separated CSV at ``CONTEXTUAL_EMBEDDING_CSV`` as a DataFrame."""
    contextual_df = pd.read_csv(CONTEXTUAL_EMBEDDING_CSV, sep=",")
    return contextual_df


def read_rt_translation_dataset():
    """Return the comma-separated CSV at ``RT_TRANSLATED_CSV`` as a DataFrame."""
    rt_translation_df = pd.read_csv(RT_TRANSLATED_CSV, sep=",")
    return rt_translation_df


def read_train_rt_translation_dataset():
    """Return the comma-separated CSV at ``TRAIN_RT_TRANSLATION_TRAINING_CSV`` as a DataFrame."""
    train_rt_df = pd.read_csv(TRAIN_RT_TRANSLATION_TRAINING_CSV, sep=",")
    return train_rt_df


def read_train_contextual_embedding_dataset():
    """Return the comma-separated CSV at ``TRAIN_CONTEXTUAL_EMBEDDING_CSV`` as a DataFrame."""
    train_contextual_df = pd.read_csv(TRAIN_CONTEXTUAL_EMBEDDING_CSV, sep=",")
    return train_contextual_df


def read_train_contextual_embedding_rt_translation_dataset():
    """Return the comma-separated CSV at ``TRAIN_CONTEXTUAL_EMBEDDING_RT_TRANSLATION_CSV`` as a DataFrame."""
    combined_df = pd.read_csv(TRAIN_CONTEXTUAL_EMBEDDING_RT_TRANSLATION_CSV, sep=",")
    return combined_df


# Log metrics
def log_test_metrics(y_true, y_pred, metric_prefix):
    """Log binary-classification metrics for one evaluation set to W&B.

    Logs the confusion-matrix counts, accuracy and F1 score under the keys
    ``{metric_prefix}_TP``, ``_TN``, ``_FP``, ``_FN``, ``_accuracy`` and
    ``_f1-score``.

    Args:
        y_true: Sequence of gold labels; every item must be convertible with
            ``int()``.
        y_pred: Sequence of predicted labels, same length as ``y_true``; every
            item must be convertible with ``int()``.
        metric_prefix: String prepended to each logged metric name.

    Raises:
        AssertionError: If either sequence contains ``None`` or the lengths
            differ.
        ValueError: If a label cannot be converted with ``int()``.
    """
    assert None not in y_true, "None in y_true"
    assert None not in y_pred, "None in y_pred"
    assert len(y_pred) == len(y_true), "Unequal length of y_pred and y_true"
    y_true = [int(label) for label in y_true]
    y_pred = [int(label) for label in y_pred]

    # Pin the label set so the matrix is always 2x2. Without `labels`, a set in
    # which only one class occurs yields a 1x1 matrix and the four-way
    # unpacking below raises.
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    accuracy = metrics.accuracy_score(y_true, y_pred)
    f1 = metrics.f1_score(y_true, y_pred)

    # One call so that all metrics of this evaluation set land on the same W&B step.
    wandb.log({
        f"{metric_prefix}_TP": tp,
        f"{metric_prefix}_TN": tn,
        f"{metric_prefix}_FP": fp,
        f"{metric_prefix}_FN": fn,
        f"{metric_prefix}_accuracy": accuracy,
        f"{metric_prefix}_f1-score": f1,
    })

# Dataset statistics
def print_dataset_statistics(df, dataset_str, true_label_col="target_text", is_t5=False):
    """Print the row count and the number of 0 / 1 labels of a dataset.

    Args:
        df: DataFrame holding the dataset.
        dataset_str: Name of the dataset, used in the printed lines.
        true_label_col: Name of the column that holds the labels.
        is_t5: When True the labels are compared against the strings ``"0"``
            and ``"1"``; otherwise against the integers ``0`` and ``1``.
    """
    print(f'Number of entries in {dataset_str}: {len(df)}')

    label_values = ("0", "1") if is_t5 else (0, 1)
    for digit, value in zip("01", label_values):
        matching_rows = df[df[true_label_col] == value]
        print(f'Number of {digit}s in {dataset_str}: {len(matching_rows)}')


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.12.19-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 7.2 MB/s 
[?25hCollecting simpletransformers
  Downloading simpletransformers-0.63.7-py3-none-any.whl (249 kB)
[K     |████████████████████████████████| 249 kB 71.1 MB/s 
[?25hCollecting nlpaug
  Downloading nlpaug-1.1.10-py3-none-any.whl (410 kB)
[K     |████████████████████████████████| 410 kB 73.3 MB/s 
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.9-py3-none-any.whl (9.4 kB)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 66.4 MB/s 
[?25hCollecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.6.0-py2.

Thu Jun 23 07:11:46 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
 sweep_config = {
    "method": "grid",
    "metric": {"name": "train_loss", "goal": "minimize"},
    "parameters": {
        "num_train_epochs": {"values": [2, 3, 5]},
        "train_batch_size": {"values": [16]},
        "learning_rate": {"values": [2e-5, 3e-5, 5e-5]},
    },
}

sweep_id = wandb.sweep(sweep_config)

# Load the training CSV as strings and carve it into train / eval / test:
# 70 % train, and the remaining 30 % is split again into eval (80 %) and test (20 %).
full_train_df = read_train_CSV().astype(str)
train_df, holdout_df = train_test_split(
    full_train_df, test_size=0.3, random_state=42, shuffle=True
)
eval_df, test_df = train_test_split(
    holdout_df, test_size=0.2, random_state=45, shuffle=True
)
for split_name, split_df in (('train', train_df), ('eval', eval_df), ('test', test_df)):
    print_dataset_statistics(split_df, split_name, is_t5=True)

# The survey CSV is loaded separately as a second evaluation set.
survey_df = read_survey_CSV().astype(str)
print_dataset_statistics(survey_df, 'survey', is_t5=True)

# T5 arguments passed to every model built in train().
# NOTE(review): train_batch_size also appears in sweep_config; presumably the
# sweep value takes precedence when the model is built -- confirm.
model_args = T5Args()
_fixed_settings = {
    "train_batch_size": 16,
    "eval_batch_size": 64,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 500,
    "save_eval_checkpoints": False,
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "no_save": True,
    "use_multiprocessing": False,
    "save_model_every_epoch": False,
    "save_steps": -1,
}
for _option, _value in _fixed_settings.items():
    setattr(model_args, _option, _value)

def train():
    """Run one sweep trial: train T5, then log test and survey metrics to W&B.

    Meant to be handed to ``wandb.agent``. Takes no arguments and reads the
    module-level ``model_args``, ``train_df``, ``eval_df``, ``test_df`` and
    ``survey_df``; ``wandb.config`` is passed to the model as ``sweep_config``.
    """
    wandb.init(project="German HSD test")

    model = T5Model(
        "t5",
        "t5-small",
        args=model_args,
        # NOTE(review): CUDA is disabled although this notebook inspects the
        # assigned GPU with nvidia-smi -- confirm that CPU training is intended.
        use_cuda=False,
        sweep_config=wandb.config,
    )

    model.train_model(
        train_df,
        eval_data=eval_df,
    )

    # NOTE(review): model_args.no_save is True, so it is unclear what (if
    # anything) is written by save_model() and matched by the glob below -- confirm.
    model.save_model()
    wandb.save("./outputs/best_model/*")

    # Same predict-and-log procedure for both evaluation sets; the split name
    # doubles as the metric prefix.
    for split_name, split_df in (("test", test_df), ("survey", survey_df)):
        to_predict = (T5_PREFIX + ': ' + split_df["input_text"]).tolist()
        preds = model.predict(to_predict)
        log_test_metrics(
            y_true=split_df["target_text"].tolist(),
            y_pred=preds,
            metric_prefix=split_name,
        )

    # wandb.join() is a deprecated alias; wandb.finish() is the call the rest
    # of this notebook already uses to close a run.
    wandb.finish()


# Start a sweep agent in this process, with train() as the function it runs
# for the sweep's jobs.
wandb.agent(sweep_id, train)

# Close any run that is still open once the agent has returned.
if wandb.run is not None:
    wandb.finish()


Create sweep with ID: ja6wkveu
Sweep URL: https://wandb.ai/rove271116/German%20HSD/sweeps/ja6wkveu
Number of entries in train: 94743
Number of 0s in train: 36189
Number of 1s in train: 58554
Number of entries in eval: 10828
Number of 0s in eval: 4112
Number of 1s in eval: 6716
Number of entries in test: 2707
Number of 0s in test: 1074
Number of 1s in test: 1633
Number of entries in survey: 15
Number of 0s in survey: 7
Number of 1s in survey: 8


[34m[1mwandb[0m: Agent Starting Run: 023w2ylb with config:
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	num_train_epochs: 3
[34m[1mwandb[0m: 	train_batch_size: 16
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


  0%|          | 0/94743 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/5922 [00:00<?, ?it/s]



  0%|          | 0/10828 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/5922 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/5922 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]



Generating outputs:   0%|          | 0/43 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/2707 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/1 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/15 [00:00<?, ?it/s]

VBox(children=(Label(value='0.269 MB of 0.269 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Training loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval_loss,█▅▄▅▄▃▂▂▂▂▂▂▁▁
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr,▁▄▇█████████████████████████████████████
survey_FN,▁
survey_FP,▁
survey_TN,▁
survey_TP,▁
survey_accuracy,▁
survey_f1-score,▁

0,1
Training loss,0.15203
eval_loss,0.1056
global_step,17766.0
lr,5e-05
survey_FN,1.0
survey_FP,4.0
survey_TN,3.0
survey_TP,7.0
survey_accuracy,0.66667
survey_f1-score,0.73684


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.
