In [None]:
# Mount Google Drive into the Colab filesystem; the data paths used by this
# notebook all live below /content/drive.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

In [None]:
!pip install wandb simpletransformers nlpaug

# Handle imports
import pandas as pd
from tqdm import tqdm
from simpletransformers.classification import ClassificationModel
from simpletransformers.config.model_args import T5Args, ClassificationArgs
from simpletransformers.t5 import T5Model
from sklearn import metrics
from sklearn.model_selection import train_test_split
import nlpaug.augmenter.word as naw
import wandb

# Register tqdm's pandas integration (adds the progress_* variants of apply/map)
# so long-running pandas operations can show a progress bar.
tqdm.pandas()

# Print the nvidia-smi report to see which GPU this runtime was assigned.
!nvidia-smi

# Define constants
# Project root on the mounted Google Drive; every data path below hangs off it.
BASE_PATH_PROJECT = '/content/drive/MyDrive/Colab Notebooks'
BASE_PATH_DATA = BASE_PATH_PROJECT + '/data'

# The two sub-folders of the data directory that hold CSV files.
DATASET_DIR = BASE_PATH_DATA + '/dataset'
AUGMENTED_DATASET_DIR = BASE_PATH_DATA + '/augmented dataset'

# Original CSV files.
TRAINING_CSV = DATASET_DIR + '/2022_hatespeech_dataset_train.csv'
TEST_CSV = DATASET_DIR + '/2022_hatespeech_dataset_survey.csv'

# Augmented CSV files.
RT_TRANSLATED_CSV = AUGMENTED_DATASET_DIR + '/rt_translation.csv'
CONTEXTUAL_EMBEDDING_CSV = AUGMENTED_DATASET_DIR + '/contextual_embedding.csv'
TRAIN_CONTEXTUAL_EMBEDDING_CSV = AUGMENTED_DATASET_DIR + '/train_contextual_embedding.csv'
TRAIN_RT_TRANSLATION_TRAINING_CSV = AUGMENTED_DATASET_DIR + '/train_rt_translation.csv'
TRAIN_CONTEXTUAL_EMBEDDING_RT_TRANSLATION_CSV = (
    AUGMENTED_DATASET_DIR + '/train_contextual_embedding_rt_translation.csv'
)

# Value written into the `prefix` column by read_train_CSV / read_survey_CSV.
T5_PREFIX = "binary classification"

# Read functions
def read_train_CSV():
    """Load the training CSV and arrange it into a four-column frame.

    Reads ``TRAINING_CSV`` as a semicolon-separated, ISO-8859-1 encoded file.
    The file's own header row is replaced, and only the columns at positions
    1, 2 and 3 are kept, named ``input_text``, ``target_text`` and ``dataset``.

    Returns:
        pandas.DataFrame: columns ``prefix`` (``T5_PREFIX`` on every row),
        ``input_text`` (newline characters replaced by spaces),
        ``target_text`` (as read) and ``dataset`` (cast to ``str``).
    """
    column_names = ["input_text", "target_text", "dataset"]
    raw = pd.read_csv(
        TRAINING_CSV,
        sep=";",
        encoding="ISO-8859-1",
        header=0,
        usecols=[1, 2, 3],
        names=column_names,
    )

    # Collapse multi-line texts onto a single line.
    single_line_text = raw["input_text"].str.replace('\n', ' ')

    return pd.DataFrame({
        'prefix': [T5_PREFIX] * len(raw),
        'input_text': single_line_text,
        'target_text': raw["target_text"],
        'dataset': raw["dataset"].astype(str),
    })


def read_survey_CSV():
    """Load the survey CSV and arrange it into a four-column frame.

    Reads ``TEST_CSV`` as a semicolon-separated file with pandas' default
    encoding. No header row is consumed (column names are supplied here), and
    only the columns at positions 2, 3 and 4 are kept, named ``input_text``,
    ``target_text`` and ``dataset``.

    Returns:
        pandas.DataFrame: columns ``prefix`` (``T5_PREFIX`` on every row),
        ``input_text`` (newline characters replaced by spaces),
        ``target_text`` (as read) and ``dataset`` (cast to ``str``).
    """
    column_names = ["input_text", "target_text", "dataset"]
    raw = pd.read_csv(TEST_CSV, sep=";", usecols=[2, 3, 4], names=column_names)

    # Collapse multi-line texts onto a single line.
    single_line_text = raw["input_text"].str.replace('\n', ' ')

    return pd.DataFrame({
        'prefix': [T5_PREFIX] * len(raw),
        'input_text': single_line_text,
        'target_text': raw["target_text"],
        'dataset': raw["dataset"].astype(str),
    })

def read_contextual_embedding_dataset():
    """Read the comma-separated file at ``CONTEXTUAL_EMBEDDING_CSV``.

    Returns:
        pandas.DataFrame: the file parsed with pandas' default options
        (first row used as the header).
    """
    augmented_df = pd.read_csv(CONTEXTUAL_EMBEDDING_CSV, sep=",")
    return augmented_df


def read_rt_translation_dataset():
    """Read the comma-separated file at ``RT_TRANSLATED_CSV``.

    Returns:
        pandas.DataFrame: the file parsed with pandas' default options
        (first row used as the header).
    """
    augmented_df = pd.read_csv(RT_TRANSLATED_CSV, sep=",")
    return augmented_df


def read_train_rt_translation_dataset():
    """Read the comma-separated file at ``TRAIN_RT_TRANSLATION_TRAINING_CSV``.

    Returns:
        pandas.DataFrame: the file parsed with pandas' default options
        (first row used as the header).
    """
    augmented_df = pd.read_csv(TRAIN_RT_TRANSLATION_TRAINING_CSV, sep=",")
    return augmented_df


def read_train_contextual_embedding_dataset():
    """Read the comma-separated file at ``TRAIN_CONTEXTUAL_EMBEDDING_CSV``.

    Returns:
        pandas.DataFrame: the file parsed with pandas' default options
        (first row used as the header).
    """
    augmented_df = pd.read_csv(TRAIN_CONTEXTUAL_EMBEDDING_CSV, sep=",")
    return augmented_df


def read_train_contextual_embedding_rt_translation_dataset():
    """Read the comma-separated file at ``TRAIN_CONTEXTUAL_EMBEDDING_RT_TRANSLATION_CSV``.

    Returns:
        pandas.DataFrame: the file parsed with pandas' default options
        (first row used as the header).
    """
    augmented_df = pd.read_csv(TRAIN_CONTEXTUAL_EMBEDDING_RT_TRANSLATION_CSV, sep=",")
    return augmented_df


# Log metrics
def log_test_metrics(y_true, y_pred, metric_prefix):
    """Compute binary-classification metrics and log them to the active wandb run.

    Args:
        y_true: Sequence of gold labels; every item must be convertible with ``int()``.
        y_pred: Sequence of predicted labels, same length as ``y_true``.
        metric_prefix: String prepended to every logged key (e.g. ``"test"``).

    Raises:
        AssertionError: If either sequence contains ``None`` or the lengths differ.

    Logs the keys ``<prefix>_TP``, ``<prefix>_TN``, ``<prefix>_FP``,
    ``<prefix>_FN``, ``<prefix>_accuracy`` and ``<prefix>_f1-score``.
    """
    assert None not in y_true, "None in y_true"
    assert None not in y_pred, "None in y_pred"
    assert len(y_pred) == len(y_true), "Unequal length of y_pred and y_true"
    y_true = [int(i) for i in y_true]
    y_pred = [int(i) for i in y_pred]

    # labels=[0, 1] pins the matrix to 2x2. Without it, a batch in which only
    # one class occurs yields a 1x1 matrix and the four-way unpacking fails.
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    accuracy = metrics.accuracy_score(y_true, y_pred)
    f1_score = metrics.f1_score(y_true, y_pred)

    # One log call so that all metrics of this evaluation share a wandb step.
    wandb.log({
        f"{metric_prefix}_TP": int(tp),
        f"{metric_prefix}_TN": int(tn),
        f"{metric_prefix}_FP": int(fp),
        f"{metric_prefix}_FN": int(fn),
        f"{metric_prefix}_accuracy": accuracy,
        f"{metric_prefix}_f1-score": f1_score,
    })

# Dataset statistics
def print_dataset_statistics(df, dataset_str, true_label_col="target_text", is_t5=False):
    """Print the row count and the number of 0 / 1 labels of a dataset.

    Args:
        df: DataFrame holding the dataset.
        dataset_str: Name of the dataset, used in the printed lines.
        true_label_col: Column of ``df`` that holds the labels.
        is_t5: When true, labels are compared against the strings ``"0"`` and
            ``"1"``; otherwise against the integers ``0`` and ``1``.
    """
    print(f'Number of entries in {dataset_str}: {len(df)}')

    # Pick the representation of the two classes once, then count each of them.
    zero_value, one_value = ("0", "1") if is_t5 else (0, 1)
    labels = df[true_label_col]
    for digit, label_value in (("0", zero_value), ("1", one_value)):
        matching_rows = df[labels == label_value]
        print(f'Number of {digit}s in {dataset_str}: {len(matching_rows)}')


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.12.19-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 27.7 MB/s 
[?25hCollecting simpletransformers
  Downloading simpletransformers-0.63.7-py3-none-any.whl (249 kB)
[K     |████████████████████████████████| 249 kB 66.0 MB/s 
[?25hCollecting nlpaug
  Downloading nlpaug-1.1.10-py3-none-any.whl (410 kB)
[K     |████████████████████████████████| 410 kB 71.6 MB/s 
[?25hCollecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.6.0-py2.py3-none-any.whl (145 kB)
[K     |████████████████████████████████| 145 kB 68.8 MB/s 
[?25hCollecting GitPython>=1.0.0
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 67.6 MB/s 
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.g

Thu Jun 23 10:50:04 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Grid search: every combination of the listed values is run once
# (3 epoch settings x 1 batch size x 3 learning rates).
sweep_config = {
    "method": "grid",
    "metric": {"name": "train_loss", "goal": "minimize"},
    "parameters": {
        "num_train_epochs": {"values": [2, 3, 5]},
        "train_batch_size": {"values": [16]},
        "learning_rate": {"values": [2e-5, 3e-5, 5e-5]},
    },
}

# Create the sweep explicitly in the project that train() passes to wandb.init,
# instead of relying on whatever default project the wandb environment holds.
sweep_id = wandb.sweep(sweep_config, project="German HSD")

# The read functions return `input_text` / `target_text`; the frames used for
# training and prediction below carry them as `text` / `labels`.
column_map = {'input_text': 'text', 'target_text': 'labels'}

# Hold out 30 % of the training file, then split that hold-out 80/20 into
# the evaluation set and the test set. Both splits use fixed seeds.
full_train_df = read_train_CSV().rename(columns=column_map)
train_df, holdout_df = train_test_split(full_train_df, test_size=0.3, random_state=42, shuffle=True)
eval_df, test_df = train_test_split(holdout_df, test_size=0.2, random_state=45, shuffle=True)
for split_df, split_name in ((train_df, 'train'), (eval_df, 'eval'), (test_df, 'test')):
    print_dataset_statistics(split_df, split_name, "labels")

# The survey file is kept whole and only used for prediction.
survey_df = read_survey_CSV().rename(columns=column_map)
print_dataset_statistics(survey_df, 'survey', "labels")

# Shared training arguments for every sweep trial. The options are applied one
# by one, in this order, on top of the ClassificationArgs defaults.
model_args = ClassificationArgs()
for option_name, option_value in {
    # Early stopping on the MCC evaluation metric (higher is better).
    "use_early_stopping": True,
    "early_stopping_delta": 0.01,
    "early_stopping_metric": "mcc",
    "early_stopping_metric_minimize": False,
    "early_stopping_patience": 5,
    # Evaluate on eval_df every 500 training steps.
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 500,
    # Checkpointing and caching behaviour.
    "save_eval_checkpoints": False,
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "no_save": True,
    "use_multiprocessing": True,
    "save_model_every_epoch": False,
    "save_steps": -1,
}.items():
    setattr(model_args, option_name, option_value)

def train():
    """Run one sweep trial: fine-tune the classifier, then log test and survey metrics.

    Starts a wandb run in the "German HSD" project, builds a BERT
    ``ClassificationModel`` from ``dbmdz/bert-base-german-uncased`` with the
    module-level ``model_args`` and the hyper-parameters in ``wandb.config``,
    trains on ``train_df`` while evaluating on ``eval_df``, and finally logs
    metrics for ``test_df`` and ``survey_df`` via ``log_test_metrics``.

    Takes no arguments and returns nothing, as required for a function handed
    to ``wandb.agent``.
    """
    wandb.init(project="German HSD")

    model = ClassificationModel(
        "bert",
        "dbmdz/bert-base-german-uncased",
        args=model_args,
        use_cuda=True,
        sweep_config=wandb.config
    )

    # Extra keyword arguments are additional metrics computed during evaluation.
    model.train_model(
        train_df,
        eval_df=eval_df,
        accuracy=metrics.accuracy_score,
        f1_score=metrics.f1_score
    )

    # NOTE(review): model_args.no_save is set to True, so presumably no model
    # files are written to ./outputs/best_model and the glob below may match
    # nothing -- confirm whether uploading the best model is still intended.
    model.save_model()
    wandb.save("./outputs/best_model/*")

    test_y_pred, _ = model.predict(test_df['text'].tolist())
    log_test_metrics(y_true=test_df['labels'].tolist(), y_pred=test_y_pred, metric_prefix="test")

    survey_y_pred, _ = model.predict(survey_df['text'].tolist())
    log_test_metrics(y_true=survey_df['labels'].tolist(), y_pred=survey_y_pred, metric_prefix="survey")

    # wandb.finish() is the current name of the deprecated wandb.join() and is
    # what the rest of this notebook already uses to close a run.
    wandb.finish()


# Launch the sweep agent, which calls train() once per trial.
wandb.agent(sweep_id, function=train)

# Close a run that is still open after the agent has returned.
leftover_run = wandb.run
if leftover_run is not None:
    wandb.finish()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Create sweep with ID: 8l7pc221
Sweep URL: https://wandb.ai/rove271116/German%20HSD/sweeps/8l7pc221
Number of entries in train: 63162
Number of 0s in train: 24100
Number of 1s in train: 39062
Number of entries in eval: 10828
Number of 0s in eval: 4112
Number of 1s in eval: 6716
Number of entries in test: 2707
Number of 0s in test: 1074
Number of 1s in test: 1633
Number of entries in survey: 15
Number of 0s in survey: 7
Number of 1s in survey: 8


[34m[1mwandb[0m: Agent Starting Run: 3wbcnhgp with config:
[34m[1mwandb[0m: 	learning_rate: 2e-05
[34m[1mwandb[0m: 	num_train_epochs: 3
[34m[1mwandb[0m: 	train_batch_size: 16
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrove271116[0m. Use [1m`wandb login --relogin`[0m to force relogin


Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-base-german-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242k [00:00<?, ?B/s]

  0%|          | 0/63162 [00:00<?, ?it/s]



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/3948 [00:00<?, ?it/s]



  0%|          | 0/10828 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/3948 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/3948 [00:00<?, ?it/s]

  0%|          | 0/10828 [00:00<?, ?it/s]



  0%|          | 0/2707 [00:00<?, ?it/s]

  0%|          | 0/339 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

VBox(children=(Label(value='0.249 MB of 0.249 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Training loss,█▂▅▅▃▆▅▆▇▆▃▆▄▆▂▄▆▆▄▂▄▄▂▂▃▄▃▂▄▃▄▃▄▅▁▄▂▃▂▁
accuracy,▄▅█▅██▁▅
auprc,▁▄█▅▇█▇▂
auroc,▁▄█▅▇▇▇▂
eval_loss,▁▁▁▃▃▃▃█
f1_score,▁▅█▆██▅▆
fn,█▅▆▅▅▆▁▄
fp,▁▃▂▄▃▂█▅
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
lr,▁▃▆████▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂

0,1
Training loss,0.00378
accuracy,0.85768
auprc,0.96544
auroc,0.93202
eval_loss,0.50599
f1_score,0.88225
fn,943.0
fp,598.0
global_step,9000.0
lr,1e-05


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.
