In [None]:
# Mount Google Drive at /content/drive; the dataset paths defined in this notebook
# all live below that mount point, so this cell must run first (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the third-party dependencies used by this notebook.
# `%pip` (instead of `!pip`) installs into the environment of the running kernel,
# whereas `!pip` runs whatever `pip` the subshell finds first on PATH.
%pip install wandb simpletransformers nlpaug

# Handle imports
import pandas as pd
from tqdm import tqdm
from simpletransformers.classification import ClassificationModel
from simpletransformers.config.model_args import T5Args, ClassificationArgs
from simpletransformers.t5 import T5Model
from sklearn import metrics
from sklearn.model_selection import train_test_split
import nlpaug.augmenter.word as naw
import wandb

# Register tqdm with pandas so that `Series.progress_apply` (used by the
# augmentation cells) displays a progress bar
tqdm.pandas()

# See the assigned GPU
!nvidia-smi

# Project root on the mounted Google Drive and the data folder below it
BASE_PATH_PROJECT = '/content/drive/MyDrive/Colab Notebooks'
BASE_PATH_DATA = BASE_PATH_PROJECT + '/data'

# Original (non-augmented) datasets
_DATASET_DIR = BASE_PATH_DATA + '/dataset'
TRAINING_CSV = _DATASET_DIR + '/2022_hatespeech_dataset_train.csv'
TEST_CSV = _DATASET_DIR + '/2022_hatespeech_dataset_survey.csv'

# Augmented datasets: the augmentation output on its own, and the
# "train_*" files that combine the training split with augmented rows
_AUGMENTED_DIR = BASE_PATH_DATA + '/augmented dataset'
RT_TRANSLATED_CSV = _AUGMENTED_DIR + '/rt_translation.csv'
CONTEXTUAL_EMBEDDING_CSV = _AUGMENTED_DIR + '/contextual_embedding.csv'
TRAIN_CONTEXTUAL_EMBEDDING_CSV = _AUGMENTED_DIR + '/train_contextual_embedding.csv'
TRAIN_RT_TRANSLATION_TRAINING_CSV = _AUGMENTED_DIR + '/train_rt_translation.csv'
TRAIN_CONTEXTUAL_EMBEDDING_RT_TRANSLATION_CSV = _AUGMENTED_DIR + '/train_contextual_embedding_rt_translation.csv'

# Value written into the `prefix` column of the T5-style data frames
T5_PREFIX = "binary classification"

# Read functions
def read_train_CSV():
    """Read the training CSV and return it in the prefix/input/target layout.

    The file at TRAINING_CSV is parsed as semicolon-separated, ISO-8859-1
    encoded text. Its first line is consumed as a header and the columns at
    positions 1, 2 and 3 are kept under the names ``input_text``,
    ``target_text`` and ``dataset``.

    Returns:
        pandas.DataFrame with the columns ``prefix`` (T5_PREFIX on every row),
        ``input_text`` (newlines replaced by spaces), ``target_text``
        (as parsed) and ``dataset`` (cast to str).
    """
    raw = pd.read_csv(
        TRAINING_CSV,
        sep=";",
        encoding="ISO-8859-1",
        header=0,
        usecols=[1, 2, 3],
        names=["input_text", "target_text", "dataset"],
    )

    # Keep each text on a single line
    single_line_text = raw["input_text"].str.replace('\n', ' ')

    columns = {
        'prefix': [T5_PREFIX] * len(raw),
        'input_text': single_line_text,
        'target_text': raw["target_text"],
        'dataset': raw["dataset"].astype(str),
    }
    return pd.DataFrame(columns)


def read_survey_CSV():
    """Read the survey (test) CSV and return it in the prefix/input/target layout.

    The file at TEST_CSV is parsed as semicolon-separated text with pandas'
    default encoding. The columns at positions 2, 3 and 4 are kept under the
    names ``input_text``, ``target_text`` and ``dataset``.

    NOTE(review): unlike read_train_CSV, no ``header=0`` is passed here, so
    pandas treats the first line of the file as data. Confirm that the survey
    file really has no header row.

    Returns:
        pandas.DataFrame with the columns ``prefix`` (T5_PREFIX on every row),
        ``input_text`` (newlines replaced by spaces), ``target_text``
        (as parsed) and ``dataset`` (cast to str).
    """
    raw = pd.read_csv(
        TEST_CSV,
        sep=";",
        usecols=[2, 3, 4],
        names=["input_text", "target_text", "dataset"],
    )

    # Keep each text on a single line
    single_line_text = raw["input_text"].str.replace('\n', ' ')

    columns = {
        'prefix': [T5_PREFIX] * len(raw),
        'input_text': single_line_text,
        'target_text': raw["target_text"],
        'dataset': raw["dataset"].astype(str),
    }
    return pd.DataFrame(columns)

def read_contextual_embedding_dataset():
    """Load the comma-separated file at CONTEXTUAL_EMBEDDING_CSV.

    Returns:
        pandas.DataFrame with the file's contents as parsed by pandas.
    """
    contextual_frame = pd.read_csv(filepath_or_buffer=CONTEXTUAL_EMBEDDING_CSV, sep=",")
    return contextual_frame


def read_rt_translation_dataset():
    """Load the comma-separated file at RT_TRANSLATED_CSV.

    Returns:
        pandas.DataFrame with the file's contents as parsed by pandas.
    """
    translated_frame = pd.read_csv(filepath_or_buffer=RT_TRANSLATED_CSV, sep=",")
    return translated_frame


def read_train_rt_translation_dataset():
    """Load the comma-separated file at TRAIN_RT_TRANSLATION_TRAINING_CSV.

    Returns:
        pandas.DataFrame with the file's contents as parsed by pandas.
    """
    combined_frame = pd.read_csv(filepath_or_buffer=TRAIN_RT_TRANSLATION_TRAINING_CSV, sep=",")
    return combined_frame


def read_train_contextual_embedding_dataset():
    """Load the comma-separated file at TRAIN_CONTEXTUAL_EMBEDDING_CSV.

    Returns:
        pandas.DataFrame with the file's contents as parsed by pandas.
    """
    combined_frame = pd.read_csv(filepath_or_buffer=TRAIN_CONTEXTUAL_EMBEDDING_CSV, sep=",")
    return combined_frame


def read_train_contextual_embedding_rt_translation_dataset():
    """Load the comma-separated file at TRAIN_CONTEXTUAL_EMBEDDING_RT_TRANSLATION_CSV.

    Returns:
        pandas.DataFrame with the file's contents as parsed by pandas.
    """
    combined_frame = pd.read_csv(
        filepath_or_buffer=TRAIN_CONTEXTUAL_EMBEDDING_RT_TRANSLATION_CSV,
        sep=",",
    )
    return combined_frame


# Log metrics
def log_test_metrics(y_true, y_pred, metric_prefix):
    """Log binary-classification test metrics to Weights & Biases.

    Logs the confusion-matrix cells (TP, TN, FP, FN), the accuracy and the
    F1 score, each under the key ``f"{metric_prefix}_<name>"``.

    Args:
        y_true: Iterable of ground-truth labels; every entry must be
            convertible with ``int()`` (e.g. ``0``/``1`` or ``"0"``/``"1"``).
        y_pred: Iterable of predicted labels, same length as ``y_true`` and
            convertible the same way.
        metric_prefix: String prepended to every logged metric name.

    Raises:
        AssertionError: If either sequence contains ``None`` or the two
            sequences differ in length.
    """
    # Materialise both inputs first: `None in <pandas Series>` would test the
    # index labels rather than the values, letting missing predictions through.
    y_true = list(y_true)
    y_pred = list(y_pred)
    assert not any(label is None for label in y_true), "None in y_true"
    assert not any(label is None for label in y_pred), "None in y_pred"
    assert len(y_pred) == len(y_true), "Unequal length of y_pred and y_true"
    y_true = [int(i) for i in y_true]
    y_pred = [int(i) for i in y_pred]

    # labels=[0, 1] forces a 2x2 matrix. Without it, data containing a single
    # class yields a 1x1 matrix and the four-way unpacking below raises.
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    wandb.log({f"{metric_prefix}_TP": tp})
    wandb.log({f"{metric_prefix}_TN": tn})
    wandb.log({f"{metric_prefix}_FP": fp})
    wandb.log({f"{metric_prefix}_FN": fn})

    accuracy = metrics.accuracy_score(y_true, y_pred)
    f1_score = metrics.f1_score(y_true, y_pred)
    wandb.log({f"{metric_prefix}_accuracy": accuracy})
    wandb.log({f"{metric_prefix}_f1-score": f1_score})

# Dataset statistics
def print_dataset_statistics(df, dataset_str, true_label_col="target_text", is_t5=False):
    """Print the size of a dataset and how many rows carry label 0 and label 1.

    Args:
        df: pandas.DataFrame to summarise.
        dataset_str: Human-readable dataset name used in the printed lines.
        true_label_col: Name of the column that holds the labels.
        is_t5: When True the labels are compared against the strings
            ``"0"``/``"1"``; otherwise against the integers ``0``/``1``.
    """
    print(f'Number of entries in {dataset_str}: {len(df)}')

    # Both label spellings format as "0"/"1", so one loop serves both modes.
    label_values = ("0", "1") if is_t5 else (0, 1)
    for label in label_values:
        matching_rows = df[df[true_label_col] == label]
        print(f'Number of {label}s in {dataset_str}: {len(matching_rows)}')


# Contextual word embedding

In [None]:
# Augment the complete 70% training split (same split parameters as the other
# cells) so the augmented file lines up row-for-row with the training data.
train_df = read_train_CSV()
train_df, _ = train_test_split(train_df, test_size=0.3, random_state=42, shuffle=True)

# Substitutes words using the contextual embeddings of the German BERT model
aug = naw.ContextualWordEmbsAug(
    model_path='deepset/gbert-base',
    action="substitute",
    device='cpu'
)


def _first_contextual_augmentation(augmenter, text):
    """Return a single augmented variant of `text` as a plain string.

    Recent nlpaug releases return a list from `augment()`; storing that list
    would end up in the CSV as "['...']". An empty result falls back to the
    unmodified text.
    """
    augmented = augmenter.augment(text)
    if isinstance(augmented, list):
        return augmented[0] if augmented else text
    return augmented


contextual_embedding_df = train_df.copy()
contextual_embedding_df['input_text'] = train_df['input_text'].progress_apply(
    lambda text: _first_contextual_augmentation(aug, text)
)

contextual_embedding_df.to_csv(CONTEXTUAL_EMBEDDING_CSV, index=False)


# Round-trip (RT) translation

In [None]:
# Work on the 70% training split (same split parameters as the other cells)
train_df = read_train_CSV()
train_df, _ = train_test_split(train_df, test_size=0.3, random_state=42, shuffle=True)

# Round-trip translation: German -> English -> German
aug = naw.BackTranslationAug(
    from_model_name='Helsinki-NLP/opus-mt-de-en',
    to_model_name='Helsinki-NLP/opus-mt-en-de',
    device='cpu'
)


def _first_back_translation(augmenter, text):
    """Return a single back-translated variant of `text` as a plain string.

    Recent nlpaug releases return a list from `augment()`; storing that list
    would end up in the CSV as "['...']". An empty result falls back to the
    unmodified text.
    """
    augmented = augmenter.augment(text)
    if isinstance(augmented, list):
        return augmented[0] if augmented else text
    return augmented


rt_translation = train_df.copy()
rt_translation['input_text'] = train_df['input_text'].progress_apply(
    lambda text: _first_back_translation(aug, text)
)

rt_translation.to_csv(RT_TRANSLATED_CSV, index=False)


# Create combined datasets

In [None]:
# Rebuild the 70% training split (same split parameters as the augmentation cells)
train_df = read_train_CSV()
train_df, _ = train_test_split(train_df, test_size=0.3, random_state=42, shuffle=True)

# Previously generated augmentation outputs
contextual_embedding_df = read_contextual_embedding_dataset()
rt_translation_df = read_rt_translation_dataset()

# ignore_index=True gives every combined frame a fresh 0..n-1 index. Without it
# the frames keep duplicated index labels, which makes label-based access on
# the in-memory frames ambiguous. The CSV output is unaffected because the
# index is not written.
train_contextual_embedding_df = pd.concat(
    [train_df, contextual_embedding_df], ignore_index=True
)
train_rt_translation_df = pd.concat(
    [train_df, rt_translation_df], ignore_index=True
)
train_contextual_embedding_rt_translation_df = pd.concat(
    [train_df, contextual_embedding_df, rt_translation_df], ignore_index=True
)


train_contextual_embedding_df.to_csv(TRAIN_CONTEXTUAL_EMBEDDING_CSV, index=False)
train_rt_translation_df.to_csv(TRAIN_RT_TRANSLATION_TRAINING_CSV, index=False)
train_contextual_embedding_rt_translation_df.to_csv(TRAIN_CONTEXTUAL_EMBEDDING_RT_TRANSLATION_CSV, index=False)
