# Data preparation


This notebook contains all the code necessary to prepare the different datasets and sources. The individual preprocessing steps are explained in more detail below.


In [1]:
import sys

# The search path must be extended *before* the project-local imports below are executed;
# appending afterwards has no effect on them when the notebook is run on a fresh kernel.
# NOTE(review): presumably `config` and `util` live in ../common -- confirm.
# The membership check keeps re-running this cell from adding duplicate entries.
if "../common" not in sys.path:
    sys.path.append("../common")

from os import getcwd, path
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from config import SBIC_OFFENSIVENESS_THRESHOLD, SBIC_SEXUAL_THRESHOLD, RANDOM_SEED
from util import preprocess_twitter_texts


In [2]:
# The project root is the parent of the current working directory; the raw input data and the
# intermediate (prepared) data live in sibling folders directly below it.
parent_path = Path(path.abspath("")).parent
DATA_DIR, INTERMEDIATE_DIR = (
    path.join(parent_path, folder_name) for folder_name in ("data", "intermediate")
)


## Social Bias Frames Corpus

Also referred to as _SBIC_ throughout the codebase. The corpus can be retrieved from https://maartensap.com/social-bias-frames/ (in case the link is not reachable anymore, the data can also be retrieved from the internet archive: https://web.archive.org/web/20230523165906/https://maartensap.com/social-bias-frames/ ). We decided to use the "aggregated by post" data, as we don't need the annotations of the individual annotators. This version is furthermore already deduplicated.

The code expects the following files from the corpus in the following directories:

- `SBIC.v2.agg.trn.csv` in `data/common/`
- `SBIC.v2.agg.dev.csv` in `data/common/`
- `SBIC.v2.agg.tst.csv` in `data/common/`


In [3]:
# Read the three aggregated SBIC splits (train / dev / test) from the common data folder
sbic_source_dir = path.join(DATA_DIR, "common")
sbic_train = pd.read_csv(path.join(sbic_source_dir, "SBIC.v2.agg.trn.csv"))
sbic_val = pd.read_csv(path.join(sbic_source_dir, "SBIC.v2.agg.dev.csv"))
sbic_test = pd.read_csv(path.join(sbic_source_dir, "SBIC.v2.agg.tst.csv"))

# Keep only the columns needed later on; the "Unnamed: 0" column is exposed as "post_id"
columns_of_interest = ["post", "sexYN", "offensiveYN", "intentYN", "Unnamed: 0", "dataSource"]
id_column_renaming = {"Unnamed: 0": "post_id"}
sbic_train = sbic_train.loc[:, columns_of_interest].rename(columns=id_column_renaming)
sbic_val = sbic_val.loc[:, columns_of_interest].rename(columns=id_column_renaming)
sbic_test = sbic_test.loc[:, columns_of_interest].rename(columns=id_column_renaming)


In [4]:
def infer_social_bias_labels(df, sexual_threshold=None, offensiveness_threshold=None):
    """Transform the continuous SBIC annotation scores into binary labels.

    Three integer columns are added to ``df``:

    - ``lewdness_label``: 1 if ``sexYN`` is greater than or equal to the sexual threshold, else 0
    - ``offensiveness_label``: 1 if ``offensiveYN`` is greater than or equal to the offensiveness
      threshold, else 0
    - ``social_bias_label``: 1 if at least one of the two labels above is 1, else 0

    Missing scores result in a label of 0, because the threshold comparison evaluates to False.

    Args:
        df: Dataframe providing the columns ``sexYN`` and ``offensiveYN``.
        sexual_threshold: Optional override for ``SBIC_SEXUAL_THRESHOLD``.
        offensiveness_threshold: Optional override for ``SBIC_OFFENSIVENESS_THRESHOLD``.

    This function modifies the provided dataframe inplace, so no return value is given.
    """
    # Fall back to the project-wide configuration when no explicit threshold is passed
    if sexual_threshold is None:
        sexual_threshold = SBIC_SEXUAL_THRESHOLD
    if offensiveness_threshold is None:
        offensiveness_threshold = SBIC_OFFENSIVENESS_THRESHOLD

    df["lewdness_label"] = np.where(df["sexYN"] >= sexual_threshold, 1, 0)
    df["offensiveness_label"] = np.where(df["offensiveYN"] >= offensiveness_threshold, 1, 0)
    # Vectorized combination instead of a row-wise `apply`: considerably faster on large frames
    # and it also works for an empty dataframe
    df["social_bias_label"] = np.where(
        (df["lewdness_label"] + df["offensiveness_label"]) > 0, 1, 0
    )


# Run the shared Twitter text preprocessing on every split, requesting "post_preprocessed" as
# the target text column
sbic_preprocessing_options = {"target_text_colum": "post_preprocessed"}
sbic_train_prep = preprocess_twitter_texts(sbic_train, **sbic_preprocessing_options)
sbic_val_prep = preprocess_twitter_texts(sbic_val, **sbic_preprocessing_options)
sbic_test_prep = preprocess_twitter_texts(sbic_test, **sbic_preprocessing_options)

# Derive the binary label columns; the helper adds them to each dataframe inplace
for sbic_split_prep in (sbic_train_prep, sbic_val_prep, sbic_test_prep):
    infer_social_bias_labels(sbic_split_prep)


In [5]:
# Reduce every split to the columns that the later tasks actually consume
sbic_output_columns = [
    "post_id",
    "post_preprocessed",
    "social_bias_label",
    "offensiveness_label",
    "lewdness_label",
]
sbic_train_prep = sbic_train_prep[sbic_output_columns].copy()
sbic_val_prep = sbic_val_prep[sbic_output_columns].copy()
sbic_test_prep = sbic_test_prep[sbic_output_columns].copy()

# Clean up the training data: remove posts that are empty after preprocessing as well as
# hand-selected examples (identified by their post ID) that don't meet our minimum requirements
is_empty_post = sbic_train_prep["post_preprocessed"].str.len() == 0
is_excluded_post = sbic_train_prep["post_id"].isin([1, 35493, 35495])
sbic_train_prep = sbic_train_prep.drop(
    index=sbic_train_prep[is_empty_post | is_excluded_post].index
)


In [6]:
# Write final prepared data to file
sbic_output_dir = path.join(INTERMEDIATE_DIR, "common")

# `DataFrame.to_csv` does not create missing parent directories, so make sure that the target
# folder exists (e.g. when running the notebook on a fresh checkout)
Path(sbic_output_dir).mkdir(parents=True, exist_ok=True)

sbic_train_prep.to_csv(path.join(sbic_output_dir, "sbic-train-prep.csv"), sep=",", index=False)
sbic_val_prep.to_csv(path.join(sbic_output_dir, "sbic-val-prep.csv"), sep=",", index=False)
sbic_test_prep.to_csv(path.join(sbic_output_dir, "sbic-test-prep.csv"), sep=",", index=False)


## TwitterAAE corpus

The corpus can be retrieved from http://slanglab.cs.umass.edu/TwitterAAE/. It is later used to train an AAE dialect classifier, which is in turn used to annotate the SBIC data for containing AAE dialect.

In order to create a balanced dataset, we randomly subsample the non-AAE Twitter posts to the size of the AAE posts data.

The code expects the following files from the corpus in the following directories:

- `twitteraae_all_aa` in `data/common/`
- `twitteraae_all` in `data/common/`


In [7]:
# Both TwitterAAE files are tab-separated, use a backslash as escape character and are read
# with the same explicitly provided column names
twitter_column_names = [
    "post_id",
    "timestamp",
    "user_id",
    "location",
    "census_blk_group",
    "post",
    "demograpic1_inference",
    "demograpic2_inference",
    "demograpic3_inference",
    "demograpic4_inference",
]
twitter_read_options = {"sep": "\t", "escapechar": "\\", "names": twitter_column_names}

# Load TwitterAAE data from file (complete corpus first, then the AAE-only subset)
twitter_data_full = pd.read_csv(
    path.join(DATA_DIR, "common", "twitteraae_all"), **twitter_read_options
)
twitter_data_aae = pd.read_csv(
    path.join(DATA_DIR, "common", "twitteraae_all_aa"), **twitter_read_options
)

# The full dataset is a superset of the AAE dataset, so every post whose ID also occurs in the
# AAE dataset is removed to obtain the non-AAE posts
is_aae_post = twitter_data_full["post_id"].isin(twitter_data_aae["post_id"])
twitter_data_no_aae = twitter_data_full[~is_aae_post].copy()

# Only the post ID and the post text are needed from here on
columns_of_interest = ["post_id", "post"]
twitter_data_aae = twitter_data_aae[columns_of_interest]
twitter_data_no_aae = twitter_data_no_aae[columns_of_interest]


In [8]:
# Run the shared Twitter text preprocessing on both groups of posts
twitter_data_aae_prep = preprocess_twitter_texts(twitter_data_aae)
twitter_data_no_aae_prep = preprocess_twitter_texts(twitter_data_no_aae)

# Discard samples with missing values first ...
twitter_data_aae_prep.dropna(inplace=True)
twitter_data_no_aae_prep.dropna(inplace=True)

# ... and afterwards the samples whose preprocessed text ended up being an empty string
aae_has_text = twitter_data_aae_prep["post_preprocessed"].ne("")
no_aae_has_text = twitter_data_no_aae_prep["post_preprocessed"].ne("")
twitter_data_aae_prep = twitter_data_aae_prep.loc[aae_has_text]
twitter_data_no_aae_prep = twitter_data_no_aae_prep.loc[no_aae_has_text]


In [9]:
# Balance the dataset: randomly draw (without replacement, with a fixed seed) as many non-AAE
# posts as there are AAE posts
number_of_aae_posts = len(twitter_data_aae_prep)
twitter_data_no_aae_prep_sampled = twitter_data_no_aae_prep.sample(
    n=number_of_aae_posts,
    random_state=RANDOM_SEED,
    axis="index",
)


In [10]:
# Merge AAE and non AAE posts into a single dataframe. Without an explicit `on` argument the
# outer merge joins on all columns the two frames have in common.
twitter_data_all_labels = twitter_data_aae_prep.merge(twitter_data_no_aae_prep_sampled, how="outer")

# A post is labelled as AAE (1) if its ID occurs in the AAE dataset, otherwise as non-AAE (0).
# `isin` performs a hashed membership test instead of scanning the complete AAE ID array once
# per row, which makes a large difference for corpora of this size.
twitter_data_all_labels["aae_dialect_label"] = (
    twitter_data_all_labels["post_id"].isin(twitter_data_aae["post_id"]).astype("int64")
)

# Only select the columns of interest
twitter_data_all_labels = twitter_data_all_labels[
    ["post_id", "post_preprocessed", "aae_dialect_label"]
]


In [11]:
# Create stratified data splits for training and evaluation in two steps:
# 1. hold out 20% of all samples as test set
# 2. split the remaining 80% once more into 80% training and 20% validation data
# Both steps are stratified by the dialect label and use the same fixed seed.
twitter_data_train_and_val, twitter_data_test = train_test_split(
    twitter_data_all_labels,
    train_size=0.8,
    random_state=RANDOM_SEED,
    stratify=twitter_data_all_labels["aae_dialect_label"],
)
twitter_data_train, twitter_data_val = train_test_split(
    twitter_data_train_and_val,
    train_size=0.8,
    random_state=RANDOM_SEED,
    stratify=twitter_data_train_and_val["aae_dialect_label"],
)


In [12]:
# Write final prepared data to file
twitter_output_dir = path.join(INTERMEDIATE_DIR, "common")

# `DataFrame.to_csv` does not create missing parent directories, so make sure that the target
# folder exists (e.g. when running the notebook on a fresh checkout)
Path(twitter_output_dir).mkdir(parents=True, exist_ok=True)

twitter_data_train.to_csv(
    path.join(twitter_output_dir, "twitteraae-train-prep.csv"), sep=",", index=False
)
twitter_data_val.to_csv(
    path.join(twitter_output_dir, "twitteraae-val-prep.csv"), sep=",", index=False
)
twitter_data_test.to_csv(
    path.join(twitter_output_dir, "twitteraae-test-prep.csv"), sep=",", index=False
)
