In [1]:
import pandas as pd
import math
import os

def read_from_csv(data_name, header=0, names=None):
    """Load a delimited text file into a pandas DataFrame.

    The delimiter is chosen from the extension(s) of the file name: ``.tsv``
    files are read tab-separated and ``.csv`` files comma-separated. Only the
    final path component is inspected, so a directory whose name happens to
    contain "tsv" or "csv" does not influence how the file is parsed.

    Args:
        data_name: Path of the file to read.
        header: Row number to use as column names (forwarded to ``pd.read_csv``).
        names: Optional explicit column names (forwarded to ``pd.read_csv``).

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        NotImplementedError: If the file name carries neither a ``tsv`` nor a
            ``csv`` extension.
    """
    file_name = os.path.basename(str(data_name)).lower()
    # Everything after the first dot, so "x.tsv.gz" still counts as a TSV file.
    extensions = file_name.split(".")[1:]
    if "tsv" in extensions:
        separator = "\t"
    elif "csv" in extensions:
        separator = ","
    else:
        raise NotImplementedError("Given data file type is not supported yet.")
    return pd.read_csv(data_name,
                       sep=separator,
                       encoding="utf-8",
                       engine="python",
                       header=header,
                       names=names)

def print_data_info(data, split, label_col):
    """Print a one-line, tab-separated label summary of a data split.

    The line starts with the split name and the number of rows, followed by
    one ``label: count, percentage`` field per distinct value of
    ``label_col``, in ascending label order.

    Args:
        data: DataFrame holding the split.
        split: Name printed at the start of the line.
        label_col: Name of the column whose values are counted.
    """
    total = len(data)
    counts_by_label = data[label_col].value_counts().to_dict()
    fields = [f"{split}\t{total}"]
    for label, count in sorted(counts_by_label.items()):
        share = "{:.1%}".format(count / total)
        fields.append(f"\t{label}: {count}, " + share)
    print("".join(fields))

In [8]:
# AMI
# https://live.european-language-grid.eu/catalogue/corpus/7272/download/
data_name = "ami"
train_data = read_from_csv(f"./{data_name}/train.tsv")
test_data = read_from_csv(f"./{data_name}/test.tsv")
print('AMI\nMulti - {"discredit": 0, "stereotype": 1, "dominance": 2, "sexual harassment": 3, "derailing": 4}')
category2index = {"discredit": 0, "stereotype": 1, "dominance": 2, "sexual_harassment": 3, "derailing": 4}

# Keep only the rows whose "misogynous" flag is non-zero and encode their
# category string as an integer label (train split).
train_is_misogynous = train_data["misogynous"] != 0
train_misogynous = train_data.loc[train_is_misogynous].reset_index(drop=True)
train_misogynous["label_multi"] = train_misogynous["misogyny_category"].map(category2index)
# train_misogynous.to_csv("./ami/train_clean.csv", index=False)
# dict(train_misogynous.label_multi.value_counts()) # {0: 1014, 3: 352, 1: 179, 2: 148, 4: 92}

# Same filtering and encoding for the test split.
test_is_misogynous = test_data["misogynous"] != 0
test_misogynous = test_data.loc[test_is_misogynous].reset_index(drop=True)
test_misogynous["label_multi"] = test_misogynous["misogyny_category"].map(category2index)
# test_misogynous.to_csv("./ami/test_clean.csv", index=False)
# dict(test_misogynous.label_multi.value_counts()) # {0: 141, 1: 140, 2: 124, 3: 44, 4: 11}

# Scaled inverse class frequencies, rounded to one decimal.
# lc = [760, 134, 111, 264, 69] # train
# {label: round((1/count)*100, 1) for label, count in enumerate(lc)} # {0: 0.1, 1: 0.7, 2: 0.9, 3: 0.4, 4: 1.4}
lc = {0: 141, 1: 140, 2: 124, 3: 44, 4: 11} # test
{label: round((1 / lc[label]) * 10, 1) for label in lc} # {0: 0.1, 1: 0.1, 2: 0.1, 3: 0.2, 4: 0.9}

AMI
Multi - {"discredit": 0, "stereotype": 1, "dominance": 2, "sexual harassment": 3, "derailing": 4}


{0: 0.1, 1: 0.1, 2: 0.1, 3: 0.2, 4: 0.9}

In [None]:
# Bretschneider-TH: bin - 0: neutral, 1: harassment
# Tweet texts: drop incomplete rows and exact duplicates before joining labels.
merged_data = read_from_csv("./bretschneider-th/merged_data.csv")
merged_data = merged_data.dropna().reset_index(drop=True)
merged_data = merged_data.drop_duplicates().reset_index(drop=True)

# Raw label file, kept under its own name because the majority vote below
# needs the original "tweetId" column and every label row.
school = read_from_csv("./bretschneider-th/school_labels.csv")
school_data = school.rename(columns={"tweetId": "tweet_id"})
school_data = school_data[["tweet_id", "bullyLabel"]]
school_data = school_data.drop_duplicates().reset_index(drop=True)
school_data = school_data.merge(merged_data, on='tweet_id')
# school_data.to_csv("./bretschneider-th/school_data_clean.csv", index=False)
# school_data.label.value_counts() # 1285, 111

# Majority vote per tweet: label 0 when exactly two or three of its rows carry
# bullyLabel 0, otherwise label 1.
# NOTE(review): assumes at most three label rows per tweet and that bullyLabel
# uses 0 for "neutral" -- confirm against the label file.
# The records are collected first and the frame is built once; DataFrame.append
# was removed in pandas 2.0 and row-by-row growth is quadratic.
school_major = pd.DataFrame(
    [
        {"tweet_id": tweet_id, "label": 0 if int((votes == 0).sum()) in (2, 3) else 1}
        for tweet_id, votes in school.groupby("tweetId")["bullyLabel"]
    ],
    columns=["tweet_id", "label"],
)

main_data = read_from_csv("./bretschneider-th/main_labels.csv")
main_data = main_data.rename(columns={"tweetId": "tweet_id"})
main_data = main_data[["tweet_id", "bullyLabel"]]
main_data = main_data.drop_duplicates().reset_index(drop=True)
main_data = main_data.merge(merged_data, on='tweet_id')
# main_data.to_csv("./bretschneider-th/main_data_clean.csv", index=False)
# main_data.label.value_counts() # 2713, 185

In [4]:
# Civil Comments
# Binarise the toxicity score at 0.5, then keep a fixed random sample of
# 5,000 test comments and write it out.
df_test = read_from_csv('./civil-comments/test.csv').rename(columns={"comment_text": "text"})
below_threshold = df_test["toxicity"] < 0.5
df_test["label"] = 0
# Rows that are not below the threshold become label 1.
df_test["label"] = df_test["label"].where(below_threshold, 1)
df_test = df_test.sample(n=5000, random_state=1).reset_index(drop=True)
df_test.to_csv("./civil-comments/data_clean.csv", index=False)
dict(df_test["label"].value_counts()) # {0: 4601, 1: 399}

{0: 4601, 1: 399}

In [30]:
# CMSB - TSD
# https://search.gesis.org/research_data/SDN-10.7802-2251
data = read_from_csv("./cmsb-tsd/sexism_data.csv")
# Label 0 where the "sexist" column equals False, 1 everywhere else.
not_sexist = data["sexist"] == False
data["label"] = 0
data["label"] = data["label"].where(not_sexist, 1)
# data.label.value_counts().to_dict() # {0: 11822, 1: 1809}
data.to_csv("./cmsb-tsd/data_clean.csv", index=False)

In [29]:
# Davidson-THON: multi - 0: hate speech, 1: offensive language, 2: neither
davidson_raw = read_from_csv("./davidson-thon/davidson-thon.csv")
# Harmonise column names with the other datasets and keep only text + label.
davidson_renamed = davidson_raw.rename(columns={"tweet": "text", "class": "label_multi"})
data = davidson_renamed[["text", "label_multi"]]
# data.to_csv("./davidson-thon/data_clean.csv", index=False)
data["label_multi"].value_counts().to_dict()
# Scaled inverse class frequencies from hard-coded per-class counts.
lc = [858, 11514, 2497]
{label: round((1 / lc[label]) * 1000, 1) for label in range(len(lc))} # {0: 1.2, 1: 0.1, 2: 0.4}

{0: 1.2, 1: 0.1, 2: 0.4}

In [4]:
# Founta 2018 - THAS: multi - normal, spam, abusive, hateful
founta_columns = ['id', 'text', 'class']
data = read_from_csv("./founta-2018-thas/large_scale_hatespeechtwitter.csv", header=None, names=founta_columns)
data = data.dropna().reset_index(drop=True)
class2label = {"normal": 0, "spam": 1, "abusive": 2, "hateful": 3}
# Classes missing from the mapping become NaN and are removed by the second dropna.
data = (
    data.assign(label_multi=data["class"].map(class2label))
    .dropna()
    .reset_index(drop=True)
)
# data.to_csv("./founta-2018-thas/data_clean.csv", index=False)
# dict(data.label_multi.value_counts()) # {0: 33325, 1: 7555, 2: 3934, 3: 1638}
lc = [19994, 4533, 2360, 983]
# {label: round((1/count)*1000, 1) for label, count in enumerate(lc)} # {0: 0.1, 1: 0.2, 2: 0.4, 3: 1.0}
# total / (number of classes * class count); the total is 27870 and there are 4 classes.
{label: round(sum(lc) / (len(lc) * count), 1) for label, count in enumerate(lc)}

{0: 0.3, 1: 1.5, 2: 3.0, 3: 7.1}

In [44]:
# Gao 2018 FHC - bin
data_name = "gao-2018-fhc"
data = read_from_csv(f"./{data_name}/data_clean.csv")
label_counts = dict(data.label.value_counts()) # {0: 1093, 1: 435}

max_class = max(label_counts, key=label_counts.get)
min_class = min(label_counts, key=label_counts.get)
print(f"Original rho = {label_counts[max_class] / label_counts[min_class]}\t{label_counts}")

# For every target imbalance ratio rho, keep the largest class untouched and
# down-sample each other class to ceil(largest / rho) rows.
rhos = [5.0, 10.0, 15.0, 20.0]
for rho in rhos:
    num_samples_other_classes = math.ceil(label_counts[max_class] / rho)
    resampled_parts = []
    for label in label_counts:
        class_rows = data[data["label"] == label]
        if label != max_class:
            # Fixed seed so the same subset is drawn on every run.
            class_rows = class_rows.sample(n=num_samples_other_classes, random_state=1)
        resampled_parts.append(class_rows)
    # Concatenate once instead of growing a frame inside the loop.
    df_resampled = pd.concat(resampled_parts)
    outdir = f"./{data_name}_rho={rho}"
    # exist_ok avoids the separate existence check and makes re-runs safe.
    os.makedirs(outdir, exist_ok=True)
    # index=False keeps the schema identical to the other data_clean.csv files.
    df_resampled.to_csv(f"{outdir}/data_clean.csv", index=False)
    print(f"rho={rho}\t\t{dict(df_resampled.label.value_counts())}")

Original rho = 2.5126436781609196	{0: 1093, 1: 435}
rho=5.0		{0: 1093, 1: 219}
rho=10.0		{0: 1093, 1: 110}
rho=15.0		{0: 1093, 1: 73}
rho=20.0		{0: 1093, 1: 55}


In [20]:
# Gibert 2018 - SHS binary
gibert_metadata = read_from_csv("./gibert-2018-shs/annotations_metadata.csv")
# print(dict(data_annotations.label.value_counts())) # {'noHate': 9507, 'hate': 1196, 'relation': 168, 'idk/skip': 73}
# The original string label moves to "class" so "label" can hold the numeric code.
data_annotations = gibert_metadata.rename(columns={"label": "class"})
class2label = {"noHate": 0, "hate": 1, "relation": -1, "idk/skip": -1}
data_annotations["label"] = data_annotations["class"].map(class2label)
# Drop every row that was mapped to the -1 marker.
keep_rows = data_annotations["label"] != -1
data_annotations = data_annotations.loc[keep_rows].reset_index(drop=True)
# dict(data_annotations.label.value_counts()) # {0: 9507, 1: 1196}
def get_text_by_file_id(file_id):
    """Return the first line of the Gibert text file belonging to ``file_id``.

    Args:
        file_id: File name stem; the file read is
            ``./gibert-2018-shs/all_files/<file_id>.txt``.

    Returns:
        The first line of the file, including its trailing newline if present.

    Raises:
        IndexError: If the file is empty.
        OSError: If the file cannot be opened.
    """
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding; this matches the encoding used for the CSV files.
    with open(f"./gibert-2018-shs/all_files/{file_id}.txt", "r", encoding="utf-8") as f:
        # Only the first line is needed, so the rest of the file is not read.
        text = f.readline()
    if not text:
        raise IndexError(f"file for id {file_id!r} is empty")
    return text
# Look up the text of every annotated file, then keep only text + label.
data_annotations["text"] = [get_text_by_file_id(file_id) for file_id in data_annotations["file_id"]]
data = data_annotations[["text", "label"]]
# data.to_csv("./gibert-2018-shs/data_clean.csv", index=False)
# data

In [4]:
# OLID: https://github.com/idontflow/OLID/tree/master
train_data = read_from_csv("./olid/olid-training-v1.0.tsv")
# Distribution of the subtask A labels in the training file.
train_data["subtask_a"].value_counts()

subtask_a
NOT    8840
OFF    4400
Name: count, dtype: int64

In [42]:
# Twitter hate speech - TSA: binary - not hate speech, hate speech
# Only the text column is renamed; the file is otherwise written back unchanged.
data = read_from_csv("./twitter-hate-speech-tsa/data.csv").rename(columns={"tweet": "text"})
data.to_csv("./twitter-hate-speech-tsa/data_clean.csv", index=False)

In [3]:
# US Election 2020: binary - HOF, Non-HOF
def _binarise_us_election_split(source_path, target_path):
    """Read one split, add a 0/1 ``label`` column and write the cleaned CSV.

    ``label`` is 0 where the ``HOF`` column equals "Non-Hateful" and 1
    everywhere else. Returns the labelled DataFrame.
    """
    frame = read_from_csv(source_path)
    is_non_hateful = frame["HOF"] == "Non-Hateful"
    frame["label"] = 0
    frame["label"] = frame["label"].where(is_non_hateful, 1)
    frame.to_csv(target_path, index=False)
    return frame

train_data = _binarise_us_election_split("./us-election-2020/train.tsv", "./us-election-2020/train_clean.csv")

test_data = _binarise_us_election_split("./us-election-2020/test.tsv", "./us-election-2020/test_clean.csv")

In [41]:
# Waseem-and-Hovy 2016: bin - 0: neither, 1: sexism or racism
data_name = "waseem-and-hovy-2016"
data = read_from_csv(f"./{data_name}/srw.csv")
# Label 0 where the category is "none", 1 for every other category.
data["label"] = 0
data["label"] = data["label"].where(data["category"] == "none", 1)
data = data.dropna().reset_index(drop=True)
# data.to_csv("./waseem-and-hovy-2016/data_clean.csv", index=False)
label_counts = dict(data.label.value_counts()) # {0: 7392, 1: 2662}

max_class = max(label_counts, key=label_counts.get)
min_class = min(label_counts, key=label_counts.get)
print(f"Original rho = {label_counts[max_class] / label_counts[min_class]}\t{label_counts}")

# For every target imbalance ratio rho, keep the largest class untouched and
# down-sample each other class to ceil(largest / rho) rows.
rhos = [5.0, 10.0, 15.0, 20.0]
for rho in rhos:
    num_samples_other_classes = math.ceil(label_counts[max_class] / rho)
    resampled_parts = []
    for label in label_counts:
        class_rows = data[data["label"] == label]
        if label != max_class:
            # Fixed seed so the same subset is drawn on every run.
            class_rows = class_rows.sample(n=num_samples_other_classes, random_state=1)
        resampled_parts.append(class_rows)
    # Concatenate once instead of growing a frame inside the loop.
    df_resampled = pd.concat(resampled_parts)
    outdir = f"./{data_name}_rho={rho}"
    # exist_ok avoids the separate existence check and makes re-runs safe.
    os.makedirs(outdir, exist_ok=True)
    # index=False keeps the schema identical to the other data_clean.csv files.
    df_resampled.to_csv(f"{outdir}/data_clean.csv", index=False)
    print(f"rho={rho}\t\t{dict(df_resampled.label.value_counts())}")

Original rho = 2.7768595041322315	{0: 7392, 1: 2662}
rho=5.0		{0: 7392, 1: 1479}
rho=10.0		{0: 7392, 1: 740}
rho=15.0		{0: 7392, 1: 493}
rho=20.0		{0: 7392, 1: 370}


In [3]:
import torch
import gc
# Run Python's garbage collector first, then ask PyTorch to release the CUDA
# memory held by its caching allocator.
gc.collect()
torch.cuda.empty_cache()

# Hate Speech Lexicon

1. Build the combined abusive-language lexicon and save it to 'augmentation_src/abusive_language_lexicon/abusive_lexicon.json'

In [1]:
# 1. Wikipedia English Swear words: https://en.wiktionary.org/wiki/Category:English_swear_words
# One entry per line; the context manager closes the file handle after reading.
with open("augmentation_src/abusive_language_lexicon/wikipedia_english_swear_words.txt", "r") as f:
    wikipedia_swear_words = f.read().splitlines()
len(wikipedia_swear_words) # 60

60

In [2]:
# 2. Wikipedia English profanity: https://en.wikipedia.org/wiki/Category:English_profanity
# One entry per line; the context manager closes the file handle after reading.
with open("augmentation_src/abusive_language_lexicon/wikipedia_english_profanity.txt", "r") as f:
    wikipedia_profanity_words = f.read().splitlines()
len(wikipedia_profanity_words) # 55

55

In [5]:
# 3. MOL: https://github.com/franciellevargas/MOL
mol = read_from_csv("augmentation_src/abusive_language_lexicon/mol.csv")
# Keep only the rows whose contextual label is 1, and of those only the English term.
mol = mol[mol['en-contextual-label'] == 1][['en-american-english']].reset_index(drop=True)
mol_words = mol['en-american-english'].values.tolist() # 610
mol_words_lowercase = [word.lower() for word in mol_words]
mol_words_unique = list(set(mol_words_lowercase)) # drop duplicates
mol_words_str = [word for word in mol_words_unique if not word.isdigit()] # remove only digit 0
mol_words_clean = [] # refine word of formats: "a / b", "a (x) b"
optional_words = []
for word in mol_words_str:
    # Terms made up solely of letters, digits, spaces and hyphens are usable
    # as-is; everything else is set aside and rewritten by hand below.
    if word.replace(" ", "").replace("-", "").isalnum():
        mol_words_clean.append(word)
    else:
        optional_words.append(word)
# print(optional_words) # ["ass kissers (adj) / to kiss someone's ass (v)", 'loser / fool', '(white) trash', 
#                         # "to kiss (someone's) ass", 'discard / blow something off', '(to get) dumped', 
#                         # 'obnoxious / loudmouth', 'cuck / cuckold', 'cuck / cuckold', "buffin' the muffin", 
#                         # 'ladies of the night (hoes)']
# Hand-written replacements for the set-aside entries listed above.
# "cuck" previously carried a stray trailing space.
mol_words_clean += ["ass kissers", "to kiss ass", "loser", "fool", "trash", "white trash", 
                    "discard", "blow off", "dumped", "to get dumped",
                    "obnoxious", "loudmouth", "cuck", "cuckold", "buffin' the muffin", 
                    "ladies of the night", "ladies of the night hoes"]
len(mol_words_clean) # 375

375

In [6]:
# 4. Davidson Lexicon: https://github.com/t-davidson/hate-speech-and-offensive-language/blob/master/lexicons/refined_ngram_dict.csv
davidson_ngram_lexicon = read_from_csv("augmentation_src/abusive_language_lexicon/refined_ngram_dict.csv")
# Only the "ngram" column is used; it is converted to a plain Python list.
davidson_ngram_words = davidson_ngram_lexicon["ngram"].values.tolist()
len(davidson_ngram_words) # 178

178

In [7]:
# 5. Lexicon of Abusive words https://github.com/uds-lsv/lexicon-of-abusive-words
# The context manager closes the file handle after reading.
with open("augmentation_src/abusive_language_lexicon/law_baseLexicon.txt", "r") as f:
    base_lexicon = f.read().splitlines()
# Each line is whitespace-separated: the first field is the entry (the part
# before the first "_" is kept), the second field must be "TRUE" for the entry
# to be included. Lines are split once instead of twice.
base_lexicon_fields = [line.split() for line in base_lexicon]
base_lexicon_clean = [fields[0].split("_")[0] for fields in base_lexicon_fields if fields[1] == "TRUE"]
len(base_lexicon_clean) # 551

551

In [10]:
import ast
# 5. Lexicon of Abusive words (expanded lexicon) https://github.com/uds-lsv/lexicon-of-abusive-words
# The context manager closes the file handle after reading.
with open("augmentation_src/abusive_language_lexicon/law_expandedLexicon.txt", "r") as f:
    expanded_lexicon = f.read().splitlines()
# Each line is whitespace-separated: the first field is the entry (the part
# before the first "_" is kept); entries whose second field contains "-" are
# skipped. Lines are split once instead of twice.
expanded_lexicon_fields = [line.split() for line in expanded_lexicon]
expanded_lexicon_words = [fields[0].split("_")[0] for fields in expanded_lexicon_fields if '-' not in fields[1]] # 2989
expanded_lexicon_words = [word for word in expanded_lexicon_words if not word.isdigit()] # remove only digits
len(expanded_lexicon_words) # 2987

2987

In [19]:
import json
# Concatenate all six sources in a fixed order.
abusive_lexicon = []
for source_words in (wikipedia_swear_words, wikipedia_profanity_words,
                     mol_words_clean, davidson_ngram_words,
                     base_lexicon_clean, expanded_lexicon_words):
    abusive_lexicon.extend(source_words)
# len(abusive_lexicon) # 4208
# Normalise case and surrounding whitespace, then de-duplicate and sort.
abusive_lexicon_lowercase = [entry.lower().strip() for entry in abusive_lexicon]
abusive_lexicon_unique = sorted(set(abusive_lexicon_lowercase))
len(abusive_lexicon_unique) # 3331
lexicon_path = '../data/augmentation_src/abusive_language_lexicon/abusive_lexicon.json'
# Written as a UTF-8 JSON array, one entry per line, non-ASCII characters kept as-is.
with open(lexicon_path, 'w', encoding='utf-8') as f:
    f.write(json.dumps(abusive_lexicon_unique, ensure_ascii=False, indent=4))
# with open(lexicon_path) as f:
#     lexicon = json.load(f)

2. Get the fastText vector for each expression in the lexicon

In [21]:
import os
import fasttext
import numpy as np
# Location of the pre-trained fastText binary. The FASTTEXT_MODEL_PATH
# environment variable overrides it, so the notebook can run on machines where
# the original mount point does not exist; when it is unset the original
# location is used unchanged.
fasttext_model_path = os.environ.get(
    "FASTTEXT_MODEL_PATH",
    '/mounts/Users/cisintern/zhangyaq/imbalanced_text_classification/data/augmentation_src/abusive_language_lexicon/cc.en.300.bin',
)
fasttext_model = fasttext.load_model(fasttext_model_path)

def get_fasttext_vector(text):
    """Return the element-wise mean of the fastText word vectors of ``text``.

    ``text`` is split on whitespace and every token is looked up in the
    module-level ``fasttext_model``; the token vectors are averaged along
    axis 0.
    """
    token_vectors = [fasttext_model.get_word_vector(token) for token in text.split()]
    return np.mean(token_vectors, axis=0)
fasttext_vectors_path = "augmentation_src/abusive_language_lexicon/abusive_lexicon_fasttext_vectors.npy"
# One averaged vector per lexicon entry, in the same order as abusive_lexicon_unique.
fasttext_vectors = [get_fasttext_vector(entry) for entry in abusive_lexicon_unique]
fasttext_matrix = np.stack(fasttext_vectors)
print(fasttext_matrix.shape) # (3331, 300)
with open(fasttext_vectors_path, "wb") as f:
    np.save(f, fasttext_matrix)
# # with open(fasttext_vectors_path, 'rb') as f:
# #     vectors = np.load(f)



(3331, 300)


In [3]:
import torch
# Toy check: pick, for every sample, the entry of `alpha` at that sample's target index.
alpha = torch.tensor([0.1, 0.9])
target = torch.tensor([[0], [1], [0], [0], [1]])
# `target` has a trailing singleton dimension; drop it so the index is 1-D like `alpha`.
torch.gather(alpha, 0, target.squeeze(-1))

tensor([0.1000, 0.9000, 0.1000, 0.1000, 0.9000])

In [4]:
# Toy check: from each row of `pt`, pick the column given by that row's target index.
pt = torch.tensor([[0.4, 0.6],
                   [0.3, 0.7],
                   [0.8, 0.2],
                   [0.75, 0.25],
                   [0.1, 0.9]])
torch.gather(pt, 1, target).squeeze(-1)

tensor([0.4000, 0.7000, 0.8000, 0.7500, 0.9000])