In [2]:
import pandas as pd
def read_from_csv(data_name, header=0, names=None):
    """Read a delimited text file into a DataFrame, picking the separator from the file extension.

    Args:
        data_name: Path of the file to read. A ``.tsv`` extension selects a tab
            separator and a ``.csv`` extension selects a comma. Only the file
            name's extensions are inspected (case-insensitively), so directory
            names containing "tsv" or "csv" no longer influence the choice.
            Multi-part names such as ``x.tsv.gz`` are recognised as well.
        header: Forwarded unchanged to ``pd.read_csv``.
        names: Forwarded unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        NotImplementedError: If the file name has neither a ``.tsv`` nor a
            ``.csv`` extension.
    """
    from pathlib import PurePath

    extensions = [suffix.lower() for suffix in PurePath(str(data_name)).suffixes]
    # TSV is checked first so a name carrying both extensions keeps resolving to TSV.
    if ".tsv" in extensions:
        separator = "\t"
    elif ".csv" in extensions:
        separator = ","
    else:
        raise NotImplementedError("Given data file type is not supported yet.")
    return pd.read_csv(data_name,
                       sep=separator,
                       encoding="utf-8",
                       engine="python",
                       header=header,
                       names=names)

def print_data_info(data, split, label_col):
    """Print a one-line, tab-separated summary of the label distribution in ``data``.

    The line starts with ``split`` and the number of rows, followed by one
    ``<label>: <count>, <share>`` field per distinct label in sorted label
    order, where the share is formatted as a percentage with one decimal.

    Args:
        data: DataFrame holding the column ``label_col``.
        split: Name printed as the first field (e.g. the split name).
        label_col: Name of the column whose values are counted.
    """
    total = len(data)
    counts_by_label = data[label_col].value_counts().to_dict()
    fields = [f"{split}\t{total}"]
    for label in sorted(counts_by_label):
        count = counts_by_label[label]
        fields.append(f"{label}: {count}, {count / total:.1%}")
    print("\t".join(fields))

In [22]:
# AMI: keep only the rows flagged as misogynous and attach a numeric
# multi-class label derived from `misogyny_category`.
# https://live.european-language-grid.eu/catalogue/corpus/7272/download/
train_data = read_from_csv("data/ami/train.tsv")
test_data = read_from_csv("data/ami/test.tsv")
print('AMI\nMulti - {"discredit": 0, "stereotype": 1, "dominance": 2, "sexual harassment": 3, "derailing": 4}')
category2index = {
    "discredit": 0,
    "stereotype": 1,
    "dominance": 2,
    "sexual_harassment": 3,
    "derailing": 4,
}

# Train split: filter, re-index, map category names to ids, export.
train_misogynous = (
    train_data.loc[train_data["misogynous"].ne(0)]
    .reset_index(drop=True)
    .assign(label_multi=lambda frame: frame["misogyny_category"].map(category2index))
)
train_misogynous.to_csv("./data/ami/train_clean.csv", index=False)
# Recorded dict(train_misogynous.label_multi.value_counts()): {0: 1014, 3: 352, 1: 179, 2: 148, 4: 92}

# Test split: same steps as for the train split.
test_misogynous = (
    test_data.loc[test_data["misogynous"].ne(0)]
    .reset_index(drop=True)
    .assign(label_multi=lambda frame: frame["misogyny_category"].map(category2index))
)
test_misogynous.to_csv("./data/ami/test_clean.csv", index=False)
# Recorded dict(test_misogynous.label_multi.value_counts()): {0: 141, 1: 140, 2: 124, 3: 44, 4: 11}

AMI
Multi - {"discredit": 0, "stereotype": 1, "dominance": 2, "sexual harassment": 3, "derailing": 4}


{0: 141, 1: 140, 2: 124, 3: 44, 4: 11}

In [26]:
# Bretschneider-TH: bin - 0: neutral, 1: harassment
# Tweet table shared by both label sets: drop incomplete rows, then exact duplicates.
merged_data = (
    read_from_csv("data/bretschneider-th/merged_data.csv")
    .dropna()
    .reset_index(drop=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

# "School" label set: keep (tweet_id, bullyLabel) pairs and join them onto the tweets.
school_data = (
    read_from_csv("data/bretschneider-th/school_labels.csv")
    .rename(columns={"tweetId": "tweet_id"})
    .loc[:, ["tweet_id", "bullyLabel"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .merge(merged_data, on="tweet_id")
)
# Export currently disabled:
# school_data.to_csv("./data/bretschneider-th/school_data_clean.csv", index=False)
# Recorded school_data.label.value_counts(): 1285, 111

# "Main" label set: identical treatment.
main_data = (
    read_from_csv("data/bretschneider-th/main_labels.csv")
    .rename(columns={"tweetId": "tweet_id"})
    .loc[:, ["tweet_id", "bullyLabel"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .merge(merged_data, on="tweet_id")
)
# Export currently disabled:
# main_data.to_csv("./data/bretschneider-th/main_data_clean.csv", index=False)
# Recorded main_data.label.value_counts(): 2713, 185

Unnamed: 0,tweet_id,label,text
0,260403351269814272,0,You have ruined everything in my life. You sho...
1,260419381920686080,1,You cared when i busted your ass in 2k RT @Dlo...
2,260418970878894080,0,I'm just really not in the mood for this whole...
3,260418962486067200,0,"i have woke up twice out my good sleep , to fi..."
4,261837467689304065,1,@GuyAdami Y r U on Fast? U always wrong. plus ...
...,...,...,...
4117,270957486801043456,0,"Staying in after school to catch up, since my ..."
4118,271036092290068480,0,Compton College is the greatest place ever. I ...
4119,271680047969087488,0,@Aideen61 I'm ready and waiting :D You coming ...
4120,271287985394315264,1,Have a great day at school my niggys.. &lt;3


In [30]:
# CMSB - TSD
# https://search.gesis.org/research_data/SDN-10.7802-2251
data = read_from_csv("data/cmsb-tsd/sexism_data.csv")
# Binary label: 0 where `sexist` equals False, 1 for every other value.
data["label"] = (data["sexist"] != False).astype("int64")
# Recorded data.label.value_counts().to_dict(): {0: 11822, 1: 1809}
data.to_csv("./data/cmsb-tsd/data_clean.csv", index=False)

In [29]:
# Davidson-THON: multi - 0: hate speech, 1: offensive language, 2: neither
data = (
    read_from_csv("data/davidson-thon/davidson-thon.csv")
    .rename(columns={"tweet": "text", "class": "label_multi"})
    .loc[:, ["text", "label_multi"]]
)
# Export currently disabled:
# data.to_csv("./data/davidson-thon/data_clean.csv", index=False)
data["label_multi"].value_counts().to_dict()  # evaluated but not displayed (not the last expression)
# Hard-coded per-label counts, indexed by label id.
# NOTE(review): presumably counts from one data split - confirm where they come from.
lc = [858, 11514, 2497]
# Inverse count per label, scaled by 1000 and rounded to one decimal.
dict((label, round((1 / count) * 1000, 1)) for label, count in enumerate(lc))  # {0: 1.2, 1: 0.1, 2: 0.4}

{0: 1.2, 1: 0.1, 2: 0.4}

In [None]:
# Founta 2018 - THAS: multi - normal, spam, abusive, hateful
class2label = {"normal": 0, "spam": 1, "abusive": 2, "hateful": 3}
data = (
    read_from_csv(
        "data/founta-2018-thas/large_scale_hatespeechtwitter.csv",
        header=None,
        names=['id', 'text', 'class'],
    )
    .dropna()
    .reset_index(drop=True)
    # Class names missing from class2label map to NaN and are removed by the next dropna.
    .assign(label_multi=lambda frame: frame["class"].map(class2label))
    .dropna()
    .reset_index(drop=True)
)
data.to_csv("./data/founta-2018-thas/data_clean.csv", index=False)
# Recorded dict(data.label_multi.value_counts()): {0: 33325, 1: 7555, 2: 3934, 3: 1638}
# Recorded weight computation (kept disabled):
# lc = [19994, 4533, 2360, 983]
# {label: round((1/count)*1000, 1) for label, count in enumerate(lc)} # {0: 0.1, 1: 0.2, 2: 0.4, 3: 1.0}

In [24]:
# Gao 2018 FHC - bin
# Load the already cleaned file and display how many rows each label has.
data = read_from_csv("data/gao-2018-fhc/data_clean.csv")
dict(data["label"].value_counts())  # {0: 1093, 1: 435}

{0: 1093, 1: 435}

In [20]:
# Gibert 2018 - SHS binary
# Raw annotation labels map to 0/1; "relation" and "idk/skip" map to -1 and are discarded.
class2label = {"noHate": 0, "hate": 1, "relation": -1, "idk/skip": -1}
# Recorded raw label counts: {'noHate': 9507, 'hate': 1196, 'relation': 168, 'idk/skip': 73}
data_annotations = (
    read_from_csv("data/gibert-2018-shs/annotations_metadata.csv")
    # Free the name "label" for the numeric column created below.
    .rename(columns={"label": "class"})
    .assign(label=lambda frame: frame["class"].map(class2label))
    .loc[lambda frame: frame["label"].ne(-1)]
    .reset_index(drop=True)
)
# Recorded dict(data_annotations.label.value_counts()): {0: 9507, 1: 1196}
def get_text_by_file_id(file_id):
    """Return the first line of the text file that belongs to ``file_id``.

    Args:
        file_id: Identifier interpolated into
            ``data/gibert-2018-shs/all_files/{file_id}.txt``.

    Returns:
        The first line of the file, including its line terminator if the file
        has one, or an empty string when the file is empty.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # Decode explicitly as UTF-8, like the CSV loader in this notebook, instead of
    # relying on the platform's default encoding.
    with open(f"data/gibert-2018-shs/all_files/{file_id}.txt", "r", encoding="utf-8") as f:
        # readline() reads only the first line and yields "" for an empty file,
        # whereas readlines()[0] loaded every line and raised IndexError when empty.
        text = f.readline()
    return text
# Look up the text for every annotation row via its file_id, then keep only text and label.
data_annotations["text"] = data_annotations["file_id"].map(get_text_by_file_id)
data = data_annotations.loc[:, ["text", "label"]]
# Export currently disabled:
# data.to_csv("./data/gibert-2018-shs/data_clean.csv", index=False)

In [42]:
# Twitter hate speech - TSA: binary - not hate speech, hate speech
# Only the tweet column is renamed to "text"; everything else is exported as loaded.
data = read_from_csv("data/twitter-hate-speech-tsa/data.csv").rename(columns={"tweet": "text"})
data.to_csv("./data/twitter-hate-speech-tsa/data_clean.csv", index=False)

In [10]:
# US Election 2020: binary - HOF, Non-HOF
# In both splits the label is 0 where HOF equals "Non-Hateful" and 1 for any other value.
train_data = read_from_csv("data/us-election-2020/train.tsv")
train_data["label"] = (train_data["HOF"] != "Non-Hateful").astype("int64")
train_data.to_csv("./data/us-election-2020/train_clean.csv", index=False)

test_data = read_from_csv("data/us-election-2020/test.tsv")
test_data["label"] = (test_data["HOF"] != "Non-Hateful").astype("int64")
test_data.to_csv("./data/us-election-2020/test_clean.csv", index=False)

In [13]:
# Waseem-and-Hovy 2016: bin - 0: neither, 1: sexism or racism
data = (
    read_from_csv("data/waseem-and-hovy-2016/srw.csv")
    # Label is 0 where category equals "none" and 1 otherwise.
    .assign(label=lambda frame: frame["category"].ne("none").astype("int64"))
    # Rows with any missing value are discarded after labelling.
    .dropna()
    .reset_index(drop=True)
)
# Export currently disabled:
# data.to_csv("./data/waseem-and-hovy-2016/data_clean.csv", index=False)
data["label"].value_counts()

label
0    7392
1    2662
Name: count, dtype: int64

In [1]:
import torch
# Ask PyTorch to release the CUDA memory it is caching but not currently using
# (see the torch.cuda.empty_cache documentation for the exact semantics).
torch.cuda.empty_cache()

In [8]:
import torch
# Sanity check: CrossEntropyLoss(weight=w) with its default reduction should equal
# the sum of the per-sample losses from reduction='none' divided by the sum of the
# weights of the target classes.
x = torch.randn(10, 5)
target = torch.randint(0, 5, (10,))

weights = torch.tensor([1., 2., 3., 4., 5.])
criterion_weighted = torch.nn.CrossEntropyLoss(weight=weights)
loss_weighted = criterion_weighted(x, target)

# Manual version: unreduced losses, then normalise by the target-class weights.
criterion_weighted_manual = torch.nn.CrossEntropyLoss(weight=weights, reduction='none')
loss_weighted_manual = criterion_weighted_manual(x, target)
loss_weighted_manual = loss_weighted_manual.sum() / weights[target].sum()

# Compare with a tolerance: both values are floats computed along different
# reduction paths, so exact `==` can report False on a rounding difference
# for some random draws even though the two formulas agree.
print(torch.isclose(loss_weighted, loss_weighted_manual))

tensor(True)


In [30]:
import torch
from torch.nn import Softmax
# Toy batch: two rows of three scores each.
logits = torch.tensor(
    [
        [0.1, -0.3, 0.6],
        [-0.1, 0.2, 0.1],
    ]
)
# Value per class index (presumably class prior probabilities, judging by the name).
prior = dict(enumerate((0.6, 0.3, 0.1)))
# One class index per row of `logits`.
y = torch.tensor([0, 2])

In [41]:
# Look up the prior of each entry of y and add a trailing axis, giving one value per row.
prior_ys = torch.tensor([prior[class_idx] for class_idx in y.tolist()])[:, None]

In [42]:
# Divide every row of `logits` by the prior of that row's class in `y`; `prior_ys` has
# one column, so it broadcasts across the class axis.
# This must use `logits`: in this notebook `x` is the (10, 5) random tensor from the
# cross-entropy check, which cannot broadcast against the two-row `prior_ys`.
logits / prior_ys

tensor([[ 0.1667, -0.5000,  1.0000],
        [-1.0000,  2.0000,  1.0000]])