In [6]:
import pandas as pd

In [3]:
def read_from_csv(data_name):
    """Load a delimited text file into a DataFrame, picking the separator by extension.

    The separator is chosen from the extensions of the file *name* only, so
    directory names that happen to contain "tsv" or "csv" no longer influence
    how the file is parsed. Multi-part extensions such as ``train.tsv.gz`` are
    still recognised because every suffix of the file name is inspected.

    Args:
        data_name: Path (str or path-like) of the file to read. A ``.tsv``
            extension selects tab separation, a ``.csv`` extension selects
            comma separation. Matching is case-insensitive.

    Returns:
        pandas.DataFrame read with UTF-8 encoding, the python parser engine,
        and the first row used as the header.

    Raises:
        NotImplementedError: If the file name has neither a ``.tsv`` nor a
            ``.csv`` extension.
    """
    # Local import so the function works regardless of which cells ran before.
    from pathlib import Path

    # Only the final path component is examined; e.g. "tsv_exports/x.csv" -> [".csv"].
    suffixes = [suffix.lower() for suffix in Path(data_name).suffixes]

    # ".tsv" is tested first, mirroring the precedence of the original checks.
    if ".tsv" in suffixes:
        separator = "\t"
    elif ".csv" in suffixes:
        separator = ","
    else:
        raise NotImplementedError("Given data file type is not supported yet.")

    return pd.read_csv(data_name,
                       sep=separator,
                       encoding="utf-8",
                       engine="python",
                       header=0)

def print_data_info(data, split, label_col):
    """Print a one-line, tab-separated summary of the label distribution.

    The line starts with the split name and the number of rows, followed by
    one ``<label>: <count>, <share>`` entry per distinct label in sorted
    order, where the share is the count divided by the number of rows,
    formatted as a percentage with one decimal.

    Args:
        data: DataFrame holding the samples.
        split: Name of the split, printed as the first field.
        label_col: Name of the column in ``data`` that holds the labels.
    """
    total = len(data)
    counts = data[label_col].value_counts().to_dict()

    pieces = [f"{split}\t{total}"]
    for label in sorted(counts):
        n_label = counts[label]
        pieces.append(f"\t{label}: {n_label}, {n_label / total:.1%}")

    print("".join(pieces))

In [10]:
# US Election 2020: binary - HOF, Non-HOF
# "label" is 0 where the HOF column equals "Non-Hateful" and 1 for every other value.
train_data = read_from_csv("data/us-election-2020/train.tsv")
train_data["label"] = (~train_data["HOF"].eq("Non-Hateful")).astype("int64")
train_data.to_csv("./data/us-election-2020/train_clean.csv", index=False)

test_data = read_from_csv("data/us-election-2020/test.tsv")
test_data["label"] = (~test_data["HOF"].eq("Non-Hateful")).astype("int64")
test_data.to_csv("./data/us-election-2020/test_clean.csv", index=False)

In [11]:
# Davidson-THON: multi - 0: hate speech, 1: offensive language, 2: neither
# Keep only the tweet text and its class, under the names "text" and "label_multi".
data = (
    read_from_csv("data/davidson-thon/davidson-thon.csv")
    .rename(columns={"tweet": "text", "class": "label_multi"})[["text", "label_multi"]]
)
data.to_csv("./data/davidson-thon/data_clean.csv", index=False)

In [2]:
import torch
# Build a list of 80 zero-dimensional integer tensors and stack them into one 1-D tensor.
y = [
    torch.tensor(value)
    for value in (
        1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
        1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
    )
]
torch.stack(y)

tensor([1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
        1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

In [1]:
import pandas as pd
# Setting the option to None removes the limit on how many columns pandas displays.
pd.options.display.max_columns = None

In [8]:
# check if all samples appear in the resampled training set
# (number of distinct texts per label after resampling)
train = read_from_csv("resampled_train_data.csv")
train.groupby("label")["text"].nunique()

label
0    1686
1     234
Name: text, dtype: int64

In [7]:
# Number of rows per label in the resampled training set.
train["label"].value_counts()

label
0    15380
1    15340
Name: count, dtype: int64

In [11]:
from torch import tensor
from torchmetrics.classification import Precision, Recall, F1Score
# Toy example: compare macro, micro and per-class F1 on the same predictions.
target = tensor([0, 0, 0, 0, 1, 0])
preds = tensor([1, 0, 0, 1, 1, 0])
f1_macro, f1_micro, f1_per_label = (
    F1Score(task="multiclass", num_classes=2, average=averaging)
    for averaging in ("macro", "micro", "none")
)
f1_macro(preds, target), f1_micro(preds, target), f1_per_label(preds, target)

(tensor(0.6250), tensor(0.6667), tensor([0.7500, 0.5000]))

In [4]:
from sklearn.metrics import f1_score
# scikit-learn metrics take (y_true, y_pred); torchmetrics takes (preds, target).
# The arguments were previously passed in torchmetrics order. F1 is symmetric in
# the two arguments, so the printed values do not change, but the call now follows
# the documented signature and matches accuracy_score(y_true, y_pred) used elsewhere
# in this notebook.
f1_score(target, preds, average='macro'), f1_score(target, preds, average='micro')

(0.625, 0.6666666666666666)

In [20]:
from torch import tensor
from sklearn.metrics import accuracy_score
# Toy 3-class example: 2 of the 4 predictions match the ground truth.
y_true, y_pred = tensor([0, 1, 2, 2]), tensor([0, 2, 1, 2])
accuracy_score(y_true=y_true, y_pred=y_pred)

0.5

In [21]:
from torchmetrics.classification import Precision, Recall, F1Score, Accuracy, AveragePrecision
# Same toy data as the scikit-learn check, scored with torchmetrics' weighted accuracy.
# NOTE(review): num_classes is 4 while the toy labels only use 0..2 - confirm this is intended.
accuracy_settings = {"task": "multiclass", "num_classes": 4, "average": "weighted"}
accuracy = Accuracy(**accuracy_settings)
accuracy(y_pred, y_true)

tensor(0.5000)

In [1]:
import torch
# Release unoccupied cached GPU memory held by PyTorch's caching allocator
# (per the torch.cuda.empty_cache documentation); memory of live tensors is not freed.
torch.cuda.empty_cache()