In [2]:
import numpy as np
import pandas as pd
import pickle as pkl
from tqdm import tqdm

from transformers import pipeline

In [3]:
# Build the Hugging Face text-classification pipeline that serves as the
# toxicity scorer for the rest of the notebook.
toxpipe = pipeline(
    task="text-classification",
    model="cooperleong00/deberta-v3-large_toxicity-scorer",
)

In [4]:
# Original data: two tab-separated files (annotations and annotated comments).
# read_table uses a tab separator by default.
dfa = pd.read_table('../data/toxicity_annotations.tsv')
dfc = pd.read_table('../data/toxicity_annotated_comments.tsv')

# Dialect subsets: one JSON-lines file each, loaded in a fixed order and
# unpacked into their own frames.
aave, nigerianD, indianD, singlish = (
    pd.read_json(path, lines=True)
    for path in (
        "../data/aave.jsonl",
        "../data/nigerianD.jsonl",
        "../data/indianD.jsonl",
        "../data/singlish.jsonl",
    )
)

# Development subset: the first 1000 rows of the annotation table.
dfa_sample = dfa.head(1000)

In [5]:
# Preview the first five rows of the development sample.
dfa_sample.head(n=5)

Unnamed: 0,rev_id,worker_id,toxicity,toxicity_score
0,2232.0,723,0,0.0
1,2232.0,4000,0,0.0
2,2232.0,3989,0,1.0
3,2232.0,3341,0,0.0
4,2232.0,1574,0,1.0


In [7]:
# Distinct values that occur in the toxicity_score column of the sample.
set(dfa_sample["toxicity_score"].tolist())

{-2.0, -1.0, 0.0, 1.0, 2.0}

In [None]:
# dfc.head()

In [8]:
# Preview the first five rows of the AAVE dialect subset.
aave.head(n=5)

Unnamed: 0,text,rules
0,This:NEWLINE_TOKEN:One might can make an analo...,"[existential_there, mass_noun_plurals, complet..."
1,`NEWLINE_TOKENNEWLINE_TOKEN:Clarification for ...,"[shadow_pronouns, existential_there, regulariz..."
2,Elected or Electoral? JHK,[]
3,`This such a fun entry. DevotchkaNEWLINE_TOK...,"[regularized_plurals, their_they, future_sub_g..."
4,Please do relate that ozone hole to increase i...,"[zero_plural, double_modals, demonstrative_for..."


In [9]:
# Smoke-test the scorer: run the pipeline on each of the first ten AAVE texts,
# one call per text, and print the raw result of every call.
for sent in aave["text"].head(10):
    results = toxpipe(sent)
    print(results)

[{'label': 'LABEL_0', 'score': 0.8678526282310486}]
[{'label': 'LABEL_0', 'score': 0.8393023014068604}]
[{'label': 'LABEL_0', 'score': 0.9183971881866455}]
[{'label': 'LABEL_1', 'score': 0.5532406568527222}]
[{'label': 'LABEL_0', 'score': 0.6342032551765442}]
[{'label': 'LABEL_0', 'score': 0.8806992173194885}]
[{'label': 'LABEL_0', 'score': 0.612224280834198}]
[{'label': 'LABEL_0', 'score': 0.6925280094146729}]
[{'label': 'LABEL_0', 'score': 0.7933699488639832}]
[{'label': 'LABEL_0', 'score': 0.8887091875076294}]


In [10]:
# toxicity_score values of the first ten rows of the development sample
# (select the column first, then slice by position).
dfa_sample["toxicity_score"].iloc[:10]

0    0.0
1    0.0
2    1.0
3    0.0
4    1.0
5    1.0
6    1.0
7    0.0
8    1.0
9   -1.0
Name: toxicity_score, dtype: float64