In [1]:
from ast import literal_eval
import pandas as pd
import random
from toxic_spans.evaluation.semeval2021 import f1
import re
import numpy as np

In [12]:
def normalize(text):
    text = re.sub('`|‘|’', "'", text)
    text = re.sub('”|“', '"', text)
    text = re.sub('\n|…|[^\x00-\x7F]|\[|\]', ' ', text)
    return text

def extract_toxic_span(spans):
    spans = list(spans)
    length = len(spans)
    segments = []
    span = []
    for i in range(length):
        if len(span) == 0:
            span.append(spans[i])
        if i != length-1 and spans[i+1] == spans[i] + 1:
            span.append(spans[i+1])
        else:
            segments.append(span)
            span = []
    return segments

In [13]:
tsd_train = pd.read_csv('toxic_spans/data/tsd_train.csv')
tsd_trial = pd.read_csv('toxic_spans/data/tsd_trial.csv')

In [14]:
tsd_train.spans = tsd_train.spans.apply(literal_eval)
tsd_trial.spans = tsd_trial.spans.apply(literal_eval)

In [23]:
def transform(tsd):
    data = []
    for row in tsd.iterrows():
        spans = row[1]['spans']
        text = normalize(row[1]['text'])
        temp = []
        text_spans = []
        if spans:
            segments = extract_toxic_span(spans)
            for seg in segments:
                temp.append([seg[0], seg[-1]])
                text_spans.append(text[seg[0]: seg[-1]+1])
        if len(temp) == 1 and temp[0][1] - temp[0][0] >= 60:
            continue
        data.append({'text': text, 'spans': temp, 'text_spans': text_spans})
    return data

In [24]:
def replace(text, pattern, replacement, pos):
    matches = [0]

    def capture_and_replace(match, ret):
        matches.extend([match.start() + 1, match.end()])
        return ret

    l = len(text)
    text = re.sub(pattern, lambda match: capture_and_replace(match, replacement), text, flags=re.IGNORECASE)
    matches.append(l)
    slices = np.array_split(matches, int(len(matches) / 2))
    res = []
    for s in slices:
        res += pos[s[0]:s[1]]
    assert len(text) == len(res)
    return text, res

In [25]:
def preprocess(text, pos):
    # remove urls
    text, pos = replace(text, r'http\S+', ' ', pos)
    text, pos = replace(text, r'www.\S+', ' ', pos)
    
    # collapse duplicated punctuations 
    punc = ',. !?\"\''
    for c in punc:
        pat = '([' + c + ']{2,})'
        text, pos = replace(text, pat, c, pos)
    
    # strip text
    text = text.lstrip(' |.|,|!|?')
    pos = pos[len(pos)-len(text):]
    text = text.rstrip(' |.|,|!|?')
    pos = pos[:len(text)]
    assert len(text) == len(pos)
    return text, pos

In [26]:
from nltk.tokenize.api import TokenizerI

In [27]:
class MacIntyreContractions:
    """
    List of contractions adapted from Robert MacIntyre's tokenizer.
    """

    CONTRACTIONS2 = [
        r"(?i)\b(can)(?#X)(not)\b",
        r"(?i)\b(d)(?#X)('ye)\b",
        r"(?i)\b(gim)(?#X)(me)\b",
        r"(?i)\b(gon)(?#X)(na)\b",
        r"(?i)\b(got)(?#X)(ta)\b",
        r"(?i)\b(lem)(?#X)(me)\b",
        r"(?i)\b(mor)(?#X)('n)\b",
        r"(?i)\b(wan)(?#X)(na)\s",
    ]
    CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
    CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]


class NLTKWordTokenizer(TokenizerI):

    # Starting quotes.
    STARTING_QUOTES = [
        (re.compile(u"([«“‘„]|[`]+)", re.U), r" \1 "),
        (re.compile(r"(\")"), r" \1 "),
        (re.compile(r"([ \(\[{<])(\"|\'{2})"), r'\1 " '),
        (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d)(\w)\b", re.U), r"\1 \2"),
    ]

    # Ending quotes.
    ENDING_QUOTES = [
        (re.compile(u"( ')", re.U), r" \1 "),
        (re.compile(r'"'), ' " '),
        (re.compile(r"(\S)(\")"), r"\1 \2 "),
        (re.compile(r"([^' ]['ll|'LL|'re|'RE|'ve|'VE|n't|N'T]) "), r"\1 "),
    ]

    PUNCTUATION = [
        (re.compile(r'([^\.])(\.)([\]\)}>"\'' u"»”’ " r"]*)\s*$", re.U), r"\1 \2 \3 "),
        (re.compile(r"([:])([^\d])"), r" \1 \2"),
        (re.compile(r"([:,])$"), r" \1 "),
        (re.compile(r"\.{2,}", re.U), r" \g<0> "), # See https://github.com/nltk/nltk/pull/2322
        (re.compile(r"([\w;@*#$%?!&\^\'\"])+"), r" \g<0> "),
        (
            re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
            r"\1 \2\3 ",
        ),  # Handles the final period.
        (re.compile(r"([,])+"), r" \g<0> "),
        (re.compile(r"([^'])' "), r"\1 ' "),
    ]

    # Pads parentheses
    PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")

    # Optionally: Convert parentheses, brackets and converts them to PTB symbols.
    CONVERT_PARENTHESES = [
        (re.compile(r"\("), "-LRB-"),
        (re.compile(r"\)"), "-RRB-"),
        (re.compile(r"\["), "-LSB-"),
        (re.compile(r"\]"), "-RSB-"),
        (re.compile(r"\{"), "-LCB-"),
        (re.compile(r"\}"), "-RCB-"),
    ]

    DOUBLE_DASHES = (re.compile(r"--"), r" -- ")

    # List of contractions adapted from Robert MacIntyre's tokenizer.
    _contractions = MacIntyreContractions()
    CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
    CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))

    def tokenize(self, text, convert_parentheses=False, return_str=False):
        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Handles parentheses.
        regexp, substitution = self.PARENS_BRACKETS
        text = regexp.sub(substitution, text)
        # Optionally convert parentheses
        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Handles double dash.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        # add extra space to make things easier
        text = " " + text + " "

        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)


        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r" \1 \2 ", text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r" \1 \2 ", text)

        return text if return_str else text.split()

In [28]:
tokenizer = NLTKWordTokenizer()

In [29]:
def tokenize(text, pos):
    tokens = tokenizer.tokenize(text)
    alignment = []
    start = 0
    for token in tokens:
        res =  text.find(token, start)
        alignment.append(pos[res: res + len(token)])
        start = res + len(token)
    assert len(tokens) == len(alignment)
    return tokens, alignment

In [12]:
def annotate(spans, alignment, tokens):
    i = 0
    annotations = []
    if len(spans) != 0:
        for span in spans:
            while i < len(alignment):
                if alignment[i][-1] <= span[0]:
                    annotations.append('O')
                    i += 1
                elif alignment[i][0] <= span[0] < alignment[i][-1]:
                    annotations.append('B-T')
                    i += 1
                elif span[0] < alignment[i][0] < span[-1]:
                    annotations.append('I-T')
                    i += 1
                elif alignment[i][0] >= span[-1]:
                    annotations.append('O')
                    i += 1
                    break
    annotations.extend(['O'] * (len(tokens) - len(annotations)))
    return annotations

In [13]:
def prepare_data(data):
    formated_data = []
    for d in data:
        text = d['text']
        pos = [i for i in range(len(text))]
        text, pos = preprocess(text, pos)
        tokens, alignment = tokenize(text, pos)
        annotations = annotate(d['spans'], alignment, tokens)
        ls = [[tokens[i], annotations[i]] for i in range(len(tokens))]
        formated_data.extend(ls)
        formated_data.append([None])
    return formated_data

In [14]:
train = transform(tsd_train)
train = prepare_data(train)
train = pd.DataFrame(train)
train.rename({0:'tokens', 1:'POS'})
train.to_csv('train.txt',index=False, columns=None, header=False, sep='\t')
test = transform(tsd_trial)
test = prepare_data(test)
test = pd.DataFrame(test)
test.rename({0:'tokens', 1:'POS'})
test.to_csv('test.txt',index=False, columns=None, header=False, sep='\t')

In [18]:
from flair.datasets import ColumnCorpus
from torch.optim.adamw import AdamW
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.embeddings import *
from flair.training_utils import AnnealOnPlateau

In [16]:
columns = {0: 'text', 1: 'ner'}
corpus = ColumnCorpus('', columns, train_file='train.txt', dev_file='test.txt')

2021-08-15 19:32:20,880 Reading data from .
2021-08-15 19:32:20,882 Train: train.txt
2021-08-15 19:32:20,884 Dev: test.txt
2021-08-15 19:32:20,885 Test: None


In [17]:
embedding_types = [
    TransformerWordEmbeddings('unitary/unbiased-toxic-roberta', layers="-1, -6, 0", fine_tune=False, allow_long_sentences=True),
    WordEmbeddings('crawl'),
    BytePairEmbeddings('en'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward')
]
embeddings = StackedEmbeddings(embeddings=embedding_types)

Some weights of the model checkpoint at unitary/unbiased-toxic-roberta were not used when initializing RobertaModel: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at unitary/unbiased-toxic-roberta and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
tag_type = 'ner'
tag_dict = corpus.make_tag_dictionary(tag_type=tag_type)
model = SequenceTagger.load('checkpoint/final-model.pt')
                       
trainer = ModelTrainer(model, corpus, optimizer=AdamW)                     

2021-08-15 19:33:48,291 loading file checkpoint/final-model.pt


In [None]:
trainer.train('checkpoint',
              learning_rate=0.001,
              min_learning_rate=0.0000001,
              max_epochs=50,
              scheduler=AnnealOnPlateau)

2021-08-15 19:34:10,569 ----------------------------------------------------------------------------------------------------
2021-08-15 19:34:10,573 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): TransformerWordEmbeddings(
      (model): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0): RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(in_features=768, out_features=768, bias=True)
                  (key): Linear(in_features=768, out_features=768, bias=True)


2021-08-15 19:34:10,574 ----------------------------------------------------------------------------------------------------
2021-08-15 19:34:10,575 Corpus: "Corpus: 6981 train + 681 dev + 776 test sentences"
2021-08-15 19:34:10,576 ----------------------------------------------------------------------------------------------------
2021-08-15 19:34:10,576 Parameters:
2021-08-15 19:34:10,577  - learning_rate: "0.001"
2021-08-15 19:34:10,578  - mini_batch_size: "32"
2021-08-15 19:34:10,578  - patience: "3"
2021-08-15 19:34:10,580  - anneal_factor: "0.5"
2021-08-15 19:34:10,581  - max_epochs: "50"
2021-08-15 19:34:10,582  - shuffle: "True"
2021-08-15 19:34:10,583  - train_with_dev: "False"
2021-08-15 19:34:10,583  - batch_growth_annealing: "False"
2021-08-15 19:34:10,584 ----------------------------------------------------------------------------------------------------
2021-08-15 19:34:10,584 Model training base path: "checkpoint"
2021-08-15 19:34:10,585 ---------------------------------

2021-08-15 20:36:31,554 EPOCH 5 done: loss 3.7034 - lr 0.0010000
2021-08-15 20:37:27,204 DEV : loss 3.1428592205047607 - score 0.6337
2021-08-15 20:37:27,418 BAD EPOCHS (no improvement): 0
saving best model
2021-08-15 20:37:41,731 ----------------------------------------------------------------------------------------------------
2021-08-15 20:38:53,938 epoch 6 - iter 21/219 - loss 3.41599864 - samples/sec: 9.31 - lr: 0.001000
2021-08-15 20:40:03,671 epoch 6 - iter 42/219 - loss 3.62802636 - samples/sec: 9.64 - lr: 0.001000
2021-08-15 20:41:12,225 epoch 6 - iter 63/219 - loss 3.54350084 - samples/sec: 9.80 - lr: 0.001000
2021-08-15 20:42:25,453 epoch 6 - iter 84/219 - loss 3.59823447 - samples/sec: 9.18 - lr: 0.001000
2021-08-15 20:43:37,952 epoch 6 - iter 105/219 - loss 3.63932765 - samples/sec: 9.27 - lr: 0.001000
2021-08-15 20:44:49,247 epoch 6 - iter 126/219 - loss 3.65036504 - samples/sec: 9.43 - lr: 0.001000
2021-08-15 20:45:52,704 epoch 6 - iter 147/219 - loss 3.60572270 - sampl

2021-08-15 21:53:21,914 epoch 11 - iter 168/219 - loss 3.22610756 - samples/sec: 10.19 - lr: 0.000500
2021-08-15 21:54:42,410 epoch 11 - iter 189/219 - loss 3.24449078 - samples/sec: 8.35 - lr: 0.000500
2021-08-15 21:55:57,266 epoch 11 - iter 210/219 - loss 3.23232382 - samples/sec: 8.98 - lr: 0.000500
2021-08-15 21:56:24,809 ----------------------------------------------------------------------------------------------------
2021-08-15 21:56:24,810 EPOCH 11 done: loss 3.2265 - lr 0.0005000
2021-08-15 21:57:26,684 DEV : loss 2.8549885749816895 - score 0.6296
2021-08-15 21:57:26,863 BAD EPOCHS (no improvement): 2
2021-08-15 21:57:26,864 ----------------------------------------------------------------------------------------------------
2021-08-15 21:58:42,008 epoch 12 - iter 21/219 - loss 3.43883891 - samples/sec: 8.94 - lr: 0.000500
2021-08-15 21:59:46,757 epoch 12 - iter 42/219 - loss 3.26973660 - samples/sec: 10.38 - lr: 0.000500
2021-08-15 22:00:54,421 epoch 12 - iter 63/219 - loss 3

2021-08-15 23:04:19,247 epoch 17 - iter 63/219 - loss 3.20240666 - samples/sec: 10.18 - lr: 0.000250
2021-08-15 23:05:25,366 epoch 17 - iter 84/219 - loss 3.12559359 - samples/sec: 10.16 - lr: 0.000250
2021-08-15 23:06:40,607 epoch 17 - iter 105/219 - loss 3.13637060 - samples/sec: 8.93 - lr: 0.000250
2021-08-15 23:07:51,640 epoch 17 - iter 126/219 - loss 3.10351694 - samples/sec: 9.46 - lr: 0.000250
2021-08-15 23:09:09,140 epoch 17 - iter 147/219 - loss 3.12589597 - samples/sec: 8.67 - lr: 0.000250
2021-08-15 23:10:17,346 epoch 17 - iter 168/219 - loss 3.10826659 - samples/sec: 9.85 - lr: 0.000250
2021-08-15 23:11:23,594 epoch 17 - iter 189/219 - loss 3.09855119 - samples/sec: 10.14 - lr: 0.000250
2021-08-15 23:12:33,178 epoch 17 - iter 210/219 - loss 3.08162946 - samples/sec: 9.66 - lr: 0.000250
2021-08-15 23:13:01,079 ----------------------------------------------------------------------------------------------------
2021-08-15 23:13:01,080 EPOCH 17 done: loss 3.0902 - lr 0.0002500


2021-08-16 00:20:12,679 EPOCH 22 done: loss 2.9756 - lr 0.0001250
2021-08-16 00:21:12,171 DEV : loss 2.7848010063171387 - score 0.6269
2021-08-16 00:21:12,351 BAD EPOCHS (no improvement): 3
2021-08-16 00:21:12,352 ----------------------------------------------------------------------------------------------------
2021-08-16 00:22:27,633 epoch 23 - iter 21/219 - loss 2.94114256 - samples/sec: 8.93 - lr: 0.000125
2021-08-16 00:23:41,807 epoch 23 - iter 42/219 - loss 2.92557812 - samples/sec: 9.06 - lr: 0.000125
2021-08-16 00:24:50,071 epoch 23 - iter 63/219 - loss 2.95369274 - samples/sec: 9.84 - lr: 0.000125
2021-08-16 00:25:59,282 epoch 23 - iter 84/219 - loss 2.93718009 - samples/sec: 9.71 - lr: 0.000125
2021-08-16 00:27:11,824 epoch 23 - iter 105/219 - loss 2.98857424 - samples/sec: 9.26 - lr: 0.000125
2021-08-16 00:28:25,477 epoch 23 - iter 126/219 - loss 2.97587980 - samples/sec: 9.12 - lr: 0.000125
2021-08-16 00:29:31,664 epoch 23 - iter 147/219 - loss 2.92886341 - samples/sec: 10

In [7]:
from tqdm import tqdm
from flair.models import SequenceTagger
from flair.data import Sentence
from toxic_spans.evaluation.semeval2021 import f1
import pandas as pd

In [8]:
model = SequenceTagger.load('checkpoint/best-model.pt')

2021-08-16 02:32:41,084 loading file checkpoint/best-model.pt


In [9]:
def evaluate(model, sentences, alignments):
    predictions = []
    for idx, sentence in enumerate(sentences):
        model.predict(sentence)
        prediction = []
        for span in sentence.get_spans():
            for token in span:
                prediction.extend(alignments[idx][token.idx-1])
        predictions.append(prediction)
    return predictions

In [None]:
df = pd.read_csv('toxic_spans/data/tsd_test.csv')['text']
sentences, alignments = [], []
for text in df:
    pos = [i for i in range(len(text))]
    text, pos = preprocess(text, pos)
    tokens, alignment = tokenize(text, pos)
    sentences.append(Sentence(tokens))
    alignments.append(alignment)
predictions = evaluate(model, sentences, alignments)

In [32]:
df = pd.read_csv('toxic_spans/data/tsd_test.csv')['spans']
df = df.map(literal_eval).tolist()

In [33]:
score = 0
for i in range(len(df)):
    score += f1(predictions[i], df[i])
score /= len(df)
print(score)

0.6974124686544945


In [51]:
for text in df['text']:
    print(text)

That's right. They are not normal. And I am starting from the premise that they are ABNORMAL. Proceed wth the typical racist, bigot, sexist rubbish.
Thanks!
"Watch people die from taking away their healthcare"

DING DING DING!  Winner of stupid post of the day award!
tens years ago i contacted the PDR and suggested that the time might be good to work with alaska on building a gas line.. alaska rejected them without even consideration despite china being flush with cash and hungry for gas.. and set up another infamous boondoggle.. the transcanada-exxon rip off that we are still paying for and have yet to receive anything of value.. hundreds of millions of dollars on studies.. and buyouts.. i hope china says f you alaska.. you are nothing but ignorant people..
The parallels between the ANC and the Sicilian Mafia are glaring. The ANC has always been run by a few "families" who treat the state as 'turf' ; as just one big piggy bank for their self-enrichment. The government basically believ