In [46]:
from collections import Counter
from spacy.tokens import DocBin
import spacy
import csv
import ast

In [50]:
def make_spans(lst):
    """
    Collapse a sequence of indexes into contiguous runs.

    Each run is reported as a ``(start, end)`` tuple whose end is exclusive,
    e.g. ``[1, 2, 3, 4, 10, 11, 12]`` becomes ``[(1, 5), (10, 13)]``.
    A new run is opened whenever an index is more than one greater than the
    index that precedes it. An empty input gives an empty list.
    """
    runs = []
    for idx in lst:
        if runs and idx <= runs[-1][1]:
            # idx is at most one past the previous index: stretch the open run
            # so that its exclusive end sits just after idx.
            runs[-1][1] = idx + 1
        else:
            # Very first index, or a gap after the previous one: open a new run.
            runs.append([idx, idx + 1])
    # Runs are built as mutable pairs; hand them back as tuples.
    return [tuple(run) for run in runs]

def read_data(fname: str, encoding: str = 'utf-8'):
    """
    Lazily read ``(indexes, text)`` pairs from a CSV file.

    The first row is treated as a header and discarded. For every following
    non-empty row, column 0 is parsed with ``ast.literal_eval`` and column 1
    is returned unchanged.

    Args:
        fname: Path of the CSV file to read.
        encoding: Text encoding used to decode the file. It is set explicitly
            so decoding does not depend on the platform's locale; the
            indexes in column 0 are used elsewhere in this file as character
            offsets into the text, so a wrong decode would shift them.
            NOTE(review): assumes the CSVs are UTF-8 -- pass another
            encoding if they are not.

    Yields:
        ``(lst, text)`` tuples, one per data row.

    Raises:
        ValueError / SyntaxError: if column 0 is not a valid Python literal.
        IndexError: if a non-empty row has fewer than two columns.
    """
    with open(fname, newline='', encoding=encoding) as csvfile:
        reader = csv.reader(csvfile)
        # Skip the headers. The default keeps an empty file from raising
        # StopIteration inside this generator (which Python turns into a
        # RuntimeError).
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader returns [] for a blank line; nothing to parse.
                continue
            lst = ast.literal_eval(row[0])
            text = row[1]
            yield lst, text

def create_docbin(fname: str, basename: str, nlp, span_label='sc'):
    """
    Convert a CSV of character indexes + texts into a serialized ``DocBin``.

    Every row produced by ``read_data(fname)`` is processed with ``nlp``; its
    index list is grouped into contiguous ``(start, end)`` character ranges by
    ``make_spans`` and each range is turned into a spaCy ``Span``. Those spans
    are set as the document's entities and also stored under
    ``doc.spans[span_label]``. The collected documents are written to
    ``../data/{basename}.spacy``.

    Args:
        fname: Path of the input CSV file.
        basename: File name (without extension) of the output ``.spacy`` file.
        nlp: Callable spaCy pipeline used to turn each text into a ``Doc``.
        span_label: Used both as the label attached to every span and as the
            key under which the spans are stored in ``doc.spans``.

    Returns:
        None. The result is the file written to disk.
    """
    doc_bin = DocBin()
    for char_indexes, text in read_data(fname):
        char_ranges = make_spans(char_indexes)
        doc = nlp(text)
        span_lst = []
        for start, end in char_ranges:
            # Label with the caller-supplied span_label rather than a
            # hard-coded 'sc', so the parameter actually controls the label.
            span = doc.char_span(start, end, label=span_label)
            # char_span gives None when the character range cannot be mapped
            # onto the doc's tokens; such ranges are dropped silently.
            if span is not None:
                span_lst.append(span)
        # span_lst is now a list of spaCy 'Span' objects.
        # Set the Spans as document entities.
        doc.set_ents(span_lst)
        # Mirror the entities into the span group keyed by span_label.
        doc.spans[span_label] = list(doc.ents)
        doc_bin.add(doc)
    doc_bin.to_disk(f'../data/{basename}.spacy')

In [51]:
# Load the pipeline once and reuse it for all three dataset splits.
nlp = spacy.load("en_core_web_sm")

# Each call reads one CSV and writes ../data/<basename>.spacy.
create_docbin(fname='../data/tsd_train.csv', basename='train', nlp=nlp)
create_docbin(fname='../data/tsd_trial.csv', basename='dev', nlp=nlp)
create_docbin(fname='../data/tsd_test.csv', basename='eval', nlp=nlp)

In [52]:
# Sanity check: load the 'eval' DocBin back from the path it was written to.
docbin = DocBin().from_disk('../data/eval.spacy')

In [53]:
# Rebuild all docs from the bin using the pipeline's vocab and keep the first
# one for inspection.
doc = list(docbin.get_docs(nlp.vocab))[0]

In [54]:
# Display what is stored in doc.spans for the first document.
doc.spans

{'sc': [ABNORMAL, sexist rubbish]}