In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from transformers import RobertaTokenizerFast

In [2]:
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)

    return (input_start, input_end)

In [3]:
# Read the training CSV into a DataFrame; the path is relative to the notebook's working directory.
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')

In [4]:
# Load the fast RoBERTa tokenizer from a local directory (relative path) rather than by model name.
tokenizer = RobertaTokenizerFast.from_pretrained('../../bert_models/roberta_base/')

In [13]:
# Probe how leading and repeated spaces are tokenized: in the output below each extra space is a standalone 'Ġ' token.
tokenizer.tokenize("   \"hello  world\"")

['Ġ', 'Ġ', 'Ġ"', 'hello', 'Ġ', 'Ġworld', '"']

In [18]:
# Encode text with leading and doubled spaces, requesting per-token character offsets and no special tokens.
inputs = tokenizer.encode_plus('   hello,  world', return_offsets_mapping=True, add_special_tokens=False)

In [19]:
# Display the encoding; in the output below the extra-space tokens (id 1437) get zero-length offsets such as (0, 0).
inputs

{'input_ids': [1437, 1437, 20760, 6, 1437, 232], 'attention_mask': [1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (2, 2), (3, 8), (8, 9), (10, 10), (11, 16)]}

In [14]:
def get_extra_space_count(x):
    """Return the running count of "extra" spaces at every position of ``x``.

    A space counts as extra when it is the first character of ``x`` or when it
    directly follows another space. Only the literal ``' '`` character is
    considered; other whitespace is treated like any ordinary character.

    Args:
        x: The text to scan, one character at a time.

    Returns:
        A list with one integer per character of ``x``: the number of extra
        spaces seen up to and including that character.
    """
    totals = []
    extra = 0
    # Seeding with a space makes leading spaces count as extra.
    previous = ' '
    for ch in x:
        if ch == ' ' and previous == ' ':
            extra += 1
        totals.append(extra)
        previous = ch
    return totals
            

In [15]:
# Sanity check: the three leading spaces and the second space of the doubled pair each bump the count.
print(get_extra_space_count('   hello  world'))

[1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4]
