In [11]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from transformers import RobertaTokenizerFast, RobertaTokenizer

In [12]:
# Read the tweet-sentiment-extraction training CSV (path is relative to the
# notebook's working directory) into `train`.
# NOTE(review): `train` is not referenced by any other cell visible in this section.
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')

In [15]:
# Load a RobertaTokenizer from a local directory; the relative path must
# resolve from the notebook's working directory.
tokenizer = RobertaTokenizer.from_pretrained('../../bert_models/roberta_base/')

In [16]:
# Probe: three leading spaces, a double space between the words, and
# surrounding double quotes.
tokenizer.tokenize("   \"hello  world\"")

['Ġ', 'Ġ', 'Ġ"', 'hello', 'Ġ', 'Ġworld', '"']

In [17]:
# Probe: three leading spaces and a double space between the words, no quotes.
tokenizer.tokenize("   hello  world")

['Ġ', 'Ġ', 'Ġhello', 'Ġ', 'Ġworld']

In [24]:
# Minimal probe: two leading spaces followed by a single character.
tokenizer.tokenize("  a")

['Ġ', 'Ġa']

In [26]:
# Encode the sample text with special tokens disabled (add_special_tokens=False)
# and keep the result in `inputs`.
inputs = tokenizer.encode_plus('   hello,  world', add_special_tokens=False)

In [27]:
# Display the encoding stored in `inputs`.
inputs

{'input_ids': [1437, 1437, 20760, 6, 1437, 232], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [28]:
# Break `text` into pieces: every delimiter character becomes its own piece,
# and each run of consecutive non-delimiter characters becomes one piece.
# `offsets` holds the index in `text` at which each piece starts.
text = "   hello,  world"
words, offsets = [], []
prev_punc = True  # start as if a delimiter came first, so the first character opens a piece
for idx, c in enumerate(text):
    is_delimiter = c in (' ', '.', ',', '!', '?', '(', ')', ';', ':', '-', '=', "/", "<", "`")
    if not (is_delimiter or prev_punc):
        # Still inside a run of non-delimiter characters: grow the current piece.
        words[-1] += c
        continue
    # Either a delimiter or the first character of a new run: open a new piece.
    words.append(c)
    offsets.append(idx)
    prev_punc = is_delimiter

print(words)
print(offsets)

[' ', ' ', ' ', 'hello', ',', ' ', ' ', 'world']
[0, 1, 2, 3, 8, 9, 10, 11]


In [30]:
# Build `tokens` from the pieces in `words`:
#   * a single-space piece is emitted directly as the "Ġ" token;
#   * any other piece is run through the tokenizer, with a leading space
#     prepended when the piece immediately before it was a space.
tokens = []
previous_piece = None  # no piece precedes the first one
for word in words:
    if word == ' ':
        tokens.append("Ġ")
    else:
        lead = ' ' if previous_piece == ' ' else ''
        tokens.extend(tokenizer.tokenize(lead + word))
    previous_piece = word
print(tokens)

['Ġ', 'Ġ', 'Ġ', 'Ġhello', ',', 'Ġ', 'Ġ', 'Ġworld']


In [14]:
def get_extra_space_count(x):
    """Return the running count of "extra" spaces at each character of ``x``.

    A space counts as extra when it directly follows another space, or when
    it appears before any non-space character has been seen (leading spaces).
    The first space after a non-space character is not counted.

    Args:
        x: The string to scan.

    Returns:
        A list with one integer per character of ``x``; entry ``i`` is the
        number of extra spaces found in ``x[:i + 1]``.
    """
    running_total = 0
    after_space = True  # treat the start of the string like a preceding space
    per_char_counts = []
    for ch in x:
        is_space = ch == ' '
        if is_space and after_space:
            running_total += 1
        per_char_counts.append(running_total)
        after_space = is_space
    return per_char_counts
            

In [15]:
# Check on a sample with three leading spaces and a double space between words.
print(get_extra_space_count('   hello  world'))

[1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4]
