In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from transformers import RobertaTokenizerFast, RobertaTokenizer

In [2]:
# Read train.csv from the tweet-sentiment-extraction input directory into a DataFrame.
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')

In [3]:
# Load the RoBERTa tokenizer from a local model directory
# (the path is relative to the notebook's working directory).
tokenizer = RobertaTokenizer.from_pretrained('../../bert_models/roberta_base/')

In [21]:
# Check how the tokenizer splits text that contains two literal `</s>` markers back to back.
tokenizer.tokenize("neutral</s></s>happy")

['neutral', '</s>', '</s>', 'happy']

In [8]:
# Tokenize a string with leading and repeated spaces and keep the result in `token`;
# the encode_plus cell further down passes `token` as its second segment.
token = tokenizer.tokenize("   hello  world")

In [25]:
# Check how a single leading space shows up in the resulting token.
tokenizer.tokenize(" sadness")

['Ġsadness']

In [11]:
# Display the current contents of `token`.
token

['Ġ', 'Ġ', 'Ġhello', 'Ġ', 'Ġworld']

In [27]:
# Candidate sentiment labels to try (the last entry is the literal string 'None').
sentiment2= ['neutral', 'sadness', 'worry', 'hate', 'happiness', 'empty',
       'surprise', 'love', 'fun', 'relief', 'enthusiasm', 'boredom',
       'anger','None']
# For each label, encode " positive</s></s> <label>" as the first segment and
# `token` as the second segment, then print the ids so the positions of the
# special-token ids can be compared across labels.
# NOTE(review): relies on `token` still being the token list produced by the
# earlier tokenize cell -- confirm before re-running out of order.
for s in sentiment2:
    inputs = tokenizer.encode_plus(' positive'+'</s></s> '+s, token, add_special_tokens=True)
    print(inputs['input_ids'])

[0, 1313, 2, 2, 7974, 2, 2, 1437, 1437, 20760, 1437, 232, 2]
[0, 1313, 2, 2, 17437, 2, 2, 1437, 1437, 20760, 1437, 232, 2]
[0, 1313, 2, 2, 4022, 2, 2, 1437, 1437, 20760, 1437, 232, 2]
[0, 1313, 2, 2, 4157, 2, 2, 1437, 1437, 20760, 1437, 232, 2]
[0, 1313, 2, 2, 11098, 2, 2, 1437, 1437, 20760, 1437, 232, 2]
[0, 1313, 2, 2, 5802, 2, 2, 1437, 1437, 20760, 1437, 232, 2]
[0, 1313, 2, 2, 2755, 2, 2, 1437, 1437, 20760, 1437, 232, 2]
[0, 1313, 2, 2, 657, 2, 2, 1437, 1437, 20760, 1437, 232, 2]
[0, 1313, 2, 2, 1531, 2, 2, 1437, 1437, 20760, 1437, 232, 2]
[0, 1313, 2, 2, 3500, 2, 2, 1437, 1437, 20760, 1437, 232, 2]
[0, 1313, 2, 2, 11240, 2, 2, 1437, 1437, 20760, 1437, 232, 2]
[0, 1313, 2, 2, 40326, 2, 2, 1437, 1437, 20760, 1437, 232, 2]
[0, 1313, 2, 2, 6378, 2, 2, 1437, 1437, 20760, 1437, 232, 2]
[0, 1313, 2, 2, 9291, 2, 2, 1437, 1437, 20760, 1437, 232, 2]


In [20]:
# Display the most recently computed `inputs` (whichever encode_plus call ran last in the kernel).
inputs

{'input_ids': [0, 7974, 9291, 2, 2, 1437, 1437, 20760, 1437, 232, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [28]:
# Split `text` into pieces: every separator character becomes its own piece,
# and each run of non-separator characters becomes one piece. `offsets` holds
# the index in `text` at which each piece starts.
text = "   hello,  world"
separators = (' ', '.', ',', '!', '?', '(', ')', ';', ':', '-', '=', "/", "<", "`")

words = []
offsets = []
after_separator = True  # True at the start so the first character always opens a piece

for pos, ch in enumerate(text):
    is_separator = ch in separators
    if is_separator or after_separator:
        # Start a new piece at this position.
        words.append(ch)
        offsets.append(pos)
    else:
        # Still inside a run of ordinary characters: grow the current piece.
        words[-1] += ch
    after_separator = is_separator

print(words)
print(offsets)

[' ', ' ', ' ', 'hello', ',', ' ', ' ', 'world']
[0, 1, 2, 3, 8, 9, 10, 11]


In [30]:
# Rebuild a token list from the pre-split `words` pieces so that tokens can be
# related back to the pieces (and therefore to their character offsets).
# NOTE(review): every single-space piece becomes its own "Ġ" AND the following
# word is tokenized with a leading space, so a run of spaces yields one more
# "Ġ"-marked token than tokenizing the raw string directly -- confirm intended.
tokens = []
for word_idx, word in enumerate(words):
    # A piece that directly follows a space piece is tokenized with a leading
    # space, so the tokenizer emits its "Ġ"-prefixed form.
    follows_space = word_idx > 0 and words[word_idx - 1] == ' '
    prefix = ' ' if follows_space else ''
    if word == ' ':
        tokens.append("Ġ")
    else:
        # extend() instead of an inner `for token in ...` loop: a loop variable
        # named `token` would overwrite the notebook-level `token` list that
        # the encode_plus cell passes as its second segment.
        tokens.extend(tokenizer.tokenize(prefix + word))
print(tokens)

['Ġ', 'Ġ', 'Ġ', 'Ġhello', ',', 'Ġ', 'Ġ', 'Ġworld']


In [14]:
def get_extra_space_count(x):
    """Return the running count of "extra" spaces for every character of ``x``.

    A space counts as extra when it directly follows another space or sits at
    the very start of the input; the first space after a non-space character
    is not counted. The returned list has one entry per character, holding the
    number of extra spaces seen up to and including that character.
    """
    running_totals = []
    extra = 0
    previous = ' '  # treat the start of the input as if it followed a space
    for ch in x:
        if ch == ' ' and previous == ' ':
            extra += 1
        running_totals.append(extra)
        previous = ch
    return running_totals
            

In [15]:
# Show the per-character extra-space counts for a string with leading and doubled spaces.
print(get_extra_space_count('   hello  world'))

[1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4]
