In [5]:
import pandas as pd
from transformers import RobertaTokenizer

In [3]:
# Read the competition training CSV into a DataFrame.
train_csv_path = '../input/tweet-sentiment-extraction/train.csv'
train = pd.read_csv(train_csv_path)

In [6]:
# Load the RoBERTa tokenizer files from a local model directory.
roberta_dir = '../../bert_models/roberta_large/'
tokenizer = RobertaTokenizer.from_pretrained(roberta_dir)

In [7]:
# (rows, columns) of the training frame before any rows are dropped.
train.shape

(27486, 4)

In [8]:
# Drop, in place, every row where either 'text' or 'selected_text' is missing;
# the later cells call string methods on both columns.
train.dropna(subset=['text','selected_text'], how='any', inplace=True)

In [9]:
# (rows, columns) again, to see how many rows the dropna step removed.
train.shape

(27485, 4)

In [10]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,a3d0a7d5ad,Spent the entire morning in a meeting w/ a ven...,my boss was not happy w/ them. Lots of fun.,neutral
1,251b6a6766,Oh! Good idea about putting them on ice cream,Good,positive
2,c9e8d1ef1c,says good (or should i say bad?) afternoon! h...,says good (or should i say bad?) afternoon!,neutral
3,f14f087215,i dont think you can vote anymore! i tried,i dont think you can vote anymore!,negative
4,bf7473b12d,haha better drunken tweeting you mean?,better,positive


In [11]:
# Remove leading/trailing whitespace from every tweet so that character
# offsets computed later are relative to the trimmed text.
train['text'] = [raw_text.strip() for raw_text in train['text']]

In [12]:
# Character span [start_pos, end_pos) of selected_text inside text.
# str.find returns -1 when the substring is absent (possible here because
# 'text' has been stripped while 'selected_text' has not); such a value would
# silently turn into a bogus span downstream, so fail loudly instead.
start_positions = [
    text.find(selected)
    for text, selected in zip(train['text'], train['selected_text'])
]
n_missing = sum(1 for pos in start_positions if pos < 0)
if n_missing:
    raise ValueError(
        '{} selected_text value(s) do not occur in their text'.format(n_missing)
    )
train['start_pos'] = start_positions
# end_pos is exclusive: one past the last selected character.
train['end_pos'] = [
    pos + len(selected)
    for pos, selected in zip(start_positions, train['selected_text'])
]

In [13]:
# Summary statistics of the character start offsets; a minimum of -1 would
# mean str.find failed to locate some selected_text inside its text.
train['start_pos'].describe()

count    27485.000000
mean        15.604839
std         26.179125
min          0.000000
25%          0.000000
50%          0.000000
75%         23.000000
max        134.000000
Name: start_pos, dtype: float64

In [14]:
def _word_offsets(text):
    """Return [(word, start, end), ...] for each whitespace-separated word.

    Offsets are positions in ``text`` itself (``end`` exclusive), so runs of
    several whitespace characters between words are accounted for. The words
    and their order are exactly those of ``text.split()``.
    """
    spans = []
    cursor = 0
    for word in text.split():
        # Only whitespace lies between cursor and the next word, and a word
        # contains no whitespace, so the first hit is the word's real start.
        begin = text.find(word, cursor)
        cursor = begin + len(word)
        spans.append((word, begin, cursor))
    return spans


# For every row build a record:
#   (tokens, first_label, last_label, invert_map, tokens[first], tokens[last])
# where tokens are the tokenizer pieces of each whitespace-separated word,
# invert_map[i] is the index (into text.split()) of the word token i came
# from, and first/last are the indices of the first/last token whose word
# overlaps the character span [sp, ep) of selected_text.
data = []
rows = zip(train['text'].tolist(), train['start_pos'].tolist(), train['end_pos'].tolist())
for row_no, (text, sp, ep) in enumerate(rows):
    if sp < 0:
        raise ValueError(
            'row {}: start_pos is negative (selected_text not found in text)'.format(row_no)
        )
    tokens, labels, invert_map = [], [], []
    for idx, (w, w_start, w_end) in enumerate(_word_offsets(text)):
        # A word is selected when its character range shares at least one
        # character with [sp, ep); merely touching the boundary is not enough.
        selected = w_start < ep and w_end > sp
        for token in tokenizer.tokenize(w):
            tokens.append(token)
            invert_map.append(idx)
            if selected:
                labels.append(len(tokens) - 1)
    if not labels:
        raise ValueError(
            'row {}: no token overlaps the selected span [{}, {})'.format(row_no, sp, ep)
        )
    # labels is appended in increasing order, so its ends are the min and max.
    first, last = labels[0], labels[-1]
    data.append((tokens, first, last, invert_map, tokens[first], tokens[last]))

In [15]:
# Spot check of invert_map (field 3 of a record): map token 25 of the first
# example back to the whitespace-split word it was produced from.
train.loc[0, 'text'].split()[data[0][3][25]]

'them.'

In [16]:
# Inspect the first five alignment records.
data[:5]

[(['Sp',
   'ent',
   'the',
   'ent',
   'ire',
   'morning',
   'in',
   'a',
   'me',
   'eting',
   'w',
   '/',
   'a',
   'v',
   'endor',
   ',',
   'and',
   'my',
   'boss',
   'was',
   'not',
   'happy',
   'w',
   '/',
   'them',
   '.',
   'Lots',
   'of',
   'fun',
   '.',
   'I',
   'had',
   'other',
   'pl',
   'ans',
   'for',
   'my',
   'morning'],
  17,
  29,
  [0,
   0,
   1,
   2,
   2,
   3,
   4,
   5,
   6,
   6,
   7,
   7,
   8,
   9,
   9,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   16,
   17,
   17,
   18,
   19,
   20,
   20,
   21,
   22,
   23,
   24,
   24,
   25,
   26,
   27],
  'my',
  '.'),
 (['Oh',
   '!',
   'Good',
   'ide',
   'a',
   'about',
   'put',
   'ting',
   'them',
   'on',
   'ice',
   'cream'],
  2,
  2,
  [0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 7, 8],
  'Good',
  'Good'),
 (['s',
   'ays',
   'good',
   '(',
   'or',
   'should',
   'i',
   'say',
   'bad',
   '?)',
   'after',
   'noon',
   '!',
   'http',
   '://',
   'pl',


In [17]:
# Transpose the per-row records into one tuple per field. The last two fields
# (the token strings at the start/end label) are not used afterwards; they get
# real names rather than `_`, because assigning to `_` in IPython shadows the
# built-in "last output" variable for the rest of the session.
tokens, start, end, invert_map, start_token, end_token = zip(*data)

In [18]:
# Number of per-row token lists; should equal the number of rows in train.
len(tokens)

27485

In [19]:
# Attach each row's token list as a new column; values are matched to rows
# by position, so `tokens` must be in the same order as train.
train['tokens'] = tokens

In [20]:
# Attach the token-level start/end label indices and the token-to-word map
# as columns, one per field, in the same order as before.
for column_name, column_values in (
    ('start', start),
    ('end', end),
    ('invert_map', invert_map),
):
    train[column_name] = column_values

In [21]:
# Integer class id for each sentiment string.
senti2label = dict(positive=2, negative=0, neutral=1)
# Direct dict indexing on purpose: an unexpected sentiment value raises
# KeyError instead of producing a missing label.
train['senti_label'] = [senti2label[sentiment] for sentiment in train['sentiment']]

In [22]:
# Renumber rows 0..n-1 and discard the old index, which has gaps after the
# earlier row drop.
train = train.reset_index(drop=True)

In [23]:
# Preview the frame with the newly added offset, token and label columns.
train.head()

Unnamed: 0,textID,text,selected_text,sentiment,start_pos,end_pos,tokens,start,end,invert_map,senti_label
0,a3d0a7d5ad,Spent the entire morning in a meeting w/ a ven...,my boss was not happy w/ them. Lots of fun.,neutral,55,98,"[Sp, ent, the, ent, ire, morning, in, a, me, e...",17,29,"[0, 0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 7, 8, 9, 9, ...",1
1,251b6a6766,Oh! Good idea about putting them on ice cream,Good,positive,4,8,"[Oh, !, Good, ide, a, about, put, ting, them, ...",2,2,"[0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 7, 8]",2
2,c9e8d1ef1c,says good (or should i say bad?) afternoon! h...,says good (or should i say bad?) afternoon!,neutral,0,43,"[s, ays, good, (, or, should, i, say, bad, ?),...",0,12,"[0, 0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 7, 7, 8, 8, ...",1
3,f14f087215,i dont think you can vote anymore! i tried,i dont think you can vote anymore!,negative,0,34,"[i, d, ont, think, you, can, vote, any, more, ...",0,9,"[0, 1, 1, 2, 3, 4, 5, 6, 6, 6, 7, 8, 8]",0
4,bf7473b12d,haha better drunken tweeting you mean?,better,positive,5,11,"[h, aha, better, dr, unk, en, t, weet, ing, yo...",2,2,"[0, 0, 1, 2, 2, 2, 3, 3, 3, 4, 5, 5]",2


In [24]:
from sklearn.model_selection import StratifiedKFold

In [25]:
# Five-way stratified splitter; no shuffling is requested, so the folds are
# determined by row order alone.
n_folds = 5
kf = StratifiedKFold(n_splits=n_folds)

In [26]:
# Tag every row with the fold in which it is used for validation, stratified
# on the sentiment label.
train['fold'] = 0
fold_col = train.columns.get_loc('fold')
for fold, (_train_idx, valid_idx) in enumerate(kf.split(train, train['senti_label'])):
    # kf.split yields *positional* row indices, so write through .iloc; a
    # label-based .loc is only correct while the index happens to be 0..n-1.
    train.iloc[valid_idx, fold_col] = fold

In [27]:
# Rows per validation fold; the sizes should be nearly equal.
train['fold'].value_counts()

0    5499
1    5498
4    5496
3    5496
2    5496
Name: fold, dtype: int64

In [29]:
# Persist the prepared frame (tokens, labels, folds) for later use.
prepared_path = '../input/train_roberta.pkl'
train.to_pickle(prepared_path)

In [32]:
# Check how the tokenizer splits a sentiment word.
tokenizer.tokenize('negative')

['negative']

In [35]:
# Encode a (sentiment, text) pair padded to 20 ids to inspect the layout of
# input_ids / attention_mask for a two-segment input.
# NOTE(review): `pad_to_max_length` is deprecated in newer transformers
# releases in favour of `padding='max_length'` — confirm against the
# installed version before changing.
tokenizer.encode_plus('positive', 'miss you my dear', max_length=20, pad_to_max_length=True)

{'input_ids': [0,
  1313,
  2,
  2,
  2649,
  47,
  127,
  14880,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}