In [6]:
import pandas as pd
from transformers import RobertaTokenizer
import numpy as np

In [7]:
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)

    return (input_start, input_end)

In [8]:
# Load the training CSV into a DataFrame; every later cell works on `train`.
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')

In [9]:
# Load the RoBERTa tokenizer files from a local directory rather than by
# model name. NOTE(review): the path suggests a roberta-large checkpoint —
# confirm the directory contents match the model used for training.
tokenizer = RobertaTokenizer.from_pretrained('../../bert_models/roberta_large/')

In [10]:
# (rows, columns) of the frame as loaded, before missing rows are dropped.
train.shape

(27481, 4)

In [11]:
# Drop rows where `text` or `selected_text` is missing. This mutates `train`
# in place and does not renumber the index, so the index keeps a gap until
# the later reset_index cell runs.
train.dropna(subset=['text','selected_text'], how='any', inplace=True)

In [12]:
# (rows, columns) after dropping rows with missing text.
train.shape

(27480, 4)

In [13]:
# Preview the first rows of the cleaned frame.
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [14]:
# Trim both ends and collapse every internal run of whitespace to one space,
# for the tweet and for its annotated span alike. Later cells rely on this:
# they recover character offsets from `text.split()` by assuming exactly one
# space between consecutive words.
for _col in ('text', 'selected_text'):
    train[_col] = train[_col].map(lambda raw: ' '.join(raw.strip().split()))

In [15]:
# Character span [start_pos, end_pos) of `selected_text` inside `text`.
# Iterating the two columns directly avoids the per-row Series construction
# of DataFrame.apply(axis=1).
train['start_pos'] = [
    text.find(selected)
    for text, selected in zip(train['text'], train['selected_text'])
]
# str.find returns -1 when the substring is absent; a negative start would
# silently produce a wrong span in the cells below, so fail here instead.
assert (train['start_pos'] >= 0).all(), \
    'selected_text is not a substring of text for at least one row'
# end_pos is exclusive: start offset plus the length of the selected text.
train['end_pos'] = train['start_pos'] + train['selected_text'].str.len()

In [16]:
# Summary statistics of the start offsets. str.find yields -1 on a miss, so a
# negative `min` here would mean some selected_text was not found in its text.
train['start_pos'].describe()

count    27480.000000
mean        15.510153
std         26.048759
min          0.000000
25%          0.000000
50%          0.000000
75%         23.000000
max        133.000000
Name: start_pos, dtype: float64

In [52]:
# Build one token-level example per row of `train`.
#
# For every tweet: tokenize it word by word, mark the tokens of every word
# that overlaps the character span [sp, ep) of `selected_text`, then let
# _improve_answer_span tighten that token span when a sub-span matches the
# tokenized answer exactly.
#
# Each entry appended to `data` is the tuple
#   (tokens, start_token, end_token, invert_map,
#    first_labelled_token, last_labelled_token, flag)
# where invert_map[i] is the index (in text.split()) of the word token i came
# from, and flag is True when the whole words covering the span are not equal
# to `selected_text`.
data = []
improve_count = 0  # rows whose span was changed by _improve_answer_span
for text, sp, ep, st in zip(train['text'].tolist(), train['start_pos'].tolist(), train['end_pos'].tolist(),
                           train['selected_text'].tolist()):
    split_text = text.split()
    tokens, labels, invert_map = [], [], []

    # Character mask over `text`: True inside the selected span.
    in_span = np.zeros(len(text), dtype=bool)
    in_span[sp:ep] = True

    cur_length = 0  # total characters of the words seen so far (spaces excluded)
    for idx, w in enumerate(split_text):
        # `text` was normalised to single spaces in an earlier cell, so word
        # `idx` starts after the previous words plus `idx` separating spaces.
        word_start = cur_length + idx
        started = in_span[word_start:word_start + len(w)].any()
        # The leading space makes the tokenizer treat each word as word-initial.
        for token in tokenizer.tokenize(' ' + w):
            tokens.append(token)
            invert_map.append(idx)
            if started:
                labels.append(len(tokens) - 1)
        cur_length += len(w)

    if not labels:
        # Same exception type min()/max() would raise on an empty list, but
        # with enough context to locate the offending row.
        raise ValueError(
            f'no token overlaps the selected span [{sp}, {ep}) for text {text!r} '
            f'and selected_text {st!r}'
        )

    # Token indices are appended in increasing order, so the ends of the list
    # are its minimum and maximum.
    start_token_idx = labels[0]
    end_token_idx = labels[-1]
    start_word_idx = invert_map[start_token_idx]
    end_word_idx = invert_map[end_token_idx] + 1

    # Whole words covering the labelled tokens; they must contain the answer.
    covered_words = ' '.join(split_text[start_word_idx:end_word_idx])
    assert st in covered_words, (covered_words, st)
    flag = covered_words != st

    start_token_idx1, end_token_idx2 = _improve_answer_span(tokens, start_token_idx, end_token_idx,
                                                         tokenizer, ' ' + st)
    if start_token_idx1 != start_token_idx or end_token_idx2 != end_token_idx:
        improve_count += 1
    data.append((tokens, start_token_idx1, end_token_idx2, invert_map,
                 tokens[start_token_idx], tokens[end_token_idx], flag))

In [53]:
# Number of rows whose token span was changed by _improve_answer_span.
improve_count

342

In [54]:
# Transpose the list of per-row tuples into one tuple per field; the two
# token strings stored at positions 4 and 5 are discarded.
# Note: this rebinds `tokens` and `invert_map`, which the example-building
# loop used as per-row lists, to tuples holding one entry per row.
tokens, start, end, invert_map, _, _, not_match = zip(*data)

In [55]:
# Sanity check: there should be one token list per row of `train`.
len(tokens)

27480

In [56]:
# Attach the per-row token lists as a new column (one list per row).
train['tokens'] = tokens

In [57]:
# Inclusive token indices of the answer span, as returned by
# _improve_answer_span.
train['start'] = start
train['end'] = end
# For each token, the index of the whitespace-separated word it came from.
train['invert_map'] = invert_map
# True when the whole words covering the span differ from selected_text.
train['not_match']=not_match

In [58]:
# Integer class id for each sentiment string; a value missing from the table
# raises KeyError instead of being mapped silently.
senti2label = dict(positive=2, negative=0, neutral=1)
train['senti_label'] = train['sentiment'].map(senti2label.__getitem__)

In [59]:
# Fraction of rows where the whole words covering the span are not identical
# to selected_text (the words contain the annotation but extend beyond it).
train['not_match'].mean()

0.10673216885007278

In [60]:
# Renumber the index 0..n-1 after the earlier dropna. The fold-assignment
# cell passes positional indices from kf.split to the label-based train.loc,
# which only lines up when labels and positions coincide.
train.reset_index(drop=True, inplace=True)

In [61]:
# train['sentiment'] = train['sentiment'].apply(lambda x: tokenizer.tokenize(' '+x))

In [62]:
# Preview the frame with the derived offset, token and label columns.
train.head()

Unnamed: 0,textID,text,selected_text,sentiment,start_pos,end_pos,tokens,start,end,invert_map,not_match,senti_label
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,0,35,"[ĠI, `, d, Ġhave, Ġresponded, ,, Ġif, ĠI, Ġwer...",0,9,"[0, 0, 0, 1, 2, 2, 3, 4, 5, 6]",False,1
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,0,8,"[ĠSo, oo, ĠS, AD, ĠI, Ġwill, Ġmiss, Ġyou, Ġher...",0,3,"[0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9]",False,0
2,088c60f138,my boss is bullying me...,bullying me,negative,11,22,"[Ġmy, Ġboss, Ġis, Ġbullying, Ġme, ...]",3,4,"[0, 1, 2, 3, 4, 4]",True,0
3,9642c003ef,what interview! leave me alone,leave me alone,negative,16,30,"[Ġwhat, Ġinterview, !, Ġleave, Ġme, Ġalone]",3,5,"[0, 1, 1, 2, 3, 4]",False,0
4,358bd9e861,"Sons of ****, why couldn`t they put them on th...","Sons of ****,",negative,0,13,"[ĠSons, Ġof, Ġ****, ,, Ġwhy, Ġcouldn, `, t, Ġt...",0,3,"[0, 1, 2, 2, 3, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11...",False,0


In [63]:
from sklearn.model_selection import StratifiedKFold

In [64]:
# 5-fold stratified splitter. Neither shuffle nor random_state is passed, so
# the library defaults apply — TODO confirm that unshuffled folds are
# intended for this dataset.
kf = StratifiedKFold(n_splits=5)

In [65]:
# Tag every row with the fold in which it is used for validation,
# stratifying on the integer sentiment label.
train['fold'] = 0
for fold, (train_idx, valid_idx) in enumerate(kf.split(train, train['senti_label'])):
    # valid_idx holds positional indices; using them with .loc is only
    # correct because the index was reset to 0..n-1 in an earlier cell.
    train.loc[valid_idx, 'fold'] = fold

In [66]:
# Number of validation rows in each fold; sizes should be nearly equal.
train['fold'].value_counts()

0    5498
1    5497
4    5495
3    5495
2    5495
Name: fold, dtype: int64

In [67]:
# Persist the processed frame, including its list-valued columns, as a pickle.
train.to_pickle('../input/train_roberta_v3.pkl')