In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from transformers import RobertaTokenizer
from transformers import RobertaTokenizerFast

In [33]:
from torch import nn
import torch

In [42]:
# Sample sentence passed to prepare() in the demo cell below.
a = "haha..xx!!? Oh, I'm good!"

In [61]:
def prepare(text):
    """Split ``text`` into word and punctuation pieces with character offsets.

    The text is split on whitespace. Inside each word, every '.', ',', '!'
    or '?' becomes its own piece, and the characters between them stay
    together as one piece.

    Args:
        text: The string to split.

    Returns:
        A tuple ``(pieces, first_char, invert_map)`` of parallel lists:
        ``pieces`` holds the piece strings, ``first_char[i]`` is True when
        piece ``i`` is the first piece of its whitespace-delimited word, and
        ``invert_map[i]`` is the index in ``text`` where piece ``i`` starts.
    """
    punctuation = {'.', ',', '!', '?'}
    retval, first_char, invert_map = [], [], []
    cursor = 0
    for w in text.split():
        # Look the word up in the original string rather than assuming a
        # single space between words; offsets then stay correct when the
        # text has leading whitespace or runs of spaces/tabs.
        current_pos = text.index(w, cursor)
        cursor = current_pos + len(w)

        word_ret = [""]
        word_invert = [current_pos]
        for p, c in enumerate(w):
            if c in punctuation:
                if word_ret[-1] == "":
                    # Reuse the still-empty slot for this punctuation mark.
                    word_ret[-1] = c
                    word_invert[-1] = current_pos + p
                else:
                    word_ret.append(c)
                    word_invert.append(current_pos + p)
                # Open a fresh slot for whatever follows the mark.
                word_ret.append("")
                word_invert.append(current_pos + p + 1)
            else:
                word_ret[-1] += c
        if word_ret[-1] == "":
            # The word ended on punctuation; discard the unused slot.
            word_ret.pop()
            word_invert.pop()

        retval.extend(word_ret)
        first_char.extend(i == 0 for i in range(len(word_ret)))
        invert_map.extend(word_invert)
    return retval, first_char, invert_map

In [62]:
# Split the sample sentence, then show each of the three parallel lists on its own line.
words, first_char, invert_map = prepare(a)
for _demo_list in (words, first_char, invert_map):
    print(_demo_list)

['haha', '.', '.', 'xx', '!', '!', '?', 'Oh', ',', "I'm", 'good', '!']
[True, False, False, False, False, False, False, True, False, True, True, False]
[0, 4, 5, 6, 8, 9, 10, 12, 14, 16, 20, 24]


In [56]:
def decode(tokens, first_char, start, end):
    """Rebuild the text covered by pieces ``start``..``end`` (both inclusive).

    A piece flagged in ``first_char`` is preceded by a space; any other piece
    is glued directly to the one before it. Whitespace in the result is then
    collapsed to single spaces and trimmed at both ends.
    """
    pieces = [
        " " + tokens[i] if first_char[i] else tokens[i]
        for i in range(start, end + 1)
    ]
    return " ".join("".join(pieces).split())

In [58]:
# Rebuild the text spanned by pieces 2 through 9 (inclusive) of the sample sentence.
print(decode(words, first_char, 2,9))

.xx!!? Oh, I'm


In [2]:
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)

    return (input_start, input_end)

In [3]:
# Load train.csv from the tweet-sentiment-extraction input folder (path is relative to the notebook).
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')

In [4]:
# Load the RoBERTa tokenizer from a local directory; the relative path must exist on this machine.
tokenizer = RobertaTokenizer.from_pretrained('../../bert_models/roberta_base/')

In [5]:
# (rows, columns) of the frame as loaded.
train.shape

(27481, 4)

In [6]:
# Drop rows where 'text' or 'selected_text' is missing. This mutates train in place.
train.dropna(subset=['text','selected_text'], how='any', inplace=True)

In [7]:
# (rows, columns) after removing the incomplete rows.
train.shape

(27480, 4)

In [8]:
# Lower-case both text columns and collapse every whitespace run to a single space
# (split() with no arguments already discards leading and trailing whitespace).
for _text_col in ('text', 'selected_text'):
    train[_text_col] = train[_text_col].apply(lambda raw: ' '.join(raw.lower().split()))

In [9]:
# Preview the first rows after normalisation.
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"i`d have responded, if i were going","i`d have responded, if i were going",neutral
1,549e992a42,sooo sad i will miss you here in san diego!!!,sooo sad,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"sons of ****, why couldn`t they put them on th...","sons of ****,",negative


In [10]:
def clean(x):
    """Drop a stray single-character first token from ``x``.

    The first whitespace-delimited token is removed when the text has more
    than one token and that first token is a single character that is not
    'i', 'a' or 'u' (case-insensitive) and not a digit. Each change is
    printed as ``original | cleaned``.

    Args:
        x: The text to clean.

    Returns:
        The remaining tokens joined by single spaces when the first token was
        dropped; otherwise ``x`` unchanged. Empty or whitespace-only input is
        returned unchanged.
    """
    sp_x = x.split()
    # Check the token count before indexing, so empty or whitespace-only
    # input is returned as-is instead of raising IndexError on sp_x[0].
    if len(sp_x) < 2:
        return x
    head = sp_x[0]
    if len(head) == 1 and head.lower() not in ('i', 'a', 'u') and not head.isdigit():
        trimmed = ' '.join(sp_x[1:])
        print(x, '|', trimmed)
        return trimmed
    return x

In [11]:
# Run clean() over every selected_text; clean() prints each row it changes.
# To skip the cleaning step, assign train['selected_text'] here instead.
train['c_selected_text'] = train['selected_text'].apply(clean)

d i`m not thrilled at all with mine. | i`m not thrilled at all with mine.
s awesome | awesome
, sorry guys | sorry guys
y adore | adore
e nice | nice
p sounds like fun | sounds like fun
e fun | fun
d thank you! | thank you!
g what is this powerblog challenge you keep talking about? i`m a newbie followe | what is this powerblog challenge you keep talking about? i`m a newbie followe
g harmed | harmed
, he was soooo friendly. | he was soooo friendly.
m worried | worried
k will check it out... | will check it out...
s i just don`t entertain him :/ | i just don`t entertain him :/
- rest if you need it. | rest if you need it.
! i`m scared of him | i`m scared of him
d there wasn`t a chance | there wasn`t a chance
. tomorrow will be tough! | tomorrow will be tough!
. sorry | sorry
@ barbs trying to figure out y the dsl aint connecting. i need my google. going home to my computer if it don`t start working soon. smh | barbs trying to figure out y the dsl aint connecting. i need my google. going 

o talented | talented
o thanks | thanks
. haha | haha
m smile | smile
, i was disappointed | i was disappointed
. he`s mad at us. | he`s mad at us.
g can`t wait for my blankets | can`t wait for my blankets
r that makes me sad and i feel like my life is dull and uninteresting. | that makes me sad and i feel like my life is dull and uninteresting.
& hopefully | hopefully
. i miss girlfriends!! | i miss girlfriends!!
y bad arthritus | bad arthritus
! i approve whole heartedly. | i approve whole heartedly.
. hope | hope
t cancelled | cancelled
s fabulous! | fabulous!
h i am eternally pleased | i am eternally pleased
d cant find the song lol | cant find the song lol
o help | help
e worst | worst
! come pick up your gift ;) | come pick up your gift ;)
, love to you | love to you
y good | good
? already finished chatting . haha .. ? | already finished chatting . haha .. ?
? a good distraction | a good distraction
, you will love tonight`s dc | you will love tonight`s dc
s great | great
y can`

In [12]:
# Character offsets of the cleaned selection inside its tweet:
# start_pos is the first match (str.find gives -1 when there is none),
# end_pos is start_pos plus the selection length.
train['start_pos'] = [
    tweet.find(selection)
    for tweet, selection in zip(train['text'], train['c_selected_text'])
]
train['end_pos'] = [
    begin + len(selection)
    for begin, selection in zip(train['start_pos'], train['c_selected_text'])
]

In [13]:
def contains(a, b):
    """Find the first position at which sequence ``b`` occurs inside ``a``.

    Elements are compared one by one with ``!=``. Returns ``(True, index)``
    for the first alignment where no element differs, or ``(False, -1)``
    when there is none. An empty ``b`` matches at index 0.
    """
    width = len(b)
    for offset in range(len(a) - width + 1):
        if not any(a[offset + k] != b[k] for k in range(width)):
            return True, offset
    return False, -1

In [14]:
# Build token-level span labels for every tweet.
#
# For each (tweet, cleaned selection) pair this produces a tuple of
#   tokens          - tokenizer pieces of the tweet, tokenized word by word
#   start_token_idx - index of the first token of the labelled span
#   end_token_idx   - index of the last token of the labelled span (inclusive)
#   invert_map      - for each token, the index of the word it came from
#   first_token     - for each token, True when it is the first piece of its word
#   in_st           - for each token, 1 when its word overlaps the selection, else 0
#   all_sentence    - 1 when the selection has more than 90% as many words as the tweet
# and appends it to `data`.
data = []
for text, st in zip(train['text'].tolist(), train['c_selected_text'].tolist()):
    # Fail early with a readable message. An empty selection matches at every
    # offset without advancing, so the occurrence scan below would never
    # terminate. A selection absent from the tweet would otherwise only
    # surface later as "min() arg is an empty sequence".
    if not st or st not in text:
        raise ValueError(
            "selected text {!r} is empty or does not occur in tweet {!r}".format(st, text)
        )

    split_text = text.split()
    tokens, labels, invert_map, first_token, in_st = [], [], [], [], []

    # Character mask over the tweet: 1 wherever any occurrence of the selection lies.
    char_mask = np.zeros(len(text))
    start_pos = text.find(st)
    while start_pos >= 0:
        end_pos = start_pos + len(st)
        char_mask[start_pos:end_pos] = 1
        start_pos = text.find(st, end_pos)

    # Backticks become apostrophes before tokenizing; this keeps string lengths
    # unchanged, so the character mask above stays aligned.
    st = st.replace("`", "'")
    all_sentence = 1 if len(st.split()) / len(split_text) > 0.9 else 0

    cur_length = 0  # total characters of the words seen so far, spaces excluded
    for idx, w in enumerate(split_text):
        w = w.replace("`", "'")
        # Word offset = characters of earlier words + one separator per earlier
        # word (idx). This assumes the tweet is single-spaced.
        word_start = cur_length + idx
        started = char_mask[word_start:word_start + len(w)].sum() > 0
        for idx2, token in enumerate(tokenizer.tokenize(' ' + w)):
            first_token.append(idx2 == 0)
            tokens.append(token)
            invert_map.append(idx)
            if started:
                labels.append(len(tokens) - 1)
                in_st.append(1)
            else:
                in_st.append(0)
        cur_length += len(w)

    # The coarse span runs from the first to the last token whose word touches
    # the selection; _improve_answer_span then tightens it where it can.
    start_token_idx, end_token_idx = _improve_answer_span(
        tokens, min(labels), max(labels), tokenizer, ' ' + st
    )

    data.append((tokens, start_token_idx, end_token_idx, invert_map, first_token, in_st, all_sentence))

In [15]:
# Transpose the per-row tuples in data into one tuple per field.
tokens, start, end, invert_map, first_token, in_st, all_sentence = zip(*data)

In [16]:
# Sanity check: one entry per tuple that was appended to data.
len(tokens)

27480

In [17]:
# Attach the per-row token lists as a new column.
train['tokens'] = tokens

In [18]:
# Attach the remaining per-row fields from the labelling loop as columns, in the same order as before.
for _column_name, _column_values in (
    ('start', start),
    ('end', end),
    ('invert_map', invert_map),
    ('first_token', first_token),
    ('in_st', in_st),
    ('all_sentence', all_sentence),
):
    train[_column_name] = _column_values

In [19]:
# Integer class id for each sentiment string; an unmapped sentiment raises KeyError.
senti2label = dict(positive=2, negative=0, neutral=1)
train['senti_label'] = [senti2label[sentiment] for sentiment in train['sentiment']]

In [20]:
# Renumber the rows 0..n-1 in place, discarding the old index (the earlier dropna removed a row).
train.reset_index(drop=True, inplace=True)

In [21]:
# Preview the frame with the derived label columns.
train.head()

Unnamed: 0,textID,text,selected_text,sentiment,c_selected_text,start_pos,end_pos,tokens,start,end,invert_map,first_token,in_st,all_sentence,senti_label
0,cb774db0d1,"i`d have responded, if i were going","i`d have responded, if i were going",neutral,"i`d have responded, if i were going",0,35,"[Ġi, 'd, Ġhave, Ġresponded, ,, Ġif, Ġi, Ġwere,...",0,8,"[0, 0, 1, 2, 2, 3, 4, 5, 6]","[True, False, True, True, False, True, True, T...","[1, 1, 1, 1, 1, 1, 1, 1, 1]",1,1
1,549e992a42,sooo sad i will miss you here in san diego!!!,sooo sad,negative,sooo sad,0,8,"[Ġso, oo, Ġsad, Ġi, Ġwill, Ġmiss, Ġyou, Ġhere,...",0,2,"[0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9]","[True, False, True, True, True, True, True, Tr...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0
2,088c60f138,my boss is bullying me...,bullying me,negative,bullying me,11,22,"[Ġmy, Ġboss, Ġis, Ġbullying, Ġme, ...]",3,4,"[0, 1, 2, 3, 4, 4]","[True, True, True, True, True, False]","[0, 0, 0, 1, 1, 1]",0,0
3,9642c003ef,what interview! leave me alone,leave me alone,negative,leave me alone,16,30,"[Ġwhat, Ġinterview, !, Ġleave, Ġme, Ġalone]",3,5,"[0, 1, 1, 2, 3, 4]","[True, True, False, True, True, True]","[0, 0, 0, 1, 1, 1]",0,0
4,358bd9e861,"sons of ****, why couldn`t they put them on th...","sons of ****,",negative,"sons of ****,",0,13,"[Ġsons, Ġof, Ġ****, ,, Ġwhy, Ġcouldn, 't, Ġthe...",0,3,"[0, 1, 2, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 1...","[True, True, True, False, True, True, False, T...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0


## Evaluate: word-level Jaccard between the derived span labels and `selected_text`

In [23]:
def jaccard_list(l1, l2):
    """Jaccard similarity between the sets of items in ``l1`` and ``l2``.

    Duplicates and order are ignored because both inputs are converted to
    sets first.

    Args:
        l1: First iterable of hashable items.
        l2: Second iterable of hashable items.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float. When both inputs are empty the
        ratio would be 0/0; the two sets are identical, so 1.0 is returned
        instead of raising ZeroDivisionError.
    """
    a = set(l1)
    b = set(l2)
    union_size = len(a | b)
    if union_size == 0:
        return 1.0
    return float(len(a & b)) / union_size

In [28]:
def get_jaccard(x):
    """Score one row's token-span label against its ``selected_text``.

    The row's ``start`` and ``end`` token indices are mapped to word indices
    through ``invert_map``. The words of ``text`` in that inclusive range are
    then compared with the words of ``selected_text`` using ``jaccard_list``.
    """
    gold_words = x['selected_text'].split()
    tweet_words = x['text'].split()
    token_to_word = x['invert_map']
    first_word = token_to_word[x['start']]
    last_word = token_to_word[x['end']]
    return jaccard_list(gold_words, tweet_words[first_word:last_word + 1])

In [29]:
# Score every row's derived span against its original selected_text.
train['label_jaccard'] = train.apply(get_jaccard, axis=1)

In [30]:
# Mean Jaccard score over all rows.
train['label_jaccard'].mean()

0.9455688809151337

In [56]:
from sklearn.model_selection import StratifiedKFold

In [57]:
# (rows, columns) at this point in the notebook.
train.shape

(27480, 15)

In [58]:
# Load a pickled frame of textIDs (presumably the rows reserved for local testing; confirm against whoever produced the file).
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
local_test_id = pd.read_pickle('../input/localtest_ids.pkl')

In [59]:
# Rows of train whose textID appears in the loaded ID list. This is taken before the 'fold' column is added to train.
local_test = train[train['textID'].isin(local_test_id['textID'].tolist())]

In [60]:
# Preview the selected local-test rows.
local_test.head()

Unnamed: 0,textID,text,selected_text,sentiment,c_selected_text,start_pos,end_pos,tokens,start,end,invert_map,first_token,in_st,all_sentence,senti_label
2,088c60f138,my boss is bullying me...,bullying me,negative,bullying me,11,22,"[Ġmy, Ġboss, Ġis, Ġbullying, Ġme, ...]",3,4,"[0, 1, 2, 3, 4, 4]","[True, True, True, True, True, False]","[0, 0, 0, 1, 1, 1]",0,0
13,04dd1d2e34,i want to go to music tonight but i lost my vo...,lost,negative,lost,36,40,"[Ġi, Ġwant, Ġto, Ġgo, Ġto, Ġmusic, Ġtonight, Ġ...",9,9,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11]","[True, True, True, True, True, True, True, Tru...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",0,0
14,bbe3cbf620,test test from the lg env2,test test from the lg env2,neutral,test test from the lg env2,0,26,"[Ġtest, Ġtest, Ġfrom, Ġthe, Ġl, g, Ġenv, 2]",0,7,"[0, 1, 2, 3, 4, 4, 5, 5]","[True, True, True, True, True, False, True, Fa...","[1, 1, 1, 1, 1, 1, 1, 1]",1,1
19,40e7becabf,hes just not that into you,hes just not that into you,neutral,hes just not that into you,0,26,"[Ġhes, Ġjust, Ġnot, Ġthat, Ġinto, Ġyou]",0,5,"[0, 1, 2, 3, 4, 5]","[True, True, True, True, True, True]","[1, 1, 1, 1, 1, 1]",1,1
31,7d8c4c11e4,i hope unni will make the audition . fighting ...,hope,positive,hope,2,6,"[Ġi, Ġhope, Ġun, ni, Ġwill, Ġmake, Ġthe, Ġaudi...",1,1,"[0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 10, 10...","[True, True, True, False, True, True, True, Tr...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,2


In [61]:
# local_train is bound to the same object as train (no copy), so the in-place
# reset_index here also changes train. The commented-out line is the variant
# that would exclude the local-test rows.
# local_train = train[~train['textID'].isin(local_test['textID'].tolist())]
local_train = train
local_train.reset_index(drop=True, inplace=True)

In [62]:
# Five shuffled stratified folds; the fixed random_state keeps the split repeatable.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1991)

In [63]:
# Default every row to fold 0, then record each row's validation fold, stratified on senti_label.
local_train['fold'] = 0
for fold, (train_idx, valid_idx) in enumerate(kf.split(local_train, local_train['senti_label'])):
    # kf.split yields positional indices while .loc selects by label; the two agree
    # only because the index was reset to 0..n-1 beforehand.
    local_train.loc[valid_idx, 'fold'] = fold

In [64]:
# Number of rows assigned to each fold.
local_train['fold'].value_counts()

0    5498
1    5497
4    5495
3    5495
2    5495
Name: fold, dtype: int64

In [65]:
# Write the fold-annotated frame to disk.
local_train.to_pickle('../input/train_roberta2.pkl')

In [30]:
# Write the local-test subset to disk. The cell that defines local_test must have run
# in the current kernel session first, otherwise this raises NameError.
local_test.to_pickle('../input/localtest_roberta2.pkl')

NameError: name 'local_test' is not defined