In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from transformers import RobertaTokenizer
from transformers import RobertaTokenizerFast

%matplotlib inline

In [2]:
def prepare(text):
    """Split ``text`` into words and stand-alone punctuation marks.

    The text is first split on whitespace. Inside every whitespace-delimited
    word, each of the characters ``. , ! ? ( ) ; : -`` becomes a piece of its
    own, and the runs of other characters between them become pieces too.

    Parameters
    ----------
    text : str
        The string to split.

    Returns
    -------
    retval : list of str
        The pieces, in reading order.
    first_char : list of bool
        ``True`` for the first piece of every whitespace-delimited word
        (i.e. the piece is preceded by whitespace or starts the text),
        ``False`` for the remaining pieces of that word.
    invert_map : list of int
        For every piece, the character offset in ``text`` at which it starts.
    """
    punctuation = '.,!?();:-'
    retval, first_char, invert_map = [], [], []
    cursor = 0
    for w in text.split():
        # Locate the word in the original string rather than assuming that
        # words are separated by exactly one space: with leading whitespace or
        # repeated spaces a running ``len(w) + 1`` sum drifts away from the
        # real offsets. Only whitespace lies between ``cursor`` and the next
        # word, so the first match at or after ``cursor`` is the right one.
        current_pos = text.find(w, cursor)
        cursor = current_pos + len(w)

        word_ret = [""]
        word_invert = [current_pos]
        for p, c in enumerate(w):
            if c in punctuation:
                if word_ret[-1] == "":
                    # The open piece is still empty: reuse it for the mark.
                    word_ret[-1] += c
                    word_invert[-1] = current_pos + p
                else:
                    word_ret.append(c)
                    word_invert.append(current_pos + p)
                # Open a fresh (empty) piece for whatever follows the mark.
                word_ret.append("")
                word_invert.append(current_pos + p + 1)
            else:
                word_ret[-1] += c
        # A word ending in punctuation leaves a dangling empty piece behind.
        if len(word_ret[-1]) == 0:
            word_ret.pop(-1)
            word_invert.pop(-1)
        word_first = [i == 0 for i in range(len(word_ret))]
        retval.extend(word_ret)
        first_char.extend(word_first)
        invert_map.extend(word_invert)
    assert len(retval) == len(first_char)
    return retval, first_char, invert_map

In [3]:
# words, first_char, invert_map = prepare(a)
# print(words)
# print(first_char)
# print(invert_map)

In [4]:
def decode(tokens, first_char, start, end):
    """Rebuild the text covered by pieces ``start`` .. ``end`` (inclusive).

    A piece whose ``first_char`` flag is true is preceded by a space; all
    other pieces are glued directly onto the previous one. Whitespace in the
    result is then normalised to single spaces without leading or trailing
    blanks.
    """
    pieces = []
    for pos in range(start, end + 1):
        lead = " " if first_char[pos] else ""
        pieces.append(lead + tokens[pos])
    joined = "".join(pieces)
    return " ".join(joined.split())

In [5]:
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)

    return (input_start, input_end)

In [6]:
# Load the training tweets from a CSV at a path relative to the notebook.
# Per the head() output further down, the file carries a `kfold` column
# alongside textID / text / selected_text / sentiment.
train = pd.read_csv('../input/tweet-sentiment-extraction/train_folds.csv')

In [7]:
# Load the RoBERTa tokenizer from a local model directory (the path is
# relative to the notebook's working directory).
tokenizer = RobertaTokenizer.from_pretrained('../../bert_models/roberta_base/')

In [8]:
# (rows, columns) before rows with missing text are dropped.
train.shape

(27480, 5)

In [9]:
# Drop every row whose `text` or `selected_text` is missing. This mutates
# `train` in place, so re-running earlier display cells shows the new state.
train.dropna(subset=['text','selected_text'], how='any', inplace=True)

In [10]:
# (rows, columns) after the dropna; the recorded output equals the one before
# it, i.e. no row was removed in that run.
train.shape

(27480, 5)

In [11]:
# Normalise both text columns the same way: lower-case, then collapse every
# run of whitespace into a single space (split()/join also removes leading and
# trailing whitespace). The character offsets computed later in the notebook
# refer to these normalised strings.
train['text'] = train['text'].apply(lambda x: ' '.join(x.lower().strip().split()))
train['selected_text'] = train['selected_text'].apply(lambda x: ' '.join(x.lower().strip().split()))

In [12]:
# Preview the normalised frame.
train.head()

Unnamed: 0,textID,text,selected_text,sentiment,kfold
0,f7fdea625a,i`m so bored i can barely even tweet. i have n...,bored,negative,0
1,c19de2c75b,awwwwwwwwww thats jus...awwwww.did she get to ...,awwwwwwwwww thats jus...awwwww.did she get to ...,neutral,0
2,0f963af18f,i did not twitt yesterday cause it was a very ...,i can not sleep,negative,0
3,0583c78cc1,congratulation`s to phil packer on completing ...,congratulation`s,positive,0
4,1cdb444ea5,o`charleys? pretty good. especially when its f...,pretty good.,positive,0


In [13]:
def clean(x):
    """Drop a stray one-character first word from ``x``.

    If ``x`` has at least two whitespace-separated words and the first one is
    a single character that is neither ``i``, ``a`` or ``u`` (compared
    case-insensitively) nor a digit, that first word is removed and the
    remaining words are returned joined by single spaces. Each such change is
    printed as ``original | cleaned``. In every other case, including an empty
    or whitespace-only string, ``x`` is returned unchanged.
    """
    sp_x = x.split()
    # Check the word count first: indexing sp_x[0] on an empty or
    # whitespace-only string would raise IndexError.
    if len(sp_x) < 2:
        return x
    head = sp_x[0]
    if len(head) == 1 and head.lower() not in ('i', 'a', 'u') and not head.isdigit():
        cleaned = ' '.join(sp_x[1:])
        print(x, '|', cleaned)
        return cleaned
    return x

In [14]:
# Build the cleaned selection column; clean() prints every "before | after"
# pair it changes. To skip the cleaning step, assign train['selected_text']
# to the column directly instead.
train['c_selected_text'] = train['selected_text'].apply(clean)

e crazy | crazy
d lost | lost
- haha, his heads bigger than yours! awwwh, lol | haha, his heads bigger than yours! awwwh, lol
l miss | miss
: they r sending tw business ppl 2 the mainland 2 return the favor - re: chinese police representatives in tw | they r sending tw business ppl 2 the mainland 2 return the favor - re: chinese police representatives in tw
e such a nasty display picture | such a nasty display picture
t i can`t write fast enough | i can`t write fast enough
. you`re a fantastic actor | you`re a fantastic actor
v v bad sunburn | v bad sunburn
s awesome | awesome
e happy | happy
. probably spelt it wrong lol. | probably spelt it wrong lol.
- you can do yiiiit! night | you can do yiiiit! night
m cool | cool
. sorry | sorry
l die | die
s good | good
g a | a
. i still can`t believe they won`t renew tscc | i still can`t believe they won`t renew tscc
d im all theirs | im all theirs
d isn`t perfect | isn`t perfect
? nice! | nice!
. not cool. | not cool.
! tear | tear
, i feel b

In [15]:
# Character offset of the first occurrence of the cleaned selection inside the
# tweet. str.find returns -1 when the selection does not occur in the text.
train['start_pos'] = train.apply(lambda x: x['text'].find(x['c_selected_text']), axis=1)
# Exclusive end offset: start plus selection length.
# NOTE(review): for a -1 start this yields len(selection) - 1, which is not a
# meaningful offset.
train['end_pos'] = train.apply(lambda x: x['start_pos']+len(x['c_selected_text']), axis=1)

In [16]:
def contains(a, b):
    """Search sequence ``a`` for the contiguous sub-sequence ``b``.

    Returns ``(True, i)`` with the smallest index ``i`` at which ``b`` occurs
    in ``a``, or ``(False, -1)`` when it does not occur. An empty ``b`` is
    found at index 0.
    """
    width = len(b)
    for offset in range(len(a) - width + 1):
        # Element-wise comparison; stops at the first mismatch.
        if not any(a[offset + k] != b[k] for k in range(width)):
            return True, offset
    return False, -1

In [17]:
# Build, for every row of `train`, the token sequence and the token-level
# labels of the selected text. One tuple per row is appended to `data`:
# (words, first_char, tokens, start_token_idx, end_token_idx,
#  token_invert_map, in_st, all_sentence).
data = []
# NOTE(review): improve_count is never updated or read in the visible cells.
improve_count = 0
# sp / ep (the start_pos / end_pos columns) are unpacked but not referenced in
# the loop body; the selection is located again with str.find below.
for text, sp, ep, st in zip(train['text'].tolist(), train['start_pos'].tolist(), train['end_pos'].tolist(),
                           train['c_selected_text'].tolist()):
    split_text = text.split()
    tokens, labels, token_invert_map, in_st = [], [], [], []
    
    # token in selected_tokens
    # Per-character mask over `text`: 1 for every character that belongs to an
    # occurrence of the selection, 0 elsewhere.
    temp = np.zeros(len(text))
    
    # Mark every non-overlapping occurrence of `st`, scanning left to right.
    # NOTE(review): if `st` is the empty string, find() keeps returning the
    # same non-negative position and end_pos never advances, so this loop
    # would not terminate.
    end_pos = 0
    start_pos = text.find(st, end_pos)
    while start_pos>=0:
        end_pos = start_pos+len(st)
        temp[start_pos:end_pos]=1
        start_pos = text.find(st, end_pos)
    
    # Backticks are replaced by apostrophes only after the mask is built, so
    # the search above runs on the unmodified strings.
    st = st.replace("`","'")
    st_words = st.split()
    # NOTE(review): st_tokens is not read again in the visible cells.
    st_tokens = tokenizer.tokenize(' '+st)
    # Flag rows whose selection has more than 90% as many whitespace-separated
    # words as the tweet. NOTE(review): an empty `text` would divide by zero.
    if len(st_words)/len(split_text)>0.9:
        all_sentence = 1
    else:
        all_sentence = 0
    # NOTE(review): cur_length is accumulated below but never read.
    cur_length = 0
    
    # Words / punctuation pieces, "starts a new word" flags and the character
    # offset of each piece within `text`.
    words, first_char, invert_map = prepare(text)
    
    for idx, w in enumerate(words):
        w = w.replace("`","'")
        # A piece counts as selected when at least one of its characters is
        # masked, so a partially covered piece is selected as a whole.
        if sum(temp[invert_map[idx]:invert_map[idx]+len(w)])>0:
            started = True# space
        else:
            started = False
        # Pieces that start a new word are tokenized with a leading space,
        # the others without (presumably so the tokenizer emits its
        # word-boundary marker only at word starts — see the `Ġ` tokens in the
        # head() output).
        if first_char[idx]:
            prefix = " "
        else:
            prefix = ""
        for idx2, token in enumerate(tokenizer.tokenize(prefix+w)):
            tokens.append(token)
            # token index -> index of the piece in `words` it came from
            token_invert_map.append(idx)
            if started:
                # `labels` only grows while the run is contiguous: a token is
                # added if it is the first selected token or directly follows
                # the last labelled token. Tokens of later, separate
                # occurrences are still flagged in `in_st` but not in `labels`.
                if len(labels)==0 or labels[-1]==len(tokens)-2:
                    labels.append(len(tokens)-1)
                in_st.append(1)
            else:
                in_st.append(0)
        cur_length+=len(w)
    # First and last token index of the labelled run.
    # NOTE(review): min()/max() raise ValueError if `labels` is empty, i.e.
    # when no piece overlaps the mask.
    start_token_idx = min(labels)
    end_token_idx = max(labels)
#     start_token_idx, end_token_idx = _improve_answer_span(tokens, start_token_idx, end_token_idx,
#                                                          tokenizer, ' '+st)
#     start_token_idx, end_token_idx = _improve_answer_span(tokens, start_token_idx, end_token_idx,
#                                                          tokenizer, st)
    
    # Piece-level span (end exclusive). NOTE(review): neither value is stored
    # in `data` or read again in the visible cells.
    start_word_idx = token_invert_map[start_token_idx]
    end_word_idx = token_invert_map[end_token_idx]+1
    
    data.append((words, first_char, tokens, start_token_idx, end_token_idx, token_invert_map, in_st, all_sentence))

In [18]:
# Transpose the per-row tuples in `data` into eight parallel tuples, one per
# field. This rebinds the module-level names words, first_char, tokens, ...
# that the loop above used for a single row.
words, first_char, tokens, start, end, token_invert_map, in_st, all_sentence = zip(*data)

In [19]:
# Sanity check: one token list per row (the recorded value equals the row
# count shown by train.shape).
len(tokens)

27480

In [20]:
# Attach the per-row lists as new columns; values are assigned by position, in
# the row order used to build `data`.
train['words'] = words
train['first_char'] = first_char
train['tokens'] = tokens

In [21]:
# start / end: token indices (inclusive) of the labelled span.
train['start'] = start
train['end'] = end
# invert_map here is the token -> piece index map (token_invert_map), not the
# character-offset list returned by prepare().
train['invert_map'] = token_invert_map
# in_st: per-token 0/1 flag, 1 when the token's piece overlaps the selection.
train['in_st'] = in_st
# all_sentence: 1 when the selection spans more than 90% of the tweet's words.
train['all_sentence'] = all_sentence

In [22]:
# Integer class id for each sentiment string.
senti2label = {
    'positive':2,
    'negative':0,
    'neutral':1
}
# Plain dict indexing: a sentiment value missing from senti2label raises
# KeyError.
train['senti_label']=train['sentiment'].apply(lambda x: senti2label[x])

In [23]:
# Preview the frame with the tokenization and label columns added.
train.head()

Unnamed: 0,textID,text,selected_text,sentiment,kfold,c_selected_text,start_pos,end_pos,words,first_char,tokens,start,end,invert_map,in_st,all_sentence,senti_label
0,f7fdea625a,i`m so bored i can barely even tweet. i have n...,bored,negative,0,bored,7,12,"[i`m, so, bored, i, can, barely, even, tweet, ...","[True, True, True, True, True, True, True, Tru...","[Ġi, 'm, Ġso, Ġbored, Ġi, Ġcan, Ġbarely, Ġeven...",3,3,"[0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
1,c19de2c75b,awwwwwwwwww thats jus...awwwww.did she get to ...,awwwwwwwwww thats jus...awwwww.did she get to ...,neutral,0,awwwwwwwwww thats jus...awwwww.did she get to ...,0,63,"[awwwwwwwwww, thats, jus, ., ., ., awwwww, ., ...","[True, True, True, False, False, False, False,...","[Ġa, ww, ww, ww, ww, ww, Ġthats, Ġj, us, ., .,...",0,23,"[0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 6, 6, 6, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,1
2,0f963af18f,i did not twitt yesterday cause it was a very ...,i can not sleep,negative,0,i can not sleep,85,100,"[i, did, not, twitt, yesterday, cause, it, was...","[True, True, True, True, True, True, True, Tru...","[Ġi, Ġdid, Ġnot, Ġtw, itt, Ġyesterday, Ġcause,...",21,24,"[0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
3,0583c78cc1,congratulation`s to phil packer on completing ...,congratulation`s,positive,0,congratulation`s,0,16,"[congratulation`s, to, phil, packer, on, compl...","[True, True, True, True, True, True, True, Tru...","[Ġcongrat, ulation, 's, Ġto, Ġphil, Ġpack, er,...",0,2,"[0, 0, 0, 1, 2, 3, 3, 4, 5, 6, 7, 7, 8, 9, 10,...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,2
4,1cdb444ea5,o`charleys? pretty good. especially when its f...,pretty good.,positive,0,pretty good.,12,24,"[o`charleys, ?, pretty, good, ., especially, w...","[True, False, True, True, False, True, True, T...","[Ġo, ', char, leys, ?, Ġpretty, Ġgood, ., Ġesp...",5,7,"[0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11...","[0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",0,2


## Evaluate: how well do the token-level labels reproduce `selected_text`?

In [24]:
def jaccard_list(l1, l2):
    """Jaccard similarity of two collections, compared as sets.

    Duplicates and order are ignored. Returns ``|A ∩ B| / |A ∪ B|`` as a
    float in ``[0, 1]``. Two empty collections are treated as identical and
    score ``1.0`` instead of raising ``ZeroDivisionError``.
    """
    a = set(l1)
    b = set(l2)
    union_size = len(a | b)
    if union_size == 0:
        # Both inputs are empty: the ratio is undefined (0 / 0); use the
        # usual convention that two empty sets are a perfect match.
        return 1.0
    return float(len(a & b)) / union_size

In [25]:
def get_jaccard(x):
    """Word-level Jaccard between a row's ``selected_text`` and its labelled span.

    The token span ``x['start']`` .. ``x['end']`` is mapped to piece indices
    through ``x['invert_map']``, the covered pieces are turned back into text
    with ``decode`` and both texts are compared as sets of
    whitespace-separated words.
    """
    gold_words = x['selected_text'].split()
    pieces = x['words']
    token_to_piece = x['invert_map']
    first_piece = token_to_piece[x['start']]
    last_piece = token_to_piece[x['end']]
    rebuilt = decode(pieces, x['first_char'], first_piece, last_piece)
    return jaccard_list(gold_words, rebuilt.split())

In [26]:
# Row-wise score: how closely the labelled token span reproduces the original
# selected_text.
train['label_jaccard'] = train.apply(get_jaccard, axis=1)

In [27]:
# Average label quality over all rows (1.0 would mean every labelled span
# reproduces its selected_text word for word).
train['label_jaccard'].mean()

0.9707626025143626

In [28]:
# Inspect ten random rows whose labels reproduce the selection poorly.
# NOTE(review): no random_state is passed, so the rows differ from run to run;
# sample() also fails if fewer than ten rows satisfy the filter.
train[train['label_jaccard']<0.5].sample(n=10)

Unnamed: 0,textID,text,selected_text,sentiment,kfold,c_selected_text,start_pos,end_pos,words,first_char,tokens,start,end,invert_map,in_st,all_sentence,senti_label,label_jaccard
24727,4eceb29bf2,thanks our little girl just loves animals so a...,ry cute,positive,4,ry cute,78,85,"[thanks, our, little, girl, just, loves, anima...","[True, True, True, True, True, True, True, Tru...","[Ġthanks, Ġour, Ġlittle, Ġgirl, Ġjust, Ġloves,...",21,22,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,2,0.333333
20327,172a9fc80f,i really like lady gaga`s 'paparazzi'... #what...,like la,positive,3,like la,9,16,"[i, really, like, lady, gaga`s, 'paparazzi', ....","[True, True, True, True, True, True, False, Fa...","[Ġi, Ġreally, Ġlike, Ġlady, Ġg, aga, 's, Ġ', p...",2,3,"[0, 1, 2, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, ...","[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,2,0.333333
16463,708e67409c,_bhb i went to that concert and i remember der...,miss se,negative,3,miss se,74,81,"[_bhb, i, went, to, that, concert, and, i, rem...","[True, True, True, True, True, True, True, Tru...","[Ġ_, bh, b, Ġi, Ġwent, Ġto, Ġthat, Ġconcert, Ġ...",19,20,"[0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0.333333
26189,a7f72a928a,woooooooooo are you coming to nottingham at an...,t? lovelovelove,positive,4,t? lovelovelove,52,67,"[woooooooooo, are, you, coming, to, nottingham...","[True, True, True, True, True, True, True, Tru...","[Ġw, oooooooo, oo, Ġare, Ġyou, Ġcoming, Ġto, Ġ...",12,20,"[0, 0, 0, 1, 2, 3, 4, 5, 5, 5, 6, 7, 8, 9, 10,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...",0,2,0.0
22564,030c9cd0c9,#happymothersday to all the moms out there,happymothersday,positive,4,happymothersday,1,16,"[#happymothersday, to, all, the, moms, out, th...","[True, True, True, True, True, True, True]","[Ġ#, h, app, ym, other, s, day, Ġto, Ġall, Ġth...",0,6,"[0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6]","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]",0,2,0.0
9950,3bfc997081,ugggh idk how to do that but i only wanna stop...,a stop,negative,1,a stop,40,46,"[ugggh, idk, how, to, do, that, but, i, only, ...","[True, True, True, True, True, True, True, Tru...","[Ġu, gg, gh, Ġid, k, Ġhow, Ġto, Ġdo, Ġthat, Ġb...",12,13,"[0, 0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...",0,0,0.333333
10597,772aeb5f05,pulled from interesting meeting to an urgent s...,interesting me,positive,1,interesting me,12,26,"[pulled, from, interesting, meeting, to, an, u...","[True, True, True, True, True, True, True, Tru...","[Ġpulled, Ġfrom, Ġinteresting, Ġmeeting, Ġto, ...",2,3,"[0, 1, 2, 3, 4, 5, 6, 7, 8]","[0, 0, 1, 1, 0, 0, 0, 0, 0]",0,2,0.333333
9780,aa730765ab,kenny u alive!!!...i`m here getting da hair do...,a sad,negative,1,a sad,88,93,"[kenny, u, alive, !, !, !, ., ., ., i`m, here,...","[True, True, True, False, False, False, False,...","[Ġk, enny, Ġu, Ġalive, !, !, !, ., ., ., i, 'm...",31,32,"[0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0.333333
16773,da59221e53,"toooooom! do a tour in the philippines, pleeea...",`m risking,negative,3,`m risking,51,61,"[toooooom, !, do, a, tour, in, the, philippine...","[True, False, True, True, True, True, True, Tr...","[Ġtoo, oo, oom, !, Ġdo, Ġa, Ġtour, Ġin, Ġthe, ...",17,19,"[0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 8, 9, 9, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0.333333
12500,43e6d9aeaa,well i guess they think of everything thanks s...,g thank,positive,2,thank,38,43,"[well, i, guess, they, think, of, everything, ...","[True, True, True, True, True, True, True, Tru...","[Ġwell, Ġi, Ġguess, Ġthey, Ġthink, Ġof, Ġevery...",7,7,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",0,2,0.0


In [29]:
# Rename the `kfold` column to `fold`, in place. `index=str` additionally
# passes every index label through str(), so the row index holds strings
# afterwards.
train.rename(index=str, columns={'kfold':'fold'}, inplace=True)

In [30]:
# Final preview before saving: `fold` replaces `kfold`, and `label_jaccard`
# is present.
train.head()

Unnamed: 0,textID,text,selected_text,sentiment,fold,c_selected_text,start_pos,end_pos,words,first_char,tokens,start,end,invert_map,in_st,all_sentence,senti_label,label_jaccard
0,f7fdea625a,i`m so bored i can barely even tweet. i have n...,bored,negative,0,bored,7,12,"[i`m, so, bored, i, can, barely, even, tweet, ...","[True, True, True, True, True, True, True, Tru...","[Ġi, 'm, Ġso, Ġbored, Ġi, Ġcan, Ġbarely, Ġeven...",3,3,"[0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,1.0
1,c19de2c75b,awwwwwwwwww thats jus...awwwww.did she get to ...,awwwwwwwwww thats jus...awwwww.did she get to ...,neutral,0,awwwwwwwwww thats jus...awwwww.did she get to ...,0,63,"[awwwwwwwwww, thats, jus, ., ., ., awwwww, ., ...","[True, True, True, False, False, False, False,...","[Ġa, ww, ww, ww, ww, ww, Ġthats, Ġj, us, ., .,...",0,23,"[0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 6, 6, 6, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,1,1.0
2,0f963af18f,i did not twitt yesterday cause it was a very ...,i can not sleep,negative,0,i can not sleep,85,100,"[i, did, not, twitt, yesterday, cause, it, was...","[True, True, True, True, True, True, True, Tru...","[Ġi, Ġdid, Ġnot, Ġtw, itt, Ġyesterday, Ġcause,...",21,24,"[0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,1.0
3,0583c78cc1,congratulation`s to phil packer on completing ...,congratulation`s,positive,0,congratulation`s,0,16,"[congratulation`s, to, phil, packer, on, compl...","[True, True, True, True, True, True, True, Tru...","[Ġcongrat, ulation, 's, Ġto, Ġphil, Ġpack, er,...",0,2,"[0, 0, 0, 1, 2, 3, 3, 4, 5, 6, 7, 7, 8, 9, 10,...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,2,1.0
4,1cdb444ea5,o`charleys? pretty good. especially when its f...,pretty good.,positive,0,pretty good.,12,24,"[o`charleys, ?, pretty, good, ., especially, w...","[True, False, True, True, False, True, True, T...","[Ġo, ', char, leys, ?, Ġpretty, Ġgood, ., Ġesp...",5,7,"[0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11...","[0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",0,2,1.0


In [31]:
# Persist the annotated frame, including its list-valued columns, as a pickle
# at a path relative to the notebook.
train.to_pickle('../input/train_roberta4.pkl')

In [43]:
# local_test.to_pickle('../input/localtest_roberta2.pkl')