In [1]:
import pandas as pd

import numpy as np
from transformers import BertTokenizer
from transformers import RobertaTokenizer
from transformers import RobertaTokenizerFast

In [2]:
def clean(x):
    """Drop a stray one-character first word from ``x``.

    The first whitespace-separated word is removed when it is exactly one
    character long, is not one of 'i', 'a', 'u' (case-insensitive), is not a
    digit, and at least one more word follows it.

    Args:
        x: the string to clean.

    Returns:
        The remaining words joined by single spaces when the first word is
        dropped; otherwise ``x`` unchanged (including empty or
        whitespace-only input, which previously raised IndexError).
    """
    sp_x = x.split()
    # Fewer than two words: nothing can be dropped. Checking the length first
    # also keeps sp_x[0] from being read on an empty list.
    if len(sp_x) < 2:
        return x
    head = sp_x[0]
    if len(head) == 1 and head.lower() not in ['i', 'a', 'u'] and not head.isdigit():
        return ' '.join(sp_x[1:])
    return x

In [3]:
# Load the DataFrame of texts, labels and model predictions used by every cell below.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
pred = pd.read_pickle('../input/tweet-sentiment-extraction/preds.pkl')

In [4]:
# Collapse every run of whitespace to a single space and trim both ends
# (str.split() with no arguments already ignores leading/trailing whitespace).
pred['text'] = pred['text'].apply(lambda raw: ' '.join(raw.split()))
pred['selected_text'] = pred['selected_text'].apply(lambda raw: ' '.join(raw.split()))
# Then let clean() drop a stray one-character first word from the label.
pred['selected_text'] = pred['selected_text'].apply(clean)

In [5]:
def prepare(text):
    """Split ``text`` into pieces, isolating each punctuation character.

    The text is split on whitespace; inside every word each separator
    character (see ``separators``) becomes a piece of its own, and the runs
    of other characters between separators become the remaining pieces.

    Returns:
        A 3-tuple of equally long lists:
        - the pieces, in order;
        - a flag per piece, True only for the first piece of each word;
        - the character offset of each piece, counted as if the words were
          joined by exactly one character (each word advances the offset by
          ``len(word) + 1``).
    """
    separators = ['.', ',', '!', '?', '(', ')', ';', ':', '-', '=', "/", "<", "`"]
    retval, first_char, invert_map = [], [], []
    offset = 0
    for word in text.split():
        # The last entry of `pieces` is always the piece still being built.
        pieces = [""]
        starts = [offset]
        for pos, ch in enumerate(word):
            if ch not in separators:
                pieces[-1] += ch
                continue
            if pieces[-1] == "":
                # Nothing accumulated yet: the separator takes the open slot.
                pieces[-1] = ch
                starts[-1] = offset + pos
            else:
                pieces.append(ch)
                starts.append(offset + pos)
            # Open a fresh slot for whatever follows the separator.
            pieces.append("")
            starts.append(offset + pos + 1)
        if not pieces[-1]:
            # The word ended on a separator; discard the unused open slot.
            pieces.pop()
            starts.pop()
        retval += pieces
        first_char += [k == 0 for k in range(len(pieces))]
        invert_map += starts
        offset += len(word) + 1
    assert len(retval) == len(first_char)
    return retval, first_char, invert_map

In [6]:
# Plain Python lists of the normalized texts and labels, plus the RoBERTa
# tokenizer loaded from a local model directory.
texts, sts = pred['text'].tolist(), pred['selected_text'].tolist()
tokenizer = RobertaTokenizer.from_pretrained('../../bert_models/roberta_base/')

In [7]:
# For every text: split it into pieces with prepare(), tokenize each piece,
# and remember which piece every token came from.
data = []
for sentence in texts:
    pieces, is_first, piece_offsets = prepare(sentence)

    sub_tokens, token_to_piece = [], []
    for piece_idx, piece in enumerate(pieces):
        # Rewrite quote characters and strip the mis-encoded character
        # sequence before tokenizing.
        normalized = piece.replace("'", "\"").replace("`", "'").replace("ï¿½", "")
        # Only the first piece of a word is tokenized with a leading space.
        lead = " " if is_first[piece_idx] else ""
        new_tokens = tokenizer.tokenize(lead + normalized)
        sub_tokens.extend(new_tokens)
        token_to_piece.extend([piece_idx] * len(new_tokens))
    data.append((pieces, is_first, sub_tokens, token_to_piece, piece_offsets))

# Transpose the per-text tuples into one sequence per field.
words, first_chars, tokens, invert_maps, word_invert_maps = zip(*data)
pred['words'] = words
pred['first_char'] = first_chars
# token index -> index of the piece it belongs to
pred['invert_map'] = invert_maps
# piece index -> character offset in the text
pred['word_invert_map'] = word_invert_maps

In [8]:
# Map each row's selected_text onto the pieces produced by prepare():
# 'start' / 'end' are the indices of the first and last piece that overlap
# the first occurrence of selected_text inside text.
start_word_idx = []
end_word_idx = []
whole_sentence = []  # not used in the visible cells; kept in case a later cell reads it
for idx in range(len(pred)):
    text = texts[idx]
    st = sts[idx]
    word = words[idx]
    invert_map = word_invert_maps[idx]

    # str.find returns -1 when st is not a substring of text. Left unchecked,
    # that -1 was used as a slice bound and produced either an opaque
    # "min() arg is an empty sequence" error or a wrong label.
    start_pos = text.find(st)
    if start_pos < 0:
        raise ValueError(
            f"row {idx}: selected_text {st!r} does not occur in text {text!r}")
    first_end = start_pos + len(st)

    # A piece occupies [offset, offset + len(piece)); keep every piece whose
    # interval intersects the selected span [start_pos, first_end).
    label = [
        word_idx
        for word_idx, w in enumerate(word)
        if invert_map[word_idx] < first_end
        and invert_map[word_idx] + len(w) > start_pos
    ]
    if not label:
        raise ValueError(
            f"row {idx}: selected_text {st!r} overlaps no piece of text {text!r}")

    # label is built in increasing order, so its ends are the min and max.
    start_word_idx.append(label[0])
    end_word_idx.append(label[-1])

pred['start'] = start_word_idx
pred['end'] = end_word_idx


In [9]:
def decode(tokens, first_char, start, end):
    """Join ``tokens[start..end]`` (inclusive) back into text.

    A space is inserted before every token whose ``first_char`` flag is
    truthy; the other tokens are glued directly to what precedes them. The
    result is whitespace-normalized (single spaces, no leading/trailing
    space). An empty range (``start > end``) yields ``""``.
    """
    parts = []
    for pos in range(start, end + 1):
        if first_char[pos]:
            parts.append(" ")
        parts.append(tokens[pos])
    return " ".join("".join(parts).split())

In [10]:
def jaccard_list(l1, l2):
    """Jaccard similarity between the sets of items in ``l1`` and ``l2``.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float. When both inputs are empty the
        union is empty and the ratio is undefined; 0.5 is returned instead of
        raising ZeroDivisionError, the same value the Tweet Sentiment
        Extraction competition metric uses for that case.
    """
    a = set(l1)
    b = set(l2)
    if not a and not b:
        return 0.5
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [11]:
def get_clean_label(x):
    """Rebuild the label text of row ``x`` from its pieces.

    Decodes ``x['words'][x['start'] .. x['end']]`` (inclusive) using the
    ``x['first_char']`` flags and returns the resulting string.
    """
    return decode(x['words'], x['first_char'], x['start'], x['end'])


def get_jaccard(x):
    """Jaccard score between a row's ``selected_text`` and its rebuilt label.

    Both strings are split on whitespace and compared as sets of words.
    """
    # The label words must be computed here: they used to be a local of
    # get_clean_label, so this function raised NameError or silently read
    # an unrelated global named `label`.
    label = x['selected_text'].split()
    label2 = get_clean_label(x).split()
    return jaccard_list(label, label2)
# Row-wise score of how well the start/end piece labels reproduce selected_text.
pred['label_jaccard'] = pred.apply(get_jaccard, axis=1)

In [12]:
def jaccard_string(s1, s2):
    """Case-insensitive word-level Jaccard similarity of two strings.

    Each string is lower-cased and split on whitespace, and the resulting
    word sets are compared.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float. When neither string contains a
        word the ratio is undefined; 0.5 is returned instead of raising
        ZeroDivisionError, the same value the Tweet Sentiment Extraction
        competition metric uses for that case.
    """
    a = set(s1.lower().split())
    b = set(s2.lower().split())
    if not a and not b:
        return 0.5
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [13]:
# Row-wise word-level Jaccard score between the annotated span and the predicted span.
pred['jaccard'] = pred.apply(
    lambda row: jaccard_string(row['selected_text'], row['pred']),
    axis=1,
)

In [14]:
# Preview the first rows, including the columns added by the cells above.
pred.head()

Unnamed: 0,textID,text,selected_text,sentiment,kfold,start_pred,end_pred,pred,score,words,first_char,invert_map,word_invert_map,start,end,label_jaccard,jaccard
0,f7fdea625a,i`m so bored i can barely even tweet. i have n...,bored,negative,0,"[0.45137984, 0.0014947603, 0.0007963988, 0.052...","[8.418362e-05, 3.527233e-05, 0.0001381696, 0.0...",bored,1.226244,"[i, `, m, so, bored, i, can, barely, even, twe...","[True, False, False, True, True, True, True, T...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 4, 7, 13, 15, 19, 26, 31, 36, 38, 40...",4,4,1.0,1.0
1,c19de2c75b,awwwwwwwwww thats jus...awwwww.did she get to ...,awwwwwwwwww thats jus...awwwww.did she get to ...,neutral,0,"[0.998109, 8.778066e-05, 2.1932923e-05, 1.3662...","[1.3598212e-06, 9.413518e-06, 1.3074637e-05, 2...",awwwwwwwwww thats jus...awwwww.did she get to ...,1.919354,"[awwwwwwwwww, thats, jus, ., ., ., awwwww, ., ...","[True, True, True, False, False, False, False,...","[0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 6, 6, 6, ...","[0, 12, 18, 21, 22, 23, 24, 30, 31, 35, 39, 43...",0,15,1.0,1.0
2,0f963af18f,I did not twitt yesterday cause it was a very ...,I can not sleep,negative,0,"[0.05527878, 0.001542015, 0.00093309575, 0.000...","[0.00043069865, 0.000490151, 0.00037452488, 7....",I can not sleep,0.950716,"[I, did, not, twitt, yesterday, cause, it, was...","[True, True, True, True, True, True, True, Tru...","[0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...","[0, 2, 6, 10, 16, 26, 32, 35, 39, 41, 46, 51, ...",20,23,1.0,1.0
3,0583c78cc1,Congratulation`s to phil packer on completing ...,Congratulation`s,positive,0,"[0.7692519, 0.0009937803, 0.00046654698, 0.000...","[0.0029487284, 0.01649865, 0.044223037, 0.0125...",Congratulation`s to phil packer on completing ...,0.978717,"[Congratulation, `, s, to, phil, packer, on, c...","[True, False, False, True, True, True, True, T...","[0, 0, 0, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9, 10,...","[0, 14, 15, 17, 20, 25, 32, 35, 46, 50, 57, 66...",0,2,1.0,0.066667
4,1cdb444ea5,O`Charleys? Pretty good. Especially when its f...,Pretty good.,positive,0,"[0.047815684, 0.00011195579, 6.788112e-05, 0.0...","[4.4626497e-05, 1.5289357e-05, 1.3610286e-05, ...",Pretty good.,1.53809,"[O, `, Charleys, ?, Pretty, good, ., Especiall...","[True, False, False, False, True, True, False,...","[0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...","[0, 1, 2, 10, 12, 19, 23, 25, 36, 41, 45, 49, ...",4,6,1.0,1.0


In [15]:
# Difference between the two scores: positive when the rebuilt label matches
# selected_text better than the prediction does.
pred['delta_jaccard'] = pred['label_jaccard']-pred['jaccard']

In [21]:
# Ten random rows whose rebuilt label scores below 0.5 against selected_text.
# random_state is fixed so that re-running the cell shows the same rows.
pred[pred['label_jaccard']<0.5].sample(n=10, random_state=42)

Unnamed: 0,textID,text,selected_text,sentiment,kfold,start_pred,end_pred,pred,score,words,first_char,invert_map,word_invert_map,start,end,label_jaccard,jaccard,delta_jaccard
26652,782c3332f0,"awwwwwww, our kitties are the BEST!",he BEST!,positive,4,"[0.2955215, 0.0003622917, 0.00020382882, 0.000...","[6.2780244e-05, 0.0001590589, 0.0003108243, 0....",BEST!,1.373746,"[awwwwwww, ,, our, kitties, are, the, BEST, !]","[True, False, True, True, True, True, True, Fa...","[0, 0, 0, 0, 1, 2, 3, 3, 3, 4, 5, 6, 7]","[0, 8, 10, 14, 22, 26, 30, 34]",5,7,0.333333,0.5,-0.166667
24699,8a053811f7,:kwanghock it is the Hao Da Za Ji Pa?? I miss ...,I mis,negative,4,"[0.012450337, 0.0004115422, 0.00034273556, 0.0...","[7.2034614e-05, 2.8769706e-05, 2.1856718e-05, ...",miss,1.235155,"[:, kwanghock, it, is, the, Hao, Da, Za, Ji, P...","[True, False, True, True, True, True, True, Tr...","[0, 1, 1, 1, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10,...","[0, 1, 11, 14, 17, 21, 25, 28, 31, 34, 36, 37,...",12,13,0.333333,0.0,0.333333
14948,7096e797f5,16 too crazy about miley and JB and in love wi...,in lo,positive,2,"[0.17058401, 0.010298211, 0.06925307, 0.000224...","[0.00070555095, 0.00042235508, 0.095842354, 0....",in love,0.949138,"[16, too, crazy, about, miley, and, JB, and, i...","[True, True, True, True, True, True, True, Tru...","[0, 1, 2, 3, 4, 4, 5, 6, 6, 7, 8, 9, 10, 11, 1...","[0, 3, 7, 13, 19, 25, 29, 32, 36, 39, 44, 49, ...",8,9,0.333333,0.333333,0.0
25722,24ad5b316e,sleeping... would`ve been home sooner but we a...,killed ba,negative,4,"[0.028967204, 8.0367936e-05, 0.00022147846, 0....","[0.0005293132, 0.00061285787, 0.00045297862, 0...",killed,1.145224,"[sleeping, ., ., ., would, `, ve, been, home, ...","[True, False, False, False, True, False, False...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 8, 9, 10, 12, 17, 18, 21, 26, 31, 38, 42, ...",13,14,0.333333,0.5,-0.166667
21954,f2eb0b5e90,hi hun!! i really loved your tutorial yday! so...,he bes,positive,3,"[0.028770316, 0.004409347, 0.0005281592, 0.001...","[0.0004945374, 0.0001978512, 0.00081191584, 0....",loved,0.839457,"[hi, hun, !, !, i, really, loved, your, tutori...","[True, True, False, False, True, True, True, T...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 11, ...","[0, 3, 6, 7, 9, 11, 18, 24, 29, 38, 42, 44, 50...",18,19,0.0,0.0,0.0
968,5419aaf31e,Packing up and leaving inlaws house heading ho...,nice we,positive,0,"[0.037790995, 0.00010328459, 0.00027092957, 0....","[8.8559216e-05, 0.0004358559, 0.00033933003, 0...",nice,1.032005,"[Packing, up, and, leaving, inlaws, house, hea...","[True, True, True, True, True, True, True, Tru...","[0, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 1...","[0, 8, 11, 15, 23, 30, 36, 44, 49, 55, 59, 60,...",15,16,0.333333,0.5,-0.166667
26278,2af5610991,"that was flippin` sweet, dudes. thanks for sha...",es. thank,positive,4,"[0.3027268, 0.0035552427, 0.0994104, 0.0005752...","[0.0003879755, 0.0002432866, 0.00013025648, 0....",thanks for sharing,0.826731,"[that, was, flippin, `, sweet, ,, dudes, ., th...","[True, True, True, False, True, False, True, F...","[0, 1, 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10]","[0, 5, 9, 16, 18, 23, 25, 30, 32, 39, 43]",6,8,0.0,0.0,0.0
23861,1211a1d91f,too bad Vo got me sick I think & I don`t even ...,too ba,negative,4,"[0.5777725, 0.24063675, 0.00682713, 0.01830984...","[0.0012079808, 0.4352212, 0.026310198, 0.00130...",too bad,1.012891,"[too, bad, Vo, got, me, sick, I, think, &, I, ...","[True, True, True, True, True, True, True, Tru...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 4, 8, 11, 15, 18, 23, 25, 31, 33, 35, 38, ...",0,1,0.333333,0.333333,0.0
25337,c573f1463f,Night Night Everyone. HAPPY MOTHERS DAY to all...,HAPP,positive,4,"[0.06773844, 0.0009980967, 0.01128484, 0.00095...","[0.0022727137, 0.0022228332, 0.00018893919, 0....",HAPPY,1.357043,"[Night, Night, Everyone, ., HAPPY, MOTHERS, DA...","[True, True, True, False, True, True, True, Tr...","[0, 1, 2, 3, 4, 4, 4, 5, 5, 5, 6, 7, 8, 9]","[0, 6, 12, 20, 22, 28, 36, 40, 43, 47]",4,4,0.0,0.0,0.0
3232,691f395bee,"Pegel..cape ya keliling kuil..but,totally awes...",awesome!wa,positive,0,"[0.03055521, 0.0004171589, 0.000773601, 0.0040...","[6.388052e-05, 0.00046102278, 0.00046494222, 0...","awesome!was amazed by the temple,culture,ambie...",0.695011,"[Pegel, ., ., cape, ya, keliling, kuil, ., ., ...","[True, False, False, False, True, True, True, ...","[0, 0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 6, 7, 8, 9, ...","[0, 5, 6, 7, 12, 15, 24, 28, 29, 30, 33, 34, 4...",12,14,0.0,0.0,0.0


In [16]:
# Ten random rows where the prediction scores within 0.2 of the rebuilt label.
# random_state is fixed so that re-running the cell shows the same rows.
pred[pred['delta_jaccard']<0.2][['text','sentiment','selected_text','pred','jaccard','label_jaccard','score']].sample(n=10, random_state=42)

Unnamed: 0,text,sentiment,selected_text,pred,jaccard,label_jaccard,score
7958,Movie night with kate,neutral,Movie night with kate,Movie night with kate,1.0,1.0,1.997785
16179,Score two days I get more food stamps good cau...,positive,good,good,1.0,1.0,1.488206
3348,What the heck? Why?? What`s going on?,negative,What the heck?,What the heck?,1.0,1.0,1.71108
22655,nice! I wish twitter would tile mine,positive,nice!,nice!,1.0,1.0,1.632294
2244,Editing all the photos I took at my brothers s...,neutral,Editing all the photos I took at my brothers s...,Editing all the photos I took at my brothers s...,1.0,1.0,1.948877
8366,"Well, yeah. But my mom doesn`t wanna drive all...",neutral,"Well, yeah. But my mom doesn`t wanna drive all...","Well, yeah. But my mom doesn`t wanna drive all...",0.9,0.9,1.951087
8769,"So-so, thanks I`ve uploaded a new campaign vid...",neutral,"So-so, thanks I`ve uploaded a new campaign vid...","So-so, thanks I`ve uploaded a new campaign vid...",1.0,1.0,1.823022
21953,Just got home i love stake and shake milkshakes,positive,love,love,1.0,1.0,1.336289
23187,: yay! it will be good to have you back,positive,good,good,1.0,1.0,1.02429
24947,yaaw some one want call with me?? 5529634599,neutral,yaaw some one want call with me?? 5529634599,yaaw some one want call with me?? 5529634599,1.0,1.0,1.904842


In [18]:
# Ten random rows where the prediction scores more than 0.8 below the rebuilt label.
# random_state is fixed so that re-running the cell shows the same rows.
pred[pred['delta_jaccard']>0.8][['text','sentiment','selected_text','pred','jaccard','label_jaccard','score']].sample(n=10, random_state=42)

Unnamed: 0,text,sentiment,selected_text,pred,jaccard,label_jaccard,score
9720,I really hope you see my tweets. Sent you so m...,positive,please? *prays*,I really hope,0.0,1.0,0.781391
20678,Date Like A Man So You Dont Get Played Like A ...,negative,Date Like A Man So You Dont Get Played Like A ...,****,0.1,1.0,1.211964
25347,is not feelin well... i feel sooooo weak....i ...,negative,weak..,i feel sooooo weak....i hate bein sick in the ...,0.0,1.0,0.733103
8721,Chem wasn`t better than physics. And now I`m s...,negative,Chem wasn`t better than physics. And now I`m s...,tired.,0.1,1.0,0.886198
17204,"lol. When I went to buy my new laptop in Feb.,...",negative,sadfaced,sadfaced.,0.0,1.0,1.404858
10831,I am up and feeling pretty **** gud! I dont ev...,positive,I am up and feeling pretty **** gud! I dont ev...,nice feeling,0.133333,1.0,0.976072
1839,its Monday and all is good,positive,its Monday and all is good,good,0.166667,1.0,1.603872
11246,I can`t believe how tired I am right now... I ...,negative,Exhaaaausted,tired I am right now... I don`t know if I can ...,0.0,1.0,0.73642
4988,sadly awake. wondering about contact info for ...,negative,sadly awake.,sadly awake. wondering about contact info for ...,0.086957,1.0,0.952517
18027,I don`t like the possibility of the left side ...,negative,don`t like,I don`t like the possibility of the left side ...,0.181818,1.0,0.864112


In [18]:
# Look up the 'score' value of the row labelled 5121.
pred.loc[5121, 'score']

1.204892635345459

In [19]:
# Show the pieces that prepare() produced for the row labelled 5121.
pred.loc[5121,'words']

['I',
 'have',
 'such',
 'fantastic',
 'friends',
 ',',
 'including',
 'several',
 'ones',
 'met',
 'through',
 'here',
 '!',
 'thanks',
 'for',
 'being',
 'in',
 'my',
 'life',
 '-',
 'you',
 'are',
 'such',
 'amazing',
 'people',
 '!']

In [22]:
# Show the 'start_pred' array stored for the row labelled 5264.
# NOTE(review): presumably per-token start probabilities from the model — confirm.
pred.loc[5264,'start_pred']

array([9.81185138e-02, 1.41678040e-03, 4.93733585e-03, 1.77191396e-03,
       6.82277530e-02, 8.49719346e-03, 3.49782445e-02, 5.02023160e-01,
       4.99137794e-04, 9.88097279e-04, 2.13963143e-03, 1.15508868e-04,
       1.77697721e-03, 8.86982400e-03, 3.39606334e-03, 3.18561494e-03,
       7.11473497e-03, 2.19332110e-02, 1.46088466e-01, 7.16459937e-04,
       1.19606929e-03, 3.02357739e-03, 2.29963195e-03, 1.07984255e-04,
       7.18443841e-03, 1.12657312e-04, 8.25565308e-03, 4.41139378e-03,
       6.29264468e-05, 4.17986093e-03, 9.03637338e-05, 8.02920293e-03,
       7.13601694e-05, 6.63818559e-03, 1.92632055e-04, 2.70112883e-04,
       3.70642096e-02, 1.51517124e-05, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00], dtype=float32)

In [23]:
# Show the 'end_pred' array stored for the row labelled 5264.
# NOTE(review): presumably per-token end probabilities from the model — confirm.
pred.loc[5264,'end_pred']

array([3.56165925e-04, 2.31539452e-04, 8.73471654e-05, 1.59801613e-03,
       3.96914460e-04, 1.68386148e-04, 2.09067672e-04, 5.45909703e-01,
       1.93442404e-02, 1.57455006e-03, 1.45811267e-04, 4.44929227e-02,
       4.84313816e-03, 5.02101087e-04, 4.03384533e-04, 1.88563732e-04,
       1.12367103e-04, 5.48380078e-04, 1.91551939e-01, 8.76957644e-03,
       1.06053101e-03, 8.19328893e-03, 1.50607055e-04, 2.42818426e-02,
       3.08785646e-04, 9.46327951e-03, 1.39125797e-03, 8.18730987e-05,
       2.96283583e-03, 1.36234943e-04, 2.57435534e-03, 1.20302160e-04,
       2.61232071e-03, 1.66190744e-04, 1.52182893e-03, 9.62494873e-03,
       1.13905616e-01, 9.78956268e-06, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00], dtype=float32)