In [1]:
import pandas as pd

import numpy as np
from transformers import BertTokenizer
from transformers import RobertaTokenizer
from transformers import RobertaTokenizerFast

In [2]:
def clean(x):
    """Drop a stray one-character first token from ``x``.

    The string is split on whitespace. If it has at least two tokens and the
    first token is a single character that is neither a digit nor one of
    ``i`` / ``a`` / ``u`` (case-insensitive), the remaining tokens are
    returned joined by single spaces. Otherwise ``x`` is returned unchanged.

    Args:
        x: The string to clean.

    Returns:
        The cleaned string, or ``x`` itself when nothing is dropped.
    """
    sp_x = x.split()
    # Empty / whitespace-only input has no first token to inspect, and a
    # single-token string is never shortened; return both unchanged instead
    # of indexing into an empty list.
    if len(sp_x) < 2:
        return x
    head = sp_x[0]
    if len(head) == 1 and head.lower() not in ('i', 'a', 'u') and not head.isdigit():
        return ' '.join(sp_x[1:])
    return x

In [3]:
# Load the pickled predictions DataFrame (presumably out-of-fold model
# predictions, given the `kfold` column shown further down -- confirm).
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
pred = pd.read_pickle('../input/tweet-sentiment-extraction/preds.pkl')

In [4]:
# Collapse every run of whitespace to a single space and trim both ends
# (str.split() with no arguments already ignores leading/trailing whitespace),
# for the full text and for the selected span alike.
for column in ('text', 'selected_text'):
    pred[column] = pred[column].apply(lambda s: ' '.join(s.split()))
# Then drop a stray one-character leading token from the selected span.
pred['selected_text'] = pred['selected_text'].apply(clean)

In [5]:
def prepare(text):
    """Split ``text`` into word/punctuation pieces with character offsets.

    The text is split on whitespace. Inside each word, every character from a
    fixed punctuation set becomes its own piece, and each maximal run of other
    characters becomes one piece.

    Offsets are found by locating each word in ``text`` rather than by
    assuming exactly one space between words, so they stay correct when the
    text has leading or repeated whitespace. For single-space-separated text
    the result is the same as counting ``len(word) + 1`` per word.

    Args:
        text: The string to split.

    Returns:
        A tuple ``(retval, first_char, invert_map)`` of equal-length lists:
        ``retval`` holds the pieces, ``first_char[i]`` is True only for the
        first piece of each whitespace-delimited word, and ``invert_map[i]``
        is the index in ``text`` at which piece ``i`` starts.
    """
    punctuation = {'.', ',', '!', '?', '(', ')', ';', ':', '-', '=', '/', '<', '`'}
    retval, first_char, invert_map = [], [], []
    cursor = 0
    for w in text.split():
        # Everything between `cursor` and the next word is whitespace, so the
        # first match at or after `cursor` is this word's real position.
        word_start = text.index(w, cursor)
        cursor = word_start + len(w)

        pieces, offsets = [], []
        run_start = None  # start (within w) of the current non-punctuation run
        for p, c in enumerate(w):
            if c in punctuation:
                if run_start is not None:
                    # Flush the run that this punctuation character ends.
                    pieces.append(w[run_start:p])
                    offsets.append(word_start + run_start)
                    run_start = None
                pieces.append(c)
                offsets.append(word_start + p)
            elif run_start is None:
                run_start = p
        if run_start is not None:
            pieces.append(w[run_start:])
            offsets.append(word_start + run_start)

        retval.extend(pieces)
        first_char.extend(i == 0 for i in range(len(pieces)))
        invert_map.extend(offsets)
    assert len(retval) == len(first_char)
    return retval, first_char, invert_map

In [6]:
# Plain Python lists of the normalised tweet text and of the selected span,
# one entry per row of `pred`.
texts, sts = (pred[column].tolist() for column in ('text', 'selected_text'))
# RoBERTa tokenizer loaded from a local directory (path is relative to the
# notebook's working directory).
tokenizer = RobertaTokenizer.from_pretrained('../../bert_models/roberta_base/')

In [7]:
# For every tweet: split it into word/punctuation pieces with prepare(), run
# the tokenizer over each piece, and remember which piece every token came
# from.
data = []
for text in texts:
    words, first_char, invert_map = prepare(text)

    tokens = []
    token_invert_map = []
    for idx, (w, starts_word) in enumerate(zip(words, first_char)):
        # Character substitutions applied before tokenizing; the pieces kept
        # in `words` are left as they are.
        normalized = w.replace("'", "\"").replace("`", "'").replace("ï¿½", "")
        # Only the first piece of a whitespace-delimited word gets a leading
        # space in front of it.
        prefix = " " if starts_word else ""
        pieces = tokenizer.tokenize(prefix + normalized)
        tokens.extend(pieces)
        token_invert_map.extend([idx] * len(pieces))
    data.append((words, first_char, tokens, token_invert_map, invert_map))

# Transpose the per-row tuples into one tuple per field.
words, first_chars, tokens, invert_maps, word_invert_maps = zip(*data)

pred['words'] = words
pred['first_char'] = first_chars
# token index -> index of the piece it was produced from
pred['invert_map'] = invert_maps
# piece index -> character offset of that piece in the text
pred['word_invert_map'] = word_invert_maps

In [8]:
# Convert each row's selected_text into a (first piece, last piece) index
# pair: find the span's character range in the text, then keep every piece
# produced by prepare() whose character range overlaps it.
start_word_idx = []
end_word_idx = []
whole_sentence = []  # never filled in this cell; kept in case later cells use it
for idx in range(len(pred)):
    text = texts[idx]
    st = sts[idx]
    word = words[idx]
    invert_map = word_invert_maps[idx]

    start_pos = text.find(st)
    if start_pos < 0:
        # str.find returns -1 when the span is absent. Using -1 as a slice
        # start used to either blow up later in min() or silently label the
        # last word, so fail loudly and say which row is affected.
        raise ValueError(
            f"row {idx}: selected_text {st!r} is not a substring of text {text!r}")
    first_end = start_pos + len(st)

    # A piece covering [piece_start, piece_start + len(w)) overlaps the span
    # [start_pos, first_end) when the two half-open ranges intersect.
    label = [
        word_idx
        for word_idx, w in enumerate(word)
        if max(invert_map[word_idx], start_pos)
        < min(invert_map[word_idx] + len(w), first_end)
    ]
    if not label:
        raise ValueError(
            f"row {idx}: selected_text {st!r} does not overlap any word of text {text!r}")

    start_word_idx.append(min(label))
    end_word_idx.append(max(label))

pred['start'] = start_word_idx
pred['end'] = end_word_idx


In [9]:
def decode(tokens, first_char, start, end):
    """Join ``tokens[start..end]`` (inclusive) back into a string.

    A space is inserted before every token whose ``first_char`` flag is
    truthy; other tokens are glued directly onto the previous one. The result
    is whitespace-normalised: single spaces, no leading or trailing space.
    """
    parts = []
    for i in range(start, end + 1):
        if first_char[i]:
            parts.append(" ")
        parts.append(tokens[i])
    joined = "".join(parts)
    return " ".join(joined.split())

In [10]:
def jaccard_list(l1, l2):
    """Jaccard similarity between the sets of items in ``l1`` and ``l2``.

    Args:
        l1: Iterable of hashable items.
        l2: Iterable of hashable items.

    Returns:
        ``|A & B| / |A | B|`` as a float, where ``A`` and ``B`` are the sets
        built from the inputs. Two empty inputs are treated as identical and
        score 1.0.
    """
    a = set(l1)
    b = set(l2)
    union_size = len(a | b)
    if union_size == 0:
        # Both inputs are empty: the ratio would be 0 / 0, which used to
        # raise ZeroDivisionError.
        return 1.0
    return float(len(a & b)) / union_size

In [11]:
def get_jaccard(x):
    """Jaccard score between a row's selected_text and its rebuilt word span.

    ``x`` is one row of ``pred``. The pieces ``x['words'][start..end]`` are
    joined back into text with decode() and compared, as whitespace-separated
    word sets, against ``x['selected_text']``.
    """
    reference = x['selected_text'].split()
    rebuilt = decode(x['words'], x['first_char'], x['start'], x['end'])
    return jaccard_list(reference, rebuilt.split())
# Row-wise score of how well the start/end piece indices reproduce the label.
pred['label_jaccard'] = pred.apply(get_jaccard, axis=1)

In [12]:
def jaccard_string(s1, s2):
    """Case-insensitive word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace, and the resulting
    word sets are compared.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        ``|A & B| / |A | B|`` as a float. Two strings without any words
        (empty or whitespace-only) are treated as identical and score 1.0.
    """
    a = set(s1.lower().split())
    b = set(s2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Neither string contains a word: the ratio would be 0 / 0, which
        # used to raise ZeroDivisionError.
        return 1.0
    return float(len(a & b)) / union_size

In [13]:
# Word-level Jaccard between each row's `selected_text` and its `pred` column.
pred['jaccard'] = pred.apply(
    lambda row: jaccard_string(row['selected_text'], row['pred']),
    axis=1,
)

In [14]:
# Preview the first rows to check the columns computed in the cells above.
pred.head()

Unnamed: 0,textID,text,selected_text,sentiment,kfold,start_pred,end_pred,pred,score,words,first_char,invert_map,word_invert_map,start,end,label_jaccard,jaccard
0,f7fdea625a,i`m so bored i can barely even tweet. i have n...,bored,negative,0,"[0.347703, 0.000886331, 0.001373101, 0.0729962...","[7.24877e-05, 6.3530555e-05, 0.0004330402, 0.0...",bored,1.205407,"[i, `, m, so, bored, i, can, barely, even, twe...","[True, False, False, True, True, True, True, T...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 4, 7, 13, 15, 19, 26, 31, 36, 38, 40...",4,4,1.0,1.0
1,c19de2c75b,awwwwwwwwww thats jus...awwwww.did she get to ...,awwwwwwwwww thats jus...awwwww.did she get to ...,neutral,0,"[0.9957065, 0.00022696702, 2.7850607e-05, 1.21...","[4.615283e-06, 5.261232e-06, 4.9657247e-06, 6....",awwwwwwwwww thats jus...awwwww.did she get to ...,1.957682,"[awwwwwwwwww, thats, jus, ., ., ., awwwww, ., ...","[True, True, True, False, False, False, False,...","[0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 6, 6, 6, ...","[0, 12, 18, 21, 22, 23, 24, 30, 31, 35, 39, 43...",0,15,1.0,1.0
2,0f963af18f,I did not twitt yesterday cause it was a very ...,I can not sleep,negative,0,"[0.09947917, 0.0034957984, 0.0021510508, 0.002...","[0.00034915112, 0.0003514515, 0.0006098294, 0....",I can not sleep I can not sleep I`m like in z...,1.616367,"[I, did, not, twitt, yesterday, cause, it, was...","[True, True, True, True, True, True, True, Tru...","[0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...","[0, 2, 6, 10, 16, 26, 32, 35, 39, 41, 46, 51, ...",20,23,1.0,0.444444
3,0583c78cc1,Congratulation`s to phil packer on completing ...,Congratulation`s,positive,0,"[0.67805344, 0.0010435485, 0.0007062184, 0.000...","[0.0012406113, 0.0075232503, 0.07546322, 0.016...",Congratulation`s Congratulation`s to phil pac...,1.781777,"[Congratulation, `, s, to, phil, packer, on, c...","[True, False, False, True, True, True, True, T...","[0, 0, 0, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9, 10,...","[0, 14, 15, 17, 20, 25, 32, 35, 46, 50, 57, 66...",0,2,1.0,0.083333
4,1cdb444ea5,O`Charleys? Pretty good. Especially when its f...,Pretty good.,positive,0,"[0.046210214, 0.00013773396, 0.00040426903, 0....","[5.95386e-05, 6.303577e-05, 7.8312936e-05, 0.0...",Pretty good.,1.373376,"[O, `, Charleys, ?, Pretty, good, ., Especiall...","[True, False, False, False, True, True, False,...","[0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...","[0, 1, 2, 10, 12, 19, 23, 25, 36, 41, 45, 49, ...",4,6,1.0,1.0


In [15]:
# Gap between the label-reconstruction score and the prediction score; a
# positive value means `label_jaccard` is higher than `jaccard` for that row.
pred['delta_jaccard'] = pred['label_jaccard'].sub(pred['jaccard'])

In [16]:
# Sample up to 10 rows whose delta_jaccard is below 0.2.
# random_state is fixed so a re-run shows the same rows, and n is capped at
# the number of matching rows so a small selection does not raise.
small_delta = pred[pred['delta_jaccard'] < 0.2]
small_delta[['text', 'sentiment', 'selected_text', 'pred', 'jaccard', 'label_jaccard', 'score']].sample(
    n=min(10, len(small_delta)), random_state=42)

Unnamed: 0,text,sentiment,selected_text,pred,jaccard,label_jaccard,score
24289,My wiki project is going to have to wait anoth...,neutral,My wiki project is going to have to wait anoth...,My wiki project is going to have to wait anoth...,1.0,1.0,1.96201
609,"don`t wait on Red, could be ages. I know what ...",neutral,"don`t wait on Red, could be ages. I know what ...","don`t wait on Red, could be ages. I know what ...",1.0,1.0,1.98527
18339,"Sons of ****, why couldn`t they put them on th...",negative,"Sons of ****,","Sons of ****,",1.0,1.0,1.336935
3886,sure ill follow you hun ohh thank you are you ...,positive,thank you,thank you,1.0,1.0,1.20201
8160,I have a great house to sell you! Comes comple...,positive,great,great,1.0,1.0,1.167444
18943,Sickkkk ....and I need 11/20--NOW.,negative,Sickkkk,Sickkkk,1.0,1.0,1.816747
19358,Preston stayed home and I don`t know why,neutral,Preston stayed home and I don`t know why,Preston stayed home and I don`t know why,1.0,1.0,1.99744
12300,Kicking Back In The Holidays No More School fo...,neutral,Kicking Back In The Holidays No More School fo...,Kicking Back In The Holidays No More School fo...,1.0,1.0,1.998184
23849,"Itï¿½s a beautiful blog, but Iï¿½m not able to...",neutral,"Itï¿½s a beautiful blog, but Iï¿½m not able to...","Itï¿½s a beautiful blog, but Iï¿½m not able to...",1.0,1.0,1.905221
26948,supposed to be great weather today & 2moro; ju...,neutral,supposed to be great weather today & 2moro; ju...,supposed to be great weather today & 2moro; ju...,1.0,1.0,1.913535


In [17]:
# Sample up to 10 rows whose delta_jaccard is above 0.8, i.e. rows where
# `label_jaccard` exceeds `jaccard` by more than 0.8.
# random_state is fixed so a re-run shows the same rows, and n is capped at
# the number of matching rows so a small selection does not raise.
large_delta = pred[pred['delta_jaccard'] > 0.8]
large_delta[['text', 'sentiment', 'selected_text', 'pred', 'jaccard', 'label_jaccard', 'score']].sample(
    n=min(10, len(large_delta)), random_state=42)

Unnamed: 0,text,sentiment,selected_text,pred,jaccard,label_jaccard,score
2778,wow! that is an *awesome* review; carry it wit...,positive,*awesome*,*awesome* *awesome* review; carry it with you...,0.071429,1.0,1.440395
16995,start again... so now theres 6 people coming w...,negative,short stack.,but still only 3 for ss,0.0,1.0,1.097829
994,is sooo stressed about everything I have to do,negative,is sooo stressed about everything I have to do,stressed,0.111111,1.0,1.458515
276,"I know, It`s so crazy! I love using it too Hav...",positive,"I know, It`s so crazy! I love using it too Hav...",I love,0.105263,1.0,1.036791
20499,Have $#@&!!! Malware on my work PC! I miss hav...,negative,I miss having an IT dept,miss Malware on my work PC! I miss,0.181818,1.0,1.52176
26296,U no that little prob with ur twitter that hap...,positive,mayb this will help u fix it,help,0.142857,1.0,1.271648
5264,they are all over one is a fan with a vip and ...,positive,winner,fan,0.0,1.0,1.047755
24183,Just started feeling bad again ugh. I hate it ...,negative,started feeling bad,bad again ugh. I hate it when I don`t feel go...,0.083333,1.0,1.081932
13181,thanks gail imma try this one day! looks yummy...,positive,looks yummy,thanks gail imma try this one day! looks yummy!,0.1,1.0,1.056089
6130,give them my best!,positive,best,give them my best!,0.0,1.0,1.425461


In [18]:
# Look up the `score` value of the row labelled 5121.
pred.loc[5121, 'score']

1.204892635345459

In [19]:
# Show the word/punctuation pieces stored in `words` for the row labelled 5121.
pred.loc[5121,'words']

['I',
 'have',
 'such',
 'fantastic',
 'friends',
 ',',
 'including',
 'several',
 'ones',
 'met',
 'through',
 'here',
 '!',
 'thanks',
 'for',
 'being',
 'in',
 'my',
 'life',
 '-',
 'you',
 'are',
 'such',
 'amazing',
 'people',
 '!']

In [22]:
# Show the `start_pred` array of the row labelled 5264 (presumably per-token
# start probabilities, with the trailing zeros being padding -- confirm).
pred.loc[5264,'start_pred']

array([9.81185138e-02, 1.41678040e-03, 4.93733585e-03, 1.77191396e-03,
       6.82277530e-02, 8.49719346e-03, 3.49782445e-02, 5.02023160e-01,
       4.99137794e-04, 9.88097279e-04, 2.13963143e-03, 1.15508868e-04,
       1.77697721e-03, 8.86982400e-03, 3.39606334e-03, 3.18561494e-03,
       7.11473497e-03, 2.19332110e-02, 1.46088466e-01, 7.16459937e-04,
       1.19606929e-03, 3.02357739e-03, 2.29963195e-03, 1.07984255e-04,
       7.18443841e-03, 1.12657312e-04, 8.25565308e-03, 4.41139378e-03,
       6.29264468e-05, 4.17986093e-03, 9.03637338e-05, 8.02920293e-03,
       7.13601694e-05, 6.63818559e-03, 1.92632055e-04, 2.70112883e-04,
       3.70642096e-02, 1.51517124e-05, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00], dtype=float32)

In [23]:
# Show the `end_pred` array of the row labelled 5264 (presumably per-token
# end probabilities, with the trailing zeros being padding -- confirm).
pred.loc[5264,'end_pred']

array([3.56165925e-04, 2.31539452e-04, 8.73471654e-05, 1.59801613e-03,
       3.96914460e-04, 1.68386148e-04, 2.09067672e-04, 5.45909703e-01,
       1.93442404e-02, 1.57455006e-03, 1.45811267e-04, 4.44929227e-02,
       4.84313816e-03, 5.02101087e-04, 4.03384533e-04, 1.88563732e-04,
       1.12367103e-04, 5.48380078e-04, 1.91551939e-01, 8.76957644e-03,
       1.06053101e-03, 8.19328893e-03, 1.50607055e-04, 2.42818426e-02,
       3.08785646e-04, 9.46327951e-03, 1.39125797e-03, 8.18730987e-05,
       2.96283583e-03, 1.36234943e-04, 2.57435534e-03, 1.20302160e-04,
       2.61232071e-03, 1.66190744e-04, 1.52182893e-03, 9.62494873e-03,
       1.13905616e-01, 9.78956268e-06, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00], dtype=float32)