In [1]:
import pandas as pd

import numpy as np
from transformers import BertTokenizer
from transformers import RobertaTokenizer
from transformers import RobertaTokenizerFast

In [2]:
def clean(x):
    sp_x = x.split()
    if len(sp_x[0]) == 1 and len(sp_x) > 1 and sp_x[0].lower() not in ['i', 'a', 'u'] and not sp_x[0].isdigit():
        return ' '.join(sp_x[1:])
    return x

In [3]:
pred = pd.read_pickle('../input/tweet-sentiment-extraction/preds.pkl')

In [4]:
pred['text'] = pred['text'].apply(lambda x: ' '.join(x.strip().split()))
pred['selected_text'] = pred['selected_text'].apply(
                lambda x: ' '.join(x.strip().split()))
pred['selected_text'] = pred['selected_text'].apply(lambda x: clean(x))

In [5]:
pred['whole'] = pred.apply(lambda x: len(x['selected_text'])/len(x['text'])>0.99, axis=1)

In [6]:
pred.groupby('sentiment')['whole'].mean()

sentiment
negative    0.145997
neutral     0.892147
positive    0.129807
Name: whole, dtype: float64

In [7]:
pred.shape

(27480, 11)

In [8]:
pred[(pred['sentiment']=='neutral')]['whole'].sum()

9918

In [9]:
pred[(pred['sentiment']=='neutral')]['whole'].count()

11117

In [10]:
pred[(pred['sentiment']=='positive')]['whole'].mean()

0.12980657189466324

In [11]:
def prepare(text):
    words = text.split()
    retval, first_char, invert_map = [], [], []
    current_pos = 0
    for w in words:
        word_ret = [""]
        word_invert = [current_pos]
        for p, c in enumerate(w):
            if c in ['.',',','!','?','(',')',';',':','-','=',"/","<","`"]:
                if word_ret[-1]=="":
                    word_ret[-1]+=c
                    word_invert[-1]=current_pos+p
                else:
                    word_ret.append(c)
                    word_invert.append(current_pos+p)
                word_ret.append("")
                word_invert.append(current_pos+p+1)
            else:
                word_ret[-1]+=c
        if len(word_ret[-1])==0:
            word_ret.pop(-1)
            word_invert.pop(-1)
        word_first = [False if i>0 else True for i in range(len(word_ret)) ]
        retval.extend(word_ret)
        first_char.extend(word_first)
        invert_map.extend(word_invert)
        current_pos+=len(w)+1
    assert len(retval)==len(first_char)
    return retval, first_char, invert_map

In [12]:
texts = pred['text'].tolist()
sts = pred['selected_text'].tolist()
tokenizer = RobertaTokenizer.from_pretrained('../../bert_models/roberta_base/')

In [13]:
data = []
for text in texts:
    words, first_char, invert_map = prepare(text)

    tokens, token_invert_map = [], []
    for idx, w in enumerate(words):
        # get tokens
        w = w.replace("'", "\"")
        w = w.replace("`", "'")
        w = w.replace("ï¿½", "")
        if first_char[idx]:
            prefix = " "
        else:
            prefix = ""
        for token in tokenizer.tokenize(prefix+w):
            tokens.append(token)
            token_invert_map.append(idx)
    data.append((words, first_char, tokens,
                 token_invert_map, invert_map))
words, first_chars, tokens, invert_maps, word_invert_maps = zip(*data)
pred['words'] = words

pred['first_char'] = first_chars
pred['invert_map'] = invert_maps  # token id to word id
# word to pos in sentence
pred['word_invert_map'] = word_invert_maps

In [14]:
start_word_idx = []
end_word_idx = []
whole_sentence = []
for idx in range(len(pred)):
    text = texts[idx]
    st = sts[idx]
    word = words[idx]
    invert_map = word_invert_maps[idx]

    temp = np.zeros(len(text))

    end_pos = 0
    start_pos = text.find(st, end_pos)
    first_end = start_pos+len(st)
    temp[start_pos:first_end] = 1

    label = []
    for word_idx, w in enumerate(word):
        if sum(temp[invert_map[word_idx]:invert_map[word_idx]+len(w)]) > 0:
            label.append(word_idx)
    cur_start_word_idx = min(label)
    cur_end_word_idx = max(label)

    start_word_idx.append(cur_start_word_idx)
    end_word_idx.append(cur_end_word_idx)

pred['start'] = start_word_idx
pred['end'] = end_word_idx


In [15]:
def decode(tokens, first_char, start, end):
    retval = ""
    for i in range(start, end+1):
        if first_char[i]:
            retval+= (" "+tokens[i])
        else:
            retval += tokens[i]
    return " ".join(retval.split())

In [16]:
def jaccard_list(l1, l2):
    a = set(l1)
    b = set(l2)
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [17]:
def get_clean_label(x):
    
    words = x['words']
    start = x['start']
    end = x['end']
    label2 = decode(words, x['first_char'], start, end)
    return label2
    
def get_jaccard(x):
    label = x['selected_text'].split()
    label2 = x['label2'].split()
    return jaccard_list(label, label2)
pred['label2'] = pred.apply(lambda x: get_clean_label(x), axis=1)
pred['label_jaccard'] = pred.apply(lambda x: get_jaccard(x), axis=1)

In [18]:
def jaccard_string(s1, s2):
    a = set(s1.lower().split())
    b = set(s2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [20]:
pred.head()

Unnamed: 0,textID,text,selected_text,sentiment,kfold,start_pred,end_pred,pred,score,whole_pred,whole,words,first_char,invert_map,word_invert_map,start,end,label2,label_jaccard
0,f7fdea625a,i`m so bored i can barely even tweet. i have n...,bored,negative,0,,,,,,False,"[i, `, m, so, bored, i, can, barely, even, twe...","[True, False, False, True, True, True, True, T...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 4, 7, 13, 15, 19, 26, 31, 36, 38, 40...",4,4,bored,1.0
1,c19de2c75b,awwwwwwwwww thats jus...awwwww.did she get to ...,awwwwwwwwww thats jus...awwwww.did she get to ...,neutral,0,,,,,,True,"[awwwwwwwwww, thats, jus, ., ., ., awwwww, ., ...","[True, True, True, False, False, False, False,...","[0, 0, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 6, 6, 6, ...","[0, 12, 18, 21, 22, 23, 24, 30, 31, 35, 39, 43...",0,15,awwwwwwwwww thats jus...awwwww.did she get to ...,1.0
2,0f963af18f,I did not twitt yesterday cause it was a very ...,I can not sleep,negative,0,,,,,,False,"[I, did, not, twitt, yesterday, cause, it, was...","[True, True, True, True, True, True, True, Tru...","[0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...","[0, 2, 6, 10, 16, 26, 32, 35, 39, 41, 46, 51, ...",20,23,I can not sleep,1.0
3,0583c78cc1,Congratulation`s to phil packer on completing ...,Congratulation`s,positive,0,,,,,,False,"[Congratulation, `, s, to, phil, packer, on, c...","[True, False, False, True, True, True, True, T...","[0, 0, 0, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9, 10,...","[0, 14, 15, 17, 20, 25, 32, 35, 46, 50, 57, 66...",0,2,Congratulation`s,1.0
4,1cdb444ea5,O`Charleys? Pretty good. Especially when its f...,Pretty good.,positive,0,,,,,,False,"[O, `, Charleys, ?, Pretty, good, ., Especiall...","[True, False, False, False, True, True, False,...","[0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...","[0, 1, 2, 10, 12, 19, 23, 25, 36, 41, 45, 49, ...",4,6,Pretty good.,1.0


In [19]:
pred['jaccard'] = pred.apply(lambda x: jaccard_string(x['selected_text'], x['pred']), axis=1)

AttributeError: ("'float' object has no attribute 'lower'", 'occurred at index 0')

In [None]:
pred.groupby('sentiment')['jaccard'].mean()

In [None]:
pred.groupby('sentiment')['jaccard'].mean()

In [None]:
pred['delta_jaccard'] = pred['label_jaccard']-pred['jaccard']

In [86]:
pred[pred['label_jaccard']<0.5][['text','selected_text','sentiment','label2','pred','label_jaccard']].sample(n=10)

Unnamed: 0,text,selected_text,sentiment,label2,pred,label_jaccard
21540,Good Morning! It`s been a while since I`ve 'ta...,he awesome,positive,the awesome,Good Morning! It`s been a while since I`ve 'ta...,0.333333
9769,watching a youtube vid of sing paranoid live i...,so cut,positive,so cute,cute,0.333333
27314,that would mean me babe! but **** it my name i...,IN COO,positive,FREAKIN COOL,COOL!,0.0
9260,_de_B You have to email her - she only gets wh...,Shame ab,negative,Shame about,Shame,0.333333
10153,"Had a crazy night, lost keys, walked home, mis...",I`m sa,negative,I`m sad,I`m sad,0.333333
23698,_babsi there`s a manual process... but it`s ta...,crappy ms,negative,crappy msft,crappy msft product,0.333333
7932,moving back home today. pro: obnoxiously close...,obnoxiously cl,negative,obnoxiously closer,obnoxiously,0.333333
4146,I`m Kinda sleepy.. I Was up too late texting a...,nice bo,positive,nice boy,nice,0.333333
4997,"oooh...I wish I could`ve gone there today, too...",as looking forwar,positive,was looking forward,wish,0.2
23860,We`re idiots. Ok mostly I was skint but hell I...,re idiot,negative,re idiots,We`re idiots.,0.333333


In [87]:
print(pred[(pred['label_jaccard']>0.5)&(pred['jaccard']<0.5)].shape)
pred[(pred['label_jaccard']>0.5)&(pred['jaccard']<0.5)][['text','selected_text','sentiment','label2','pred','label_jaccard']].sample(n=10)

(7350, 21)


Unnamed: 0,text,selected_text,sentiment,label2,pred,label_jaccard
1575,Everyone on the eastcoast... the rain hates us...,the rain hates us,negative,the rain hates us,hates,1.0
10558,"Once again, I`m dealing with cancer in my dire...",Bugger Cancel sucks!,negative,Bugger Cancel sucks!,sucks!,1.0
857,Ouuchh! I hurt my index finger!! ahhhh,I hurt my index finger!! ahhhh,negative,I hurt my index finger!! ahhhh,hurt,1.0
26594,"Goodmorning twitter, oh my gosh, i woke up soo...","Goodmorning twitter, oh my gosh, i woke up soo...",positive,"Goodmorning twitter, oh my gosh, i woke up soo...",Happy,1.0
3366,good luck with you finals!!!,good luck with you finals!!!,positive,good luck with you finals!!!,good luck,1.0
14707,This will be the worst day ever....graduation,worst day ever...,negative,worst day ever...,worst,1.0
559,it`s not sooo noticable it depends on how you ...,isn`t perfect,negative,isn`t perfect,it`s not sooo noticable it depends on how you ...,1.0
3785,_Nicole wow you are lucky happy birthday!,wow you are lucky,positive,wow you are lucky,lucky happy birthday!,1.0
17465,...hey danny .. did u run already ??? hope you...,?? hope you have a good day ;) i love,positive,?? hope you have a good day ;) i love,i love you,1.0
6622,yea just fun dive..a bit scary sometimes but f...,.a bit scary sometimes but fun.,neutral,.a bit scary sometimes but fun.,yea just fun dive..a bit scary sometimes but f...,1.0


In [119]:
pred[(pred['label_jaccard']>0.5)&(pred['jaccard']<0.1)][['text','selected_text','sentiment','label2','pred','whole_pred']].sample(n=10)

Unnamed: 0,text,selected_text,sentiment,label2,pred,whole_pred
6773,omg me 2 !! Haha i was half an hour late 4 wor...,whoops,negative,whoops,omg me 2 !! Haha i was half an hour late,0.068344
24104,._.; Thanxxx ! Now with that message I just wa...,Thanxxx,positive,Thanxxx,Thanxxx ! Now with that message I just wanna l...,0.092494
15121,wishing I were somewhere else besides here!! D...,Don`t worry this won`t dampen my day and neith...,positive,Don`t worry this won`t dampen my day and neith...,wishing,0.160372
21717,"thats it, rub it in that ur seeing morrissey! ...","thats it, rub it in that ur seeing morrissey! ...",negative,"thats it, rub it in that ur seeing morrissey! ...",cry,0.045176
23605,Congrats Hughesy to you and Holly on the safe ...,Congrats Hughesy to you and Holly on the safe ...,positive,Congrats Hughesy to you and Holly on the safe ...,Congrats,0.091529
20280,boo! I must`ve deleted it from my phone! can`t...,can`t find it in my emails either!,negative,can`t find it in my emails either!,boo!,0.180488
8521,jungle book 2 is sooooo cute.. i have nothing ...,cute..,positive,cute..,cute.,0.052669
2605,nothings better then going out for chinese at ...,with ma fave cousins at,positive,with ma fave cousins at,nothings better,0.055982
21960,think you should catch up on your sleep befor ...,haha goodnight,positive,haha goodnight,goodnight<3,0.040349
8617,Chowder is the ****!!!!!!,****!,negative,****!,****!!!!!!,0.185898


In [89]:
print(pred.loc[27089]['text'])

mom just woke me u[p and i am so mad i was dreaming about shoes she whants me to go to the river it is so stupid


In [90]:
pred[pred['delta_jaccard']<0.2][['text','sentiment','selected_text','pred','jaccard','label_jaccard','whole_pred']].sample(n=10)

Unnamed: 0,text,sentiment,selected_text,pred,jaccard,label_jaccard,whole_pred
23632,"your the voice i hear inside my head, the reas...",neutral,"your the voice i hear inside my head, the reas...","your the voice i hear inside my head, the reas...",1.0,1.0,0.996764
6495,Reading through today`s paper along with some ...,neutral,Reading through today`s paper along with some ...,Reading through today`s paper along with some ...,1.0,1.0,0.985243
12568,"that is definitely an easier way of saying it,...",positive,easier,easier,1.0,1.0,0.178897
21215,is drawing a picture for her bubba,neutral,is drawing a picture for her bubba,is drawing a picture for her bubba,1.0,1.0,0.997497
2823,macbook dying. switching to iphone.,negative,dying.,dying.,1.0,1.0,0.06934
7961,I`m getting more and more followers... look ou...,neutral,I`m getting more and more followers... look ou...,I`m getting more and more followers... look ou...,1.0,1.0,0.996389
18410,Had to cancel subscriptions today to NZZ and E...,neutral,Had to cancel subscriptions today to NZZ and E...,Had to cancel subscriptions today to NZZ and E...,1.0,1.0,0.97845
26303,"back,had a great MAD monday meeting tomorrow",positive,great,great,1.0,1.0,0.11682
20911,- me and my mom`s cars were broken into. I fee...,negative,I feel violated.,I feel violated.,1.0,1.0,0.082451
199,Going to miss my roomie ... We will no longer ...,negative,miss,miss,1.0,1.0,0.068783


In [18]:
pred[pred['delta_jaccard']>0.8][['text','sentiment','selected_text','pred','jaccard','label_jaccard','score']].sample(n=10)

Unnamed: 0,text,sentiment,selected_text,pred,jaccard,label_jaccard,score
9720,I really hope you see my tweets. Sent you so m...,positive,please? *prays*,I really hope,0.0,1.0,0.781391
20678,Date Like A Man So You Dont Get Played Like A ...,negative,Date Like A Man So You Dont Get Played Like A ...,****,0.1,1.0,1.211964
25347,is not feelin well... i feel sooooo weak....i ...,negative,weak..,i feel sooooo weak....i hate bein sick in the ...,0.0,1.0,0.733103
8721,Chem wasn`t better than physics. And now I`m s...,negative,Chem wasn`t better than physics. And now I`m s...,tired.,0.1,1.0,0.886198
17204,"lol. When I went to buy my new laptop in Feb.,...",negative,sadfaced,sadfaced.,0.0,1.0,1.404858
10831,I am up and feeling pretty **** gud! I dont ev...,positive,I am up and feeling pretty **** gud! I dont ev...,nice feeling,0.133333,1.0,0.976072
1839,its Monday and all is good,positive,its Monday and all is good,good,0.166667,1.0,1.603872
11246,I can`t believe how tired I am right now... I ...,negative,Exhaaaausted,tired I am right now... I don`t know if I can ...,0.0,1.0,0.73642
4988,sadly awake. wondering about contact info for ...,negative,sadly awake.,sadly awake. wondering about contact info for ...,0.086957,1.0,0.952517
18027,I don`t like the possibility of the left side ...,negative,don`t like,I don`t like the possibility of the left side ...,0.181818,1.0,0.864112


In [18]:
pred.loc[5121, 'score']

1.204892635345459

In [19]:
pred.loc[5121,'words']

['I',
 'have',
 'such',
 'fantastic',
 'friends',
 ',',
 'including',
 'several',
 'ones',
 'met',
 'through',
 'here',
 '!',
 'thanks',
 'for',
 'being',
 'in',
 'my',
 'life',
 '-',
 'you',
 'are',
 'such',
 'amazing',
 'people',
 '!']

In [22]:
pred.loc[5264,'start_pred']

array([9.81185138e-02, 1.41678040e-03, 4.93733585e-03, 1.77191396e-03,
       6.82277530e-02, 8.49719346e-03, 3.49782445e-02, 5.02023160e-01,
       4.99137794e-04, 9.88097279e-04, 2.13963143e-03, 1.15508868e-04,
       1.77697721e-03, 8.86982400e-03, 3.39606334e-03, 3.18561494e-03,
       7.11473497e-03, 2.19332110e-02, 1.46088466e-01, 7.16459937e-04,
       1.19606929e-03, 3.02357739e-03, 2.29963195e-03, 1.07984255e-04,
       7.18443841e-03, 1.12657312e-04, 8.25565308e-03, 4.41139378e-03,
       6.29264468e-05, 4.17986093e-03, 9.03637338e-05, 8.02920293e-03,
       7.13601694e-05, 6.63818559e-03, 1.92632055e-04, 2.70112883e-04,
       3.70642096e-02, 1.51517124e-05, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00], dtype=float32)

In [23]:
pred.loc[5264,'end_pred']

array([3.56165925e-04, 2.31539452e-04, 8.73471654e-05, 1.59801613e-03,
       3.96914460e-04, 1.68386148e-04, 2.09067672e-04, 5.45909703e-01,
       1.93442404e-02, 1.57455006e-03, 1.45811267e-04, 4.44929227e-02,
       4.84313816e-03, 5.02101087e-04, 4.03384533e-04, 1.88563732e-04,
       1.12367103e-04, 5.48380078e-04, 1.91551939e-01, 8.76957644e-03,
       1.06053101e-03, 8.19328893e-03, 1.50607055e-04, 2.42818426e-02,
       3.08785646e-04, 9.46327951e-03, 1.39125797e-03, 8.18730987e-05,
       2.96283583e-03, 1.36234943e-04, 2.57435534e-03, 1.20302160e-04,
       2.61232071e-03, 1.66190744e-04, 1.52182893e-03, 9.62494873e-03,
       1.13905616e-01, 9.78956268e-06, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00], dtype=float32)