In [53]:
import pandas as pd
import numpy as np

%matplotlib inline

In [54]:
# Load the prediction frame from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
train = pd.read_pickle('../input/tweet-sentiment-extraction/preds.pkl')

In [55]:
# Drop rows whose `text` is missing; this mutates `train` in place.
train.dropna(subset=['text'], inplace=True)

In [56]:
def get_pos(x, y):
    """Return the index of the first occurrence of ``y`` in ``x``.

    Uses ``str.find`` semantics: the result is -1 when ``y`` does not occur.
    """
    first_index = x.find(y)
    return first_index

def get_extra_space_count(x):
    """Return, for every character of ``x``, the running count of redundant spaces.

    A space is redundant when it directly follows another space or sits at the
    very start of the string. The returned list has one entry per character,
    holding how many redundant spaces have been seen up to and including it.
    """
    running_totals = []
    extra = 0
    after_space = True  # the start of the string behaves like "just saw a space"
    for ch in x:
        is_space = ch == ' '
        if is_space and after_space:
            extra += 1
        running_totals.append(extra)
        after_space = is_space
    return running_totals

In [57]:
# Collapse every whitespace run to a single space; split() with no argument
# already discards leading and trailing whitespace.
train['clean_text'] = train['text'].apply(lambda raw: ' '.join(raw.split()))
# Number of characters removed by the normalisation.
train['len_delta'] = train['text'].str.len() - train['clean_text'].str.len()
# True when the label is more than 90% as long as the tweet.
train['whole'] = train['selected_text'].str.len() / train['text'].str.len() > 0.9

In [58]:
# Strip trailing whitespace before any character offsets are measured.
train['text'] = train['text'].str.rstrip()
train['selected_text'] = train['selected_text'].str.rstrip()

# Per-character running count of redundant spaces in the right-stripped tweet.
train['extra_space'] = train['text'].apply(get_extra_space_count)

# Whitespace-normalised label (runs of whitespace collapsed to one space).
train['clean_st'] = train['selected_text'].apply(lambda label: ' '.join(label.split()))

# Span of the label inside the original text; the start is -1 when it is not found.
train['start_pos_origin'] = train.apply(lambda row: get_pos(row['text'], row['selected_text']), axis=1)
train['end_pos_origin'] = train['start_pos_origin'] + train['selected_text'].str.len()
train['to_end'] = train['end_pos_origin'] >= train['text'].str.len()

# Same span, located inside the whitespace-normalised text.
train['start_pos_clean'] = train.apply(lambda row: get_pos(row['clean_text'], row['clean_st']), axis=1)
train['end_pos_clean'] = train['start_pos_clean'] + train['clean_st'].str.len()

# Redundant-space count at the last character covered by the label.
train['shift'] = train.apply(lambda row: row['extra_space'][row['end_pos_origin'] - 1], axis=1)

In [59]:
def broken_start(x, y):
    """Tell whether position ``y`` in ``x`` falls right after a non-space character.

    Returns True when ``y`` is positive and the character just before it is not
    a space; returns False otherwise (including ``y <= 0``).
    """
    return bool(y > 0 and x[y - 1] != ' ')

def broken_end(x, y):
    """Tell whether the character at position ``y`` in ``x`` is a non-space.

    Returns True when ``y`` is below ``len(x)`` and ``x[y]`` is not a space;
    returns False otherwise (including ``y`` at or past the end of ``x``).
    """
    return bool(y < len(x) and x[y] != ' ')

In [60]:
# Flag labels whose start or end boundary touches a non-space character
# of the whitespace-normalised text.
train['broken_start'] = train.apply(
    lambda row: broken_start(row['clean_text'], row['start_pos_clean']), axis=1
)
train['broken_end'] = train.apply(
    lambda row: broken_end(row['clean_text'], row['end_pos_clean']), axis=1
)
train['broken'] = train['broken_start'] | train['broken_end']

In [61]:
# Distribution of the `shift` values across all rows.
train['shift'].value_counts()

0     12813
1     10739
2      3068
3       613
4       146
5        62
6        20
7         8
8         4
9         3
15        1
12        1
11        1
20        1
Name: shift, dtype: int64

In [62]:
def get_clean_label(x):
    """Return a repaired label for one row, or the original ``selected_text``.

    ``x`` is a mapping-like row providing ``shift``, ``start_pos_clean``,
    ``broken_start``, ``selected_text``, ``text``, ``start_pos_origin`` and
    ``end_pos_origin``. When a repair applies, the label is re-sliced from
    ``text`` with its start moved right by ``shift`` characters and then
    stripped.

    Raises:
        AssertionError: if the repaired label is empty after stripping.
    """
    shift = x['shift']

    # No shift, or the label already begins the cleaned tweet: nothing to repair.
    if shift < 1 or x['start_pos_clean'] == 0:
        return x['selected_text']

    if shift == 1:
        # A shift of 1 is only repaired when the label starts mid-word.
        if not x['broken_start']:
            return x['selected_text']
        end_adjust = 0
    else:
        # Shifts above 1 are repaired unless the label is a single word or its
        # first fragment is longer than the shift (it would still start broken).
        pieces = x['selected_text'].split()
        if len(pieces) == 1 or len(pieces[0]) > shift:
            return x['selected_text']
        end_adjust = shift - 1

    begin = x['start_pos_origin'] + shift
    finish = x['end_pos_origin'] + end_adjust
    new_st = x['text'][begin:finish].strip()
    assert len(new_st) > 0
    return new_st

In [63]:
# Compute the repaired label for every row.
train['new_st'] = train.apply(get_clean_label, axis=1)

In [64]:
# Preview the first rows with all derived columns.
train.head()

Unnamed: 0,textID,text,selected_text,sentiment,kfold,pred,raw_pred,score,whole_pred,clean_text,...,start_pos_origin,end_pos_origin,to_end,start_pos_clean,end_pos_clean,shift,broken_start,broken_end,broken,new_st
0,f7fdea625a,i`m so bored i can barely even tweet. i have n...,bored,negative,0,bored,bored,1.260304,0.068662,i`m so bored i can barely even tweet. i have n...,...,7,12,False,7,12,0,False,False,False,bored
1,c19de2c75b,awwwwwwwwww thats jus...awwwww.did she get t...,awwwwwwwwww thats jus...awwwww.did she get to...,neutral,0,awwwwwwwwww thats jus...awwwww.did she get t...,awwwwwwwwww thats jus...awwwww.did she get t...,1.927268,0.918263,awwwwwwwwww thats jus...awwwww.did she get to ...,...,1,65,True,0,63,2,False,False,False,awwwwwwwwww thats jus...awwwww.did she get to...
2,0f963af18f,I did not twitt yesterday cause it was a very ...,I can not sleep,negative,0,I can not sleep,I can not sleep,0.706122,0.060505,I did not twitt yesterday cause it was a very ...,...,86,101,False,85,100,1,False,False,False,I can not sleep
3,0583c78cc1,Congratulation`s to phil packer on completing ...,Congratulation`s,positive,0,Congratulation`s to phil packer on completing ...,Congratulation`s to phil packer on completing ...,0.820241,0.080782,Congratulation`s to phil packer on completing ...,...,0,16,False,0,16,0,False,False,False,Congratulation`s
4,1cdb444ea5,O`Charleys? Pretty good. Especially when its f...,Pretty good.,positive,0,Pretty good.,Pretty good.,1.386945,0.057058,O`Charleys? Pretty good. Especially when its f...,...,12,24,False,12,24,0,False,False,False,Pretty good.


In [65]:
# Spot-check five random non-neutral rows whose shift is 3.
# NOTE(review): no random_state is passed, so the sampled rows change on every run.
train[(train['shift']==3)&(train['sentiment']!='neutral')][['text', 'selected_text', 'sentiment','pred','raw_pred','shift','new_st']].sample(n=5)

Unnamed: 0,text,selected_text,sentiment,pred,raw_pred,shift,new_st
3416,Jeff can`t get his visa in time to come visit ...,`m so sa,negative,I`m so sad.,I`m so sad.,3,so sad.
8555,New work wellness challenge not going well. I...,Failed on first day. Twic,negative,not going well.,not going well.,3,Failed on first day. Twic
1052,i hope so I KNOW WOO! haha 2 times it was e...,as excitin,positive,as excitin,exciting,3,exciting
27192,"Mooorning! It`s Friday, and that`s terrific! S...",that`s terrific! Smile up! :-|,positive,terrific!,terrific!,3,that`s terrific! Smile up! :-|
15525,May the 4th be with you Happy Star Wars Day...,Happy Star Wars Day !!,positive,Happy,Happy,3,Happy Star Wars Day !!


In [69]:
# Dump every column of the row labelled 2106.
print(train.loc[2106])

textID                                                     47aec7e3be
text                                           omg   i found it  thnx
selected_text                                   omg   i found it  thn
sentiment                                                    positive
kfold                                                               0
pred                                                          t  thnx
raw_pred                                                         thnx
score                                                         1.64235
whole_pred                                                   0.254472
clean_text                                        omg i found it thnx
len_delta                                                           3
whole                                                            True
extra_space         [0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
clean_st                                           omg i found it thn
start_pos_origin    

In [66]:
# Print the text and both predictions wrapped in quotes so that leading and
# trailing whitespace is visible.
loc = 2106
for column in ('text', 'pred', 'raw_pred'):
    print("'" + train.loc[loc, column] + "'")

'omg   i found it  thnx'
't  thnx'
'thnx'


In [22]:
# NOTE(review): this cell repeats the get_extra_space_count definition that
# already appears earlier in the notebook; the two should be consolidated.
def get_extra_space_count(x):
    """Return, for every character of ``x``, the running count of redundant spaces.

    A space is redundant when it directly follows another space or sits at the
    very start of the string. The returned list has one entry per character,
    holding how many redundant spaces have been seen up to and including it.
    """
    running_totals = []
    extra = 0
    after_space = True  # the start of the string behaves like "just saw a space"
    for ch in x:
        is_space = ch == ' '
        if is_space and after_space:
            extra += 1
        running_totals.append(extra)
        after_space = is_space
    return running_totals

In [67]:
# Per-character redundant-space counts for the text of row 2106.
es = get_extra_space_count(train.loc[2106, 'text'])

In [46]:
# Text of the row under inspection; relies on `loc` having been set in an earlier cell.
text = train.loc[loc, 'text']

In [48]:
# NOTE(review): mid-notebook import; consider moving it to the import cell at the top.
from transformers import RobertaTokenizer
# Loads the tokenizer from a relative filesystem path — presumably a local copy
# of the RoBERTa base files; adjust the path for your environment.
tokenizer = RobertaTokenizer.from_pretrained('../../bert_models/roberta_base/')

In [49]:
# Split `text` into "words" (each separator character is its own word, every
# run of other characters is one word), record character offsets, then map
# every tokenizer token back to the index of the word it came from.

# Characters that always form a one-character word of their own.
separators = {' ', '.', ',', '!', '?', '(', ')', ';', ':', '-', '=', '/', '<', '`'}

prev_punc = True
word = []
offset = []
token = []
invert_map = []
for idx, c in enumerate(text):
    if c in separators:
        prev_punc = True
        word.append(c)
        offset.append(idx)
    elif prev_punc:
        # First character after a separator (or at the start) opens a new word.
        word.append(c)
        offset.append(idx)
        prev_punc = False
    else:
        word[-1] += c

# Turn start indices into (start, end) character spans.
offset = [(x, x+len(y)) for x, y in zip(offset, word)]

for word_idx, w in enumerate(word):
    # A word preceded by a space is tokenized with that space prepended.
    if word_idx > 0 and word[word_idx-1] == ' ':
        prefix = ' '
    else:
        prefix = ''
    # Fix: compare the current word `w`, not the whole `word` list; a list never
    # equals ' ', so the original test made this branch unreachable.
    if w == ' ':
        token.append("Ġ")
        invert_map.append(word_idx)
    else:
        for t in tokenizer.tokenize(prefix+w):
            token.append(t)
            invert_map.append(word_idx)

In [52]:
# Inspect the word segments produced from `text`.
word

['omg', ' ', ' ', ' ', 'i', ' ', 'found', ' ', 'it', ' ', ' ', 'thnx']

In [50]:
# Inspect the (start, end) character span of each word segment.
offset

[(0, 3),
 (3, 4),
 (4, 5),
 (5, 6),
 (6, 7),
 (7, 8),
 (8, 13),
 (13, 14),
 (14, 16),
 (16, 17),
 (17, 18),
 (18, 22)]

In [51]:
# Total character count of `text`, for comparison with the last span's end.
len(text)

22

In [68]:
# Redundant-space count at the first character of the last word segment.
es[offset[-1][0]]

3