In [1]:
import pandas as pd
import numpy as np

%matplotlib inline

In [2]:
# Load the training rows into `train`; every later cell reads from or adds columns to this frame.
# NOTE(review): the file name suggests fold assignments are already included — confirm.
train = pd.read_csv('../input/tweet-sentiment-extraction/train_folds.csv')

In [3]:
# Remove rows whose `text` is missing so the per-row string helpers below never receive NaN there.
# This mutates `train` in place; only `text` is checked, `selected_text` is not.
train.dropna(subset=['text'], inplace=True)

In [4]:
def get_pos(x, y):
    """Return the index of the first occurrence of ``y`` inside ``x``.

    The lookup is delegated to ``x.find``; for ``str`` inputs that means
    ``-1`` is returned when ``y`` does not occur in ``x``.
    """
    position = x.find(y)
    return position

def get_extra_space_count(x):
    """Return, for every character of ``x``, the running number of extra spaces.

    A space is "extra" when it directly follows another space or when it
    sits at the very start of the string (the scan starts as if a space had
    just been seen). Only the ``' '`` character is considered.

    The result is a list with one integer per character of ``x``; each entry
    is the number of extra spaces seen up to and including that character.
    """
    running_total = 0
    totals = []
    last_was_space = True  # leading spaces are counted as extra
    for ch in x:
        is_space = ch == ' '
        if is_space and last_was_space:
            running_total += 1
        totals.append(running_total)
        last_was_space = is_space
    return totals

In [5]:
# Whitespace-normalised tweet: runs of whitespace collapsed to one space, both ends trimmed.
train['clean_text'] = train['text'].apply(lambda raw: ' '.join(raw.split()))
# How many characters the normalisation removed from the tweet.
train['len_delta'] = train['text'].str.len() - train['clean_text'].str.len()
# Flag rows whose selection is longer than 90% of the (not yet right-stripped) tweet.
train['whole'] = (train['selected_text'].str.len() / train['text'].str.len()).gt(0.9)

In [6]:
# Trailing whitespace is removed from both the tweet and its selection before positions are measured.
train['text'] = train['text'].str.rstrip()
train['selected_text'] = train['selected_text'].str.rstrip()

# Per-character running count of extra spaces in the tweet.
train['extra_space'] = train['text'].apply(get_extra_space_count)

# Selection span inside the original tweet (start is -1 when the selection is not found).
train['clean_st'] = train['selected_text'].apply(lambda st: ' '.join(st.split()))
train['start_pos_origin'] = train.apply(lambda row: row['text'].find(row['selected_text']), axis=1)
train['end_pos_origin'] = train['start_pos_origin'] + train['selected_text'].str.len()
train['to_end'] = train['end_pos_origin'].ge(train['text'].str.len())

# Same span, measured inside the whitespace-normalised tweet.
train['start_pos_clean'] = train.apply(lambda row: row['clean_text'].find(row['clean_st']), axis=1)
train['end_pos_clean'] = train['start_pos_clean'] + train['clean_st'].str.len()

# Extra-space count at the last character of the selection in the original tweet.
train['shift'] = train.apply(lambda row: row['extra_space'][row['end_pos_origin'] - 1], axis=1)

In [7]:
def broken_start(x, y):
    """Tell whether position ``y`` in ``x`` is preceded by a non-space character.

    Returns ``True`` only when ``y`` is greater than zero and the character
    just before it is not ``' '``; otherwise ``False``.
    """
    return bool(y > 0 and x[y - 1] != ' ')

def broken_end(x, y):
    """Tell whether the character at position ``y`` of ``x`` is a non-space.

    Returns ``True`` only when ``y`` is smaller than ``len(x)`` and ``x[y]``
    is not ``' '``; a ``y`` at or past the end of ``x`` gives ``False``.
    """
    return bool(y < len(x) and x[y] != ' ')

In [8]:
# Does the normalised selection begin right after a non-space character?
train['broken_start'] = train.apply(
    lambda row: broken_start(row['clean_text'], row['start_pos_clean']),
    axis=1,
)
# Does the normalised selection end right before a non-space character?
train['broken_end'] = train.apply(
    lambda row: broken_end(row['clean_text'], row['end_pos_clean']),
    axis=1,
)
# A row is "broken" when either edge is broken.
train['broken'] = train['broken_start'] | train['broken_end']

In [9]:
# Distribution of the extra-space count measured at the end of each selection.
# Bracket access is required: `train.shift` would resolve to the DataFrame.shift method, not the column.
train['shift'].value_counts()

0     12813
1     10739
2      3068
3       613
4       146
5        62
6        20
7         8
8         4
9         3
15        1
12        1
11        1
20        1
Name: shift, dtype: int64

In [25]:
def get_clean_label(x):
    """Return the label for one row: the original selection or a re-sliced span.

    ``x`` is a row-like mapping that provides ``shift``, ``start_pos_clean``,
    ``selected_text``, ``broken_start``, ``text``, ``start_pos_origin`` and
    ``end_pos_origin``.

    Rules, in order:
    * ``shift < 1`` or ``start_pos_clean == 0``: the original ``selected_text``.
    * ``shift == 1`` and the start is not broken: the original ``selected_text``.
    * ``shift == 1`` otherwise: ``text[start + 1:end]``, stripped.
    * ``shift > 1``: the original ``selected_text`` when it is a single
      whitespace-delimited token or its first token is longer than ``shift``;
      otherwise ``text[start + shift:end + shift - 1]``, stripped.

    Raises ``AssertionError`` if a re-sliced span turns out to be empty.
    """
    offset = x['shift']
    if offset < 1 or x['start_pos_clean'] == 0:
        return x['selected_text']

    # Do not repair shift == 1 rows whose start is not cut mid-word.
    if offset == 1 and not x['broken_start']:
        return x['selected_text']

    tweet = x['text']
    begin = x['start_pos_origin']
    stop = x['end_pos_origin']

    if offset == 1:
        repaired = tweet[begin + offset:stop].strip()
    else:
        # Rows with shift > 1 should all be repaired, unless the result
        # would still start mid-word.
        tokens = x['selected_text'].split()
        if len(tokens) == 1 or len(tokens[0]) > offset:
            return x['selected_text']
        repaired = tweet[begin + offset:stop + offset - 1].strip()

    assert len(repaired) > 0
    return repaired

In [26]:
# Repaired label for every row, computed row by row.
train['new_st'] = train.apply(get_clean_label, axis=1)

In [27]:
# Share and count of broken starts per (shift, sentiment), for selections not starting at position 0.
(
    train.loc[train['start_pos_clean'] > 0]
    .groupby(['shift', 'sentiment'])['broken_start']
    .agg(['mean', 'count'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
shift,sentiment,Unnamed: 2_level_1,Unnamed: 3_level_1
0,negative,0.04512,2992
0,neutral,0.092742,248
0,positive,0.043856,2417
1,negative,0.106172,1912
1,neutral,0.154286,175
1,positive,0.082378,2355
2,negative,0.635945,434
2,neutral,0.322034,59
2,positive,0.555398,704
3,negative,0.652778,72


In [28]:
# Share and count of near-whole-tweet selections per (shift, sentiment).
(
    train
    .groupby(['shift', 'sentiment'])['whole']
    .agg(['mean', 'count'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
shift,sentiment,Unnamed: 2_level_1,Unnamed: 3_level_1
0,negative,0.152639,4396
0,neutral,0.931582,4721
0,positive,0.123377,3696
1,negative,0.146442,2670
1,neutral,0.921269,4382
1,positive,0.135069,3687
2,negative,0.164666,583
2,neutral,0.921824,1535
2,positive,0.150526,950
3,negative,0.193878,98


## 断头词

In [29]:
# Dump rows with shift >= 2 whose selection starts right after a non-space character,
# does not start at position 0 and does not reach the end of the tweet.
# Each record is: "<shift> <text>", selected_text, new_st, then a 20-dash separator line.
with open('broken_samples.txt','w') as f:
    indexs = train[(train['shift']>=2)&(train['start_pos_clean']>0)&~train.to_end&train.broken_start].index
    for loc in indexs:
        # A space separates the shift from the tweet, matching the other dump cells;
        # without it the number fuses with the text (ambiguous when shift has two digits).
        f.write(str(train.loc[loc,'shift'])+' ')
        f.write(train.loc[loc, 'text'])
        f.write("\n")
        f.write(train.loc[loc, 'selected_text'])
        f.write("\n")
        f.write(train.loc[loc, 'new_st'])
        f.write("\n")
        f.write('-'*20)
        f.write("\n")

In [30]:
# Dump rows with shift >= 2 whose selection does NOT start right after a non-space character,
# does not start at position 0 and does not reach the end of the tweet.
# Each record is: "<shift> <text>", selected_text, new_st, then a 20-dash separator line.
with open('non_broken_samples.txt','w') as f:
    indexs = train[(train['shift']>=2)&(train['start_pos_clean']>0)&~train.to_end&~train.broken_start].index
    for loc in indexs:
        # A space separates the shift from the tweet, matching the other dump cells;
        # without it the number fuses with the text (ambiguous when shift has two digits).
        f.write(str(train.loc[loc,'shift'])+' ')
        f.write(train.loc[loc, 'text'])
        f.write("\n")
        f.write(train.loc[loc, 'selected_text'])
        f.write("\n")
        f.write(train.loc[loc, 'new_st'])
        f.write("\n")
        f.write('-'*20)
        f.write("\n")

In [31]:
# Dump shift == 1 rows with a broken start (selection not at position 0).
# Record layout: "<shift> <text>", selected_text, new_st, 20 dashes — one per line.
with open('1_broken_samples.txt','w') as f:
    indexs = train.loc[
        (train['shift'] == 1) & (train['start_pos_clean'] > 0) & train['broken_start']
    ].index
    for loc in indexs:
        f.writelines([
            str(train.loc[loc, 'shift']) + ' ' + train.loc[loc, 'text'] + "\n",
            train.loc[loc, 'selected_text'] + "\n",
            train.loc[loc, 'new_st'] + "\n",
            '-' * 20 + "\n",
        ])

In [32]:
# Dump shift == 1 rows without a broken start (selection not at position 0).
# Record layout: "<shift> <text>", selected_text, new_st, 20 dashes — one per line.
with open('1_non_broken_samples.txt','w') as f:
    indexs = train.loc[
        (train['shift'] == 1) & (train['start_pos_clean'] > 0) & ~train['broken_start']
    ].index
    for loc in indexs:
        f.writelines([
            str(train.loc[loc, 'shift']) + ' ' + train.loc[loc, 'text'] + "\n",
            train.loc[loc, 'selected_text'] + "\n",
            train.loc[loc, 'new_st'] + "\n",
            '-' * 20 + "\n",
        ])

In [33]:
# Dump shift == 2 rows without a broken start (selection not at position 0).
# Record layout: "<shift> <text>", selected_text, new_st, 20 dashes — one per line.
with open('2_non_broken_samples.txt','w') as f:
    indexs = train.loc[
        (train['shift'] == 2) & (train['start_pos_clean'] > 0) & ~train['broken_start']
    ].index
    for loc in indexs:
        f.writelines([
            str(train.loc[loc, 'shift']) + ' ' + train.loc[loc, 'text'] + "\n",
            train.loc[loc, 'selected_text'] + "\n",
            train.loc[loc, 'new_st'] + "\n",
            '-' * 20 + "\n",
        ])

In [34]:
# Dump shift == 2 rows with a broken start (selection not at position 0).
# Record layout: "<shift> <text>", selected_text, new_st, 20 dashes — one per line.
with open('2_broken_samples.txt','w') as f:
    indexs = train.loc[
        (train['shift'] == 2) & (train['start_pos_clean'] > 0) & train['broken_start']
    ].index
    for loc in indexs:
        f.writelines([
            str(train.loc[loc, 'shift']) + ' ' + train.loc[loc, 'text'] + "\n",
            train.loc[loc, 'selected_text'] + "\n",
            train.loc[loc, 'new_st'] + "\n",
            '-' * 20 + "\n",
        ])

In [35]:
# Dump shift == 3 rows with a broken start (selection not at position 0).
# Record layout: "<shift> <text>", selected_text, new_st, 20 dashes — one per line.
with open('3_broken_samples.txt','w') as f:
    indexs = train.loc[
        (train['shift'] == 3) & (train['start_pos_clean'] > 0) & train['broken_start']
    ].index
    for loc in indexs:
        f.writelines([
            str(train.loc[loc, 'shift']) + ' ' + train.loc[loc, 'text'] + "\n",
            train.loc[loc, 'selected_text'] + "\n",
            train.loc[loc, 'new_st'] + "\n",
            '-' * 20 + "\n",
        ])

In [34]:
# Dump shift == 0 rows with a broken start whose selection neither starts at
# position 0 nor reaches the end of the tweet.
# Record layout: "<shift> <text>", selected_text, new_st, 20 dashes — one per line.
with open('0_broken_samples.txt','w') as f:
    indexs = train.loc[
        (train['shift'] == 0)
        & (train['start_pos_clean'] > 0)
        & ~train['to_end']
        & train['broken_start']
    ].index
    for loc in indexs:
        f.writelines([
            str(train.loc[loc, 'shift']) + ' ' + train.loc[loc, 'text'] + "\n",
            train.loc[loc, 'selected_text'] + "\n",
            train.loc[loc, 'new_st'] + "\n",
            '-' * 20 + "\n",
        ])

In [75]:
# Look up the tweet(s) containing this exact phrase.
# A literal `str.contains` is used because `str.find(...) > 0` would miss a tweet
# that starts with the phrase (match at position 0).
train[train['text'].str.contains("i wanna do your job HAND IT OVER", regex=False, na=False)]

Unnamed: 0,textID,text,selected_text,sentiment,kfold,clean_text,len_delta,whole,extra_space,clean_st,start_pos_origin,end_pos_origin,to_end,start_pos_clean,end_pos_clean,shift,broken_start,broken_end,broken,new_st
25217,12f21c8f19,star wars ............ is **** BOO??? i wanna...,l,positive,4,star wars ............ is **** BOO??? i wanna ...,4,False,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",l,105,106,False,103,104,2,True,False,True,l


In [76]:
# Run the label repair on one row, addressed by its index label 25217.
get_clean_label(train.loc[25217])

'l'

In [89]:
# Show five random positive rows with shift == 2 and a broken start:
# tweet, original selection, repaired selection, then a separator.
indexs = train.loc[
    (train['shift'] == 2)
    & (train['start_pos_clean'] > 0)
    & (train['sentiment'] == 'positive')
    & train['broken_start']
].sample(n=5).index
for loc in indexs:
    print(
        train.loc[loc, 'text'],
        train.loc[loc, 'selected_text'],
        train.loc[loc, 'new_st'],
        '-' * 20,
        sep='\n',
    )

 In my case, it`s going to be exams showing me a thing or two.  I`m very much hopeful about November `09, though.
h hopeful
hopeful
--------------------
rock week  i don`t think danny`s gonna go home this week  he`s never been in the bottom three.. and he`s a really good singer
d he`s a really good singer
he`s a really good singer
--------------------
 I didn`t know you had a blog!!!  That`s so awesome
o awesome
awesome
--------------------
 Yes,all on my own  OH gone to bed after a minor soap marathon. Photo editing night 4 me. Ohhh Inn on the lake, nice food.
e, nice food
nice food.
--------------------
  told you, you would sweep haha :-p
d sweep haha :-p
sweep haha :-p
--------------------


## 有偏移但没断头词

In [58]:
# Show five random positive rows with shift == 2 that are not broken at either edge
# and whose selection does not reach the end of the tweet.
indexs = train.loc[
    (train['shift'] == 2)
    & (train['start_pos_clean'] > 0)
    & ~train['to_end']
    & (train['sentiment'] == 'positive')
    & ~train['broken']
].sample(n=5).index
for loc in indexs:
    print(
        train.loc[loc, 'text'],
        train.loc[loc, 'selected_text'],
        train.loc[loc, 'new_st'],
        '-' * 20,
        sep='\n',
    )

bought my pink ipod nano 2 days ago  will be delivered this week. YAY!  & hopefully getting ears pierced again in a few weeks XD
& hopefully
hopefully
--------------------
  i`ve seen inkheart- i liked it too
i liked
liked
--------------------
 Is it true there won`t be any Fantastic 4 sequels?  I wished they introduced a Franklin Richards character... will they?
I wished
wished
--------------------
 you should say mlia instead of fml  i hope you find it soon
i hope
hope
--------------------
 Thank you  I liked Joe`s clothes` being ripped off and then Kevin screaming, 'Who are you!?!' XD
I liked
liked
--------------------


## 完整句子

In [28]:
# Show ten random non-neutral rows whose selection is (nearly) the whole tweet
# and whose tweet had no whitespace to normalise.
indexs = train.loc[
    train['whole']
    & (train['sentiment'] != 'neutral')
    & (train['len_delta'] == 0)
].sample(n=10).index
for loc in indexs:
    print(
        train.loc[loc, 'text'],
        train.loc[loc, 'sentiment'],
        train.loc[loc, 'selected_text'],
        '-' * 20,
        sep='\n',
    )

I really fancy a frappuccino from Starbucks right now
positive
I really fancy a frappuccino from Starbucks right now
--------------------
My face an my arms. Tragic. Seriously.
negative
My face an my arms. Tragic. Seriously.
--------------------
Happy Mother`s Day, Moms!!! You are wonderful!! Have a great day
positive
Happy Mother`s Day, Moms!!! You are wonderful!! Have a great day
--------------------
good morning. have to get ready to go to the hospital and get a cat scan. best wishes to you.
positive
good morning. have to get ready to go to the hospital and get a cat scan. best wishes to you.
--------------------
bored , cleanin the house
negative
bored , cleanin the house
--------------------
not so good mood..
negative
not so good mood..
--------------------
Im trying to move and get up but it just hurts to much...
negative
Im trying to move and get up but it just hurts to much...
--------------------
iPhone just fell
negative
iPhone just fell
--------------------
been in bed for 

In [18]:
# len_delta distribution among non-neutral rows whose selection is (nearly) the whole tweet.
train.loc[train['whole'] & (train['sentiment'] != 'neutral'), 'len_delta'].value_counts()

0    1115
1     893
2     241
3      51
4       5
5       4
6       2
Name: len_delta, dtype: int64

In [19]:
# len_delta distribution among all non-neutral rows.
train.loc[train['sentiment'] != 'neutral', 'len_delta'].value_counts()

1     6525
0     6502
2     2534
3      536
4      154
5       67
6       23
7        8
8        5
15       2
14       2
10       2
12       1
19       1
9        1
Name: len_delta, dtype: int64

## 下面的不用看

In [195]:
# Print several columns of the row with index label 1309 for manual inspection.
# NOTE(review): no visible cell creates the 'ltext' or 'ltext_strip' columns, so the two
# prints that read them presumably raise KeyError on a fresh kernel — confirm or remove.
loc = 1309	
print(train.loc[loc, 'text'])
print(train.loc[loc, 'ltext'])
print(train.loc[loc, 'ltext_strip'])
print(train.loc[loc, 'selected_text'])
print(train.loc[loc, 'new_st'])
print(train.loc[loc, 'to_end'])

  Those dog pic had me  ROLF!  The one  with the snow stuck too it  has that 'if looks could kill look' way funny
  Those dog pic had me  ROLF!  The one  with the snow stuck too it  has that 'if looks could kill look' way
Those dog pic had me ROLF! The one with the snow stuck too it has that 'if looks could kill look' way
look' way
way funn
False


In [30]:
# Five random rows whose selection starts right after a non-space character.
train.loc[train['broken_start']].sample(n=5)

Unnamed: 0,textID,text,selected_text,sentiment,kfold,clean_text,clean_st,start_pos_origin,end_pos_origin,start_pos_clean,end_pos_clean,broken_start,broken_end,shift,new_st
24887,f804ea3526,No but I just checked and got it LOL. You are...,t LOL. You are ok!,positive,4,No but I just checked and got it LOL. You are ...,t LOL. You are ok!,32,50,31,49,True,False,1,LOL. You are ok!
22400,dc582b9f85,YAY!! that`s so cool aww that woulda been sw...,! I`m just glad,positive,4,YAY!! that`s so cool aww that woulda been swee...,! I`m just glad,68,83,66,81,True,False,2,I`m just glad
3256,028ea6ef68,I have to choose between and _FC on Sunday an...,s. I`m shattered,negative,0,I have to choose between and _FC on Sunday and...,s. I`m shattered,52,68,50,66,True,True,3,I`m shattered.
11964,2f0047bcad,I did laundry tonight too. Guess I can admit...,w that you paved the way.,positive,2,I did laundry tonight too. Guess I can admit t...,w that you paved the way.,54,79,52,77,True,False,2,that you paved the way.
25190,c567514034,hahaha well its try its so ugly,s so ugly,negative,4,hahaha well its try its so ugly,s so ugly,24,33,22,31,True,False,2,so ugly


In [87]:
# For every unbroken shift == 2 row (selection not at position 0), print the tweet,
# the original selection, and the tweet slice moved one character to the right.
for loc in train.loc[
    (train['shift'] == 2) & (train['start_pos_clean'] > 0) & ~train['broken']
].index:
    new_st = train.loc[loc, 'text'][
        train.loc[loc, 'start_pos_origin'] + 1:train.loc[loc, 'end_pos_origin'] + 1
    ]
    print(
        train.loc[loc, 'text'],
        train.loc[loc, 'selected_text'],
        new_st,
        '-' * 20,
        sep='\n',
    )

  - I always appreciate your 'musings.'
I always appreciate
 always appreciate 
--------------------
  Those dog pic had me  ROLF!  The one  with the snow stuck too it  has that 'if looks could kill look' way funny
look' way
ook' way 
--------------------
  oist the 1st tym, di pa ko 18, the 2nd tym, may sakit na siya... don`t blame me
. don`t blame me
 don`t blame me
--------------------
  heheheheh... lol... I always figured he`d send them your way if he got any dupes... I felt bad not sending him stuff...
. I felt bad not sending him
 I felt bad not sending him 
--------------------
  you`re missing out, bb! i`m such a cereal nut, i think i like every kind available.
i like
 like 
--------------------
  I know! they were running out though, Soph had to get a massive one...
a massive
 massive 
--------------------
  she was sleep and then she woke up to check on me see if i was cold or hot i love my mom
i love
 love 
--------------------
  but i love your haaaaair!
i love
 love 
----

In [176]:
# Ten random rows whose repaired label differs from the original selection.
train.loc[
    train['new_st'] != train['selected_text'],
    ['selected_text', 'new_st'],
].sample(n=10)

Unnamed: 0,selected_text,new_st
9414,aw! I`m gonna miss my old username... nick jon...,aw! I`m gonna miss my old username... nick jon...
25849,http://twitpic.com/4w9zb - must... stick.. my....,http://twitpic.com/4w9zb - must... stick.. my....
3182,I`m grateful that my kids also don`t like them...,I`m grateful that my kids also don`t like them...
27356,paramore makes me want to punch babies lol,paramore makes me want to punch babies lol
17567,__Cullen_ lol emmett wont do a dance off again,__Cullen_ lol emmett wont do a dance off again
14203,"Um, why arent episodes 1-14 of Lost season 5 o...","Um, why arent episodes 1-14 of Lost season 5 o..."
22377,feeling sorry for Ian. He broke up with his gf...,feeling sorry for Ian. He broke up with his gf...
8188,my iMac`s died keeps cutting out for no reason.,my iMac`s died keeps cutting out for no reason.
19134,your in london? are you doing any shows here?,your in london? are you doing any shows here?
13157,ha english portfolio!!! i finally finshed you ...,ha english portfolio!!! i finally finshed you ...


In [86]:
# Repaired label of the tweet(s) containing this word.
# A literal `str.contains` is used because `str.find(...) > 0` would miss a tweet
# that starts with the word (match at position 0).
train.loc[train['text'].str.contains('Whaaaaaaaaaat', regex=False, na=False), 'new_st']

11679    B! Super
Name: new_st, dtype: object

In [32]:
# Rows whose normalised tweet contains this word.
# A literal `str.contains` is used because `str.find(...) > 0` would miss a tweet
# that starts with the word (match at position 0).
train[train['clean_text'].str.contains('macaroons', regex=False, na=False)]

Unnamed: 0,textID,text,selected_text,sentiment,kfold,clean_text,clean_st,start_pos_origin,end_pos_origin,start_pos_clean,end_pos_clean,broken_start,broken_end,new_st
20423,09d0f8f088,two macaroons go into a bar....one says oh yo...,wo,positive,3,two macaroons go into a bar....one says oh you...,wo,2,4,1,3,True,False,o


In [33]:
# Print the raw tweet and its selection for the row with index label 20423.
print(
    train.loc[20423, 'text'],
    train.loc[20423, 'selected_text'],
    sep='\n',
)

 two macaroons go into a bar....one says oh your a nut.  wow I need to get out more.
wo


In [165]:
# Rows whose tweet contains this exact phrase.
# A literal `str.contains` is used because `str.find(...) > 0` would miss a tweet
# that starts with the phrase (match at position 0).
train[train['text'].str.contains("Thanks for sharing with your friends!", regex=False, na=False)]

Unnamed: 0,textID,text,selected_text,sentiment,kfold,clean_text,clean_st,start_pos_origin,end_pos_origin,start_pos_clean,end_pos_clean,broken_start,broken_end,shift,new_st,broken,ltext,ltext_strip,to_end
13020,792063a20e,_Geronimo Thanks for sharing with your ...,imo,positive,2,_Geronimo Thanks for sharing with your friends!,imo,10,13,6,9,True,False,4,,True,_Geronimo,_Geronimo,False
