In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from transformers import RobertaTokenizer

In [2]:
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))

    for new_start in range(input_start, input_end + 1):
        for new_end in range(input_end, new_start - 1, -1):
            text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
            if text_span == tok_answer_text:
                return (new_start, new_end)

    return (input_start, input_end)

In [3]:
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')

In [4]:
tokenizer = BertTokenizer.from_pretrained('../../bert_models/bert_base_uncased/')

In [6]:
tokenizer = BertTokenizer.from_pretrained('../../bert_models/chinese_roberta_large/')

In [8]:
tokenizer.tokenize('a你b妹')

['a', '你', 'b', '妹']

In [7]:
train.shape

(27481, 4)

In [8]:
train.dropna(subset=['text','selected_text'], how='any', inplace=True)

In [9]:
train.shape

(27480, 4)

In [10]:
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [11]:
train['text'] = train['text'].apply(lambda x: ' '.join(x.strip().split()))
train['selected_text'] = train['selected_text'].apply(lambda x: ' '.join(x.strip().split()))

In [12]:
def clean(x):
    sp_x = x.split()
    if sp_x[0] in [',','.','!','?',':',';'] and len(sp_x)>1:
        print(x, '|', ' '.join(sp_x[1:]))
        return ' '.join(sp_x[1:])
    return x

In [13]:
train['selected_text'] = train['selected_text'].apply(lambda x: clean(x))

, sorry guys | sorry guys
, he was soooo friendly. | he was soooo friendly.
! I`m scared of him | I`m scared of him
. Tomorrow will be tough! | Tomorrow will be tough!
. Sorry | Sorry
. Okay kool... I might be touring all summer long but we can make it happen! | Okay kool... I might be touring all summer long but we can make it happen!
! Gonna scare Rachel with Quarantine tonight, this shall be fun | Gonna scare Rachel with Quarantine tonight, this shall be fun
. He gives Twitter tips. Hope this helps | He gives Twitter tips. Hope this helps
. I hope Avalina isn`t a dud! | I hope Avalina isn`t a dud!
, Thx | Thx
: I ADMIRE YOU! you`re amazing! you inspire me to write <3 | I ADMIRE YOU! you`re amazing! you inspire me to write <3
, enjoy | enjoy
. It wasn`t all that great but Channing Tatum is amazing! | It wasn`t all that great but Channing Tatum is amazing!
! sadly | sadly
? American Pie FOREVER. I`m watching it right now. They`re running naked around on the Streets! Would you do that?

In [14]:
train['start_pos'] = train.apply(lambda x: x['text'].find(x['selected_text']), axis=1)
train['end_pos'] = train.apply(lambda x: x['start_pos']+len(x['selected_text']), axis=1)

In [15]:
def contains(a, b):
    for i in range(0, len(a)-len(b)+1):
        flag = True
        for j in range(0, len(b)):
            if a[i+j]!=b[j]:
                flag = False
                break
        if flag:
            return True, i
    return False, -1

In [16]:
data = []
improve_count = 0
for text, sp, ep, st in zip(train['text'].tolist(), train['start_pos'].tolist(), train['end_pos'].tolist(),
                           train['selected_text'].tolist()):
    split_text = text.split()
    tokens, labels, invert_map, first_token, in_st = [], [], [], [], []
    st_tokens = tokenizer.tokenize(' '+st)
    cur_length = 0
    
    # token in selected_tokens
    temp = np.zeros(len(text))
    temp[sp:ep]=1
    for idx, w in enumerate(split_text):
        if sum(temp[cur_length+idx:cur_length+idx+len(w)])>0:
            started = True# space
        else:
            started = False
        for idx2, token in enumerate(tokenizer.tokenize(w)):
            first_token.append(True if idx2==0 else False)
            tokens.append(token)
            invert_map.append(idx)
            if started:
                labels.append(len(tokens)-1)
                if token in st_tokens:
                    in_st.append(1)
                else:
                    in_st.append(0)
            else:
                in_st.append(-100)
        cur_length+=len(w)
    start_token_idx = min(labels)
    end_token_idx = max(labels)
    start_token_idx, end_token_idx = _improve_answer_span(tokens, start_token_idx, end_token_idx,
                                                         tokenizer, st)
    
    token_contain, token_sp = contains(tokens, st_tokens)
    if token_contain:
        start_token_idx = token_sp
        end_token_idx = token_sp+len(st_tokens)-1
    
    start_word_idx = invert_map[start_token_idx]
    end_word_idx = invert_map[end_token_idx]+1
    assert ' '.join(split_text[start_word_idx:end_word_idx]).lower().find(st.lower())>=0
    
    data.append((tokens, start_token_idx, end_token_idx, invert_map, first_token, in_st))

In [17]:
tokens, start, end, invert_map, first_token, in_st = zip(*data)

In [18]:
len(tokens)

27480

In [19]:
train['tokens'] = tokens

In [20]:
train['start'] = start
train['end'] = end
train['invert_map'] = invert_map
train['first_token'] = first_token
train['in_st'] = in_st

In [21]:
senti2label = {
    'positive':2,
    'negative':0,
    'neutral':1
}
train['senti_label']=train['sentiment'].apply(lambda x: senti2label[x])

In [22]:
train.reset_index(drop=True, inplace=True)

In [23]:
train.head()

Unnamed: 0,textID,text,selected_text,sentiment,start_pos,end_pos,tokens,start,end,invert_map,first_token,in_st,senti_label
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,0,35,"[i, `, d, have, responded, ,, if, i, were, going]",0,9,"[0, 0, 0, 1, 2, 2, 3, 4, 5, 6]","[True, False, False, True, True, False, True, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,0,8,"[soo, ##o, sad, i, will, miss, you, here, in, ...",0,2,"[0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9]","[True, False, True, True, True, True, True, Tr...","[1, 1, 1, -100, -100, -100, -100, -100, -100, ...",0
2,088c60f138,my boss is bullying me...,bullying me,negative,11,22,"[my, boss, is, bullying, me, ., ., .]",3,4,"[0, 1, 2, 3, 4, 4, 4, 4]","[True, True, True, True, True, False, False, F...","[-100, -100, -100, 1, 1, 0, 0, 0]",0
3,9642c003ef,what interview! leave me alone,leave me alone,negative,16,30,"[what, interview, !, leave, me, alone]",3,5,"[0, 1, 1, 2, 3, 4]","[True, True, False, True, True, True]","[-100, -100, -100, 1, 1, 1]",0
4,358bd9e861,"Sons of ****, why couldn`t they put them on th...","Sons of ****,",negative,0,13,"[sons, of, *, *, *, *, ,, why, couldn, `, t, t...",0,6,"[0, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4, 5, 6, 7, 8, ...","[True, True, True, False, False, False, False,...","[1, 1, 1, 1, 1, 1, 1, -100, -100, -100, -100, ...",0


In [24]:
from sklearn.model_selection import StratifiedKFold

In [25]:
train.shape

(27480, 13)

In [26]:
roberta_localtest = pd.read_pickle('../input/localtest_roberta.pkl')

In [27]:
local_test = train[train['textID'].isin(roberta_localtest['textID'].tolist())]

In [28]:
local_train = train[~train['textID'].isin(roberta_localtest['textID'].tolist())]
local_train.reset_index(drop=True, inplace=True)

In [29]:
kf = StratifiedKFold(n_splits=5)

In [30]:
local_train['fold'] = 0
for fold, (train_idx, valid_idx) in enumerate(kf.split(local_train, local_train['senti_label'])):
    local_train.loc[valid_idx, 'fold'] = fold

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [31]:
local_train['fold'].value_counts()

0    4801
3    4800
2    4800
1    4800
4    4799
Name: fold, dtype: int64

In [32]:
local_train.to_pickle('../input/train_bert.pkl')

In [33]:
local_test.to_pickle('../input/localtest_bert.pkl')