# Import Library

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import *
import tokenizers
import math
from copy import deepcopy as dc
import gc
print('TF version',tf.__version__)

TF version 2.2.0


# Import Pre-trained roverta model

In [3]:
MAX_LEN = 96
PATH = '../1.Data/'
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=PATH+'vocab-roberta-base.json', 
    merges_file=PATH+'merges-roberta-base.txt', 
    lowercase=True,
    add_prefix_space=True
)
EPOCHS = 1 # originally 3
BATCH_SIZE = 32 # originally 32
PAD_ID = 1
SEED = 88888
LABEL_SMOOTHING = 0.1
tf.random.set_seed(SEED)
np.random.seed(SEED)
sentiment_id = {'positive': 1313, 'neutral': 7974, 'negative': 2430}

# Data Augumentation

In [4]:
train = pd.read_csv('../1.Data/train.csv').fillna('')
n = train.shape[0]

 token "_________________________________" will be used to replace the selected_text in the text during modifications, so that we don't lose track.

In [5]:
from nltk.corpus import wordnet, stopwords
stop = stopwords.words('english')
stop += ["_________________________________", "u"]
import string
punct = list(string.punctuation)
punct.remove("-")
punct.append(" ")

For each word of a sentence that isn't a stopword, we will randomly choose between itself and all his synonyms, in order to replace it in the modified sentence. The first function is for getting synonyms of a word, and probabilities of selecting each one of them when we will randomly rebuild a modified sentence.

# Word Stemming

In [6]:
import nltk

def get_synonyms(word):
    """
    Get synonyms of a word
    """
    stop_words = set(stopwords.words('english')) 
    
    if word.lower() in stop:
        return [word], [1]
    
    synonyms = set()
    for syn in wordnet.synsets(word): 
        for l in syn.lemmas(): 
            # Remove Hypoon
            synonym = l.name().replace("_", " ").replace("-", " ").lower()
            
            # Only remain characters
            if synonym not in stop_words:
                synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
                try:
                    if nltk.pos_tag(synonym)[0][0] in ['VB','VBD','VBG','VBN','VBP','VBZ','RB','RBR','RBS','RP','JJ','JJR','JJS','CC']:
                        synonyms.add(synonym) 
                except:
                    None
    if word not in synonyms:
        synonyms.add(word)
        
    n = len(synonyms)
    
    if n == 1: # we didn't find any synonyms for that word, therefore we will try to check if it's not because of some punctuation interfering
        word_ = "".join(list(filter(lambda x: x not in punct, word)))
        if word_.lower() in stop:
            return [word, word_], [0.5, 0.5]
        for syn in wordnet.synsets(word_): 
            for l in syn.lemmas(): 
                synonym = l.name().replace("_", " ").replace("-", " ").lower()
                synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
                synonyms.add(synonym) 
        if word_ not in synonyms:
            synonyms.add(word_)
            
    n = len(synonyms)
    if n == 1:
        probabilities = [1]
    else:
        probabilities = [0.5 if w==word else 0.5/(n-1) for w in synonyms]
    
    return list(synonyms), probabilities

# Example of synonym generated

In [7]:
for word in ['sad', 'SAD', 'Sad...', 'saaaaad']:
    print(f'For word {word}, synonyms and corresponding probabilities are :')
    print(get_synonyms(word))
    print('-'*20)



For word sad, synonyms and corresponding probabilities are :
(['sad', 'lamentable', 'distressing', 'pitiful', 'deplorable', 'sorry'], [0.5, 0.1, 0.1, 0.1, 0.1, 0.1])
--------------------
For word SAD, synonyms and corresponding probabilities are :
(['sad', 'lamentable', 'distressing', 'SAD', 'pitiful', 'deplorable', 'sorry'], [0.08333333333333333, 0.08333333333333333, 0.08333333333333333, 0.5, 0.08333333333333333, 0.08333333333333333, 0.08333333333333333])
--------------------
For word Sad..., synonyms and corresponding probabilities are :
(['sad', 'lamentable', 'Sad', 'distressing', 'pitiful', 'deplorable', 'sorry', 'Sad...'], [0.07142857142857142, 0.07142857142857142, 0.07142857142857142, 0.07142857142857142, 0.07142857142857142, 0.07142857142857142, 0.07142857142857142, 0.5])
--------------------
For word saaaaad, synonyms and corresponding probabilities are :
(['saaaaad'], [1])
--------------------


In [8]:
def swap_words(words):
    words = words.split()
    if len(words) < 2:
        return " ".join(words), False
    random_idx = np.random.randint(0, len(words)-1)
    words[random_idx], words[random_idx+1] = words[random_idx+1], words[random_idx] 
    return " ".join(words), True

In [9]:
for _ in range(5):
    print(swap_words('The sun is shining today, this makes me feel so good !')[0])

The sun is shining today, this me makes feel so good !
The sun is shining today, makes this me feel so good !
sun The is shining today, this makes me feel so good !
The is sun shining today, this makes me feel so good !
The sun is shining today, this makes me feel so ! good


# If our selected word token has adjective, adverb, verb, change them to their synonym to make deeper network

In [10]:
def new_row(row, n_samples=1): 
    text, selected_text, textID = row['text'], row['selected_text'], row['textID']
    
    oth_text = text.replace(selected_text, " _________________________________ ")
    
    new_selected_text = [get_synonyms(word) for word in selected_text.split()]
    
    new_oth_text = [get_synonyms(word) for word in oth_text.split()]
    
    new_sentences = [row]
    
    for i in range(n_samples):
        selected_text_ = " ".join([np.random.choice(l_syn, p=p, replace=True) for l_syn, p in new_selected_text])
        text_ = oth_text.replace("_________________________________", selected_text_)
        if not selected_text_ in text_:
            print(f'Original : {text} with target {selected_text}, oth_text {oth_text}\nTransformed : {text_} with target {selected_text_}, oth_text {oth_text_}')
            continue
        row2 = dc(row)
        row2['text'] = text_
        row2['selected_text'] = selected_text_
        row2['textID'] = f'new_{textID}'
        new_sentences.append(row2)
        
    new_rows = pd.concat(new_sentences, axis=1).transpose().drop_duplicates(subset=['text'], inplace=False, ignore_index=True)
    new_rows = new_rows.loc[new_rows['text'].apply(len)<150]
    counter = 0
    
    for i, row in new_rows.iterrows():
        if row['textID'][:4] == 'new_':
            row['textID'] = row['textID']+f'_{counter}'
            counter += 1
    return new_rows


# Example of new sentence with stemming

In [11]:
new_row(train.loc[np.random.choice(train.shape[0])], n_samples=8)

Unnamed: 0,textID,text,selected_text,sentiment
0,4ab4d74f14,so go back for more,so go back for more,neutral
1,new_4ab4d74f14_0,so offer back for more,so offer back for more,neutral
2,new_4ab4d74f14_1,so go plump for for more,so go plump for for more,neutral
3,new_4ab4d74f14_2,so go back for more,so go back for more,neutral
4,new_4ab4d74f14_3,so ecstasy back for more,so ecstasy back for more,neutral
5,new_4ab4d74f14_4,so go hind for more,so go hind for more,neutral
6,new_4ab4d74f14_5,so go support for more,so go support for more,neutral


In [12]:
tmpTrain = train[:10]

In [13]:
tmpTrain

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,http://www.dothebouncy.com/smf - some shameles...,neutral
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,fun,positive
7,50e14c0bb8,Soooo high,Soooo high,neutral
8,e050245fbd,Both of you,Both of you,neutral
9,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,Wow... u just became cooler.,positive


In [14]:
temp = [new_row(row, n_samples=2) for _, row in train.iterrows()]

augmented_data = pd.concat(temp, axis=0)#.sample(frac=1)

del temp
gc.collect()
augmented_data.drop_duplicates(subset=['text'], inplace=False, ignore_index=True)
augmented_data.reset_index(drop=True, inplace=True)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Save training set

In [15]:
augmented_data.to_csv('extended_train.csv', index=False)
train = augmented_data
del augmented_data

In [21]:
train['text_len'] = train['text'].apply(lambda x: len(x))
train = train.loc[train.text_len<150]
train.drop(columns=["text_len"], inplace=True)
train.reset_index(drop=True, inplace=True)

In [22]:
train.to_csv('extended_train.csv', index = False)

In [3]:
train = pd.read_csv('extended_train.csv').fillna('')
n = train.shape[0]

# Preprocess RoBERT model

In [4]:
ct = train.shape[0]
input_ids = np.ones((ct,MAX_LEN),dtype='int32')
attention_mask = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids = np.zeros((ct,MAX_LEN),dtype='int32')
start_tokens = np.zeros((ct,MAX_LEN),dtype='int32')
end_tokens = np.zeros((ct,MAX_LEN),dtype='int32')

for k in range(train.shape[0]):
    
    # FIND OVERLAP
    text1 = " "+" ".join(train.loc[k,'text'].split())
    text2 = " ".join(train.loc[k,'selected_text'].split())
    idx = text1.find(text2)
    chars = np.zeros((len(text1)))
    chars[idx:idx+len(text2)]=1
    if text1[idx-1]==' ': chars[idx-1] = 1 
    enc = tokenizer.encode(text1) 
        
    # ID_OFFSETS
    offsets = []; idx=0
    for t in enc.ids:
        w = tokenizer.decode([t])
        offsets.append((idx,idx+len(w)))
        idx += len(w)
    
    # START END TOKENS
    toks = []
    for i,(a,b) in enumerate(offsets):
        sm = np.sum(chars[a:b])
        if sm>0: toks.append(i) 
        
    s_tok = sentiment_id[train.loc[k,'sentiment']]
    input_ids[k,:len(enc.ids)+3] = [0, s_tok] + enc.ids + [2]
    attention_mask[k,:len(enc.ids)+3] = 1
    if len(toks)>0:
        start_tokens[k,toks[0]+2] = 1
        end_tokens[k,toks[-1]+2] = 1


In [5]:
test = pd.read_csv('../1.Data/test.csv').fillna('')


ct = test.shape[0]
input_ids_t = np.ones((ct,MAX_LEN),dtype='int32')
attention_mask_t = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids_t = np.zeros((ct,MAX_LEN),dtype='int32')

for k in range(test.shape[0]):
        
    # INPUT_IDS
    text1 = " "+" ".join(test.loc[k,'text'].split())
    enc = tokenizer.encode(text1)                
    s_tok = sentiment_id[test.loc[k,'sentiment']]
    input_ids_t[k,:len(enc.ids)+3] = [0, s_tok] + enc.ids + [2]
    attention_mask_t[k,:len(enc.ids)+3] = 1

# Build Model

In [6]:
import pickle

def save_weights(model, dst_fn):
    weights = model.get_weights()
    with open(dst_fn, 'wb') as f:
        pickle.dump(weights, f)


def load_weights(model, weight_fn):
    with open(weight_fn, 'rb') as f:
        weights = pickle.load(f)
    model.set_weights(weights)
    return model

def loss_fn(y_true, y_pred):
    # adjust the targets for sequence bucketing
    ll = tf.shape(y_pred)[1]
    y_true = y_true[:, :ll]
    loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred,
        from_logits=False, label_smoothing=LABEL_SMOOTHING)
    loss = tf.reduce_mean(loss)
    return loss


def build_model():
    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    tok = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    padding = tf.cast(tf.equal(ids, PAD_ID), tf.int32)

    lens = MAX_LEN - tf.reduce_sum(padding, -1)
    max_len = tf.reduce_max(lens)
    ids_ = ids[:, :max_len]
    att_ = att[:, :max_len]
    tok_ = tok[:, :max_len]

    config = RobertaConfig.from_pretrained(PATH+'config-roberta-base.json')
    bert_model = TFRobertaModel.from_pretrained(PATH+'pretrained-roberta-base.h5',config=config)
    x = bert_model(ids_,attention_mask=att_,token_type_ids=tok_)
    
    x1 = tf.keras.layers.Dropout(0.1)(x[0])
    x1 = tf.keras.layers.Conv1D(768, 2,padding='same')(x1)
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Conv1D(64, 2,padding='same')(x1)
    x1 = tf.keras.layers.Dense(1)(x1)
    x1 = tf.keras.layers.Flatten()(x1)
    x1 = tf.keras.layers.Activation('softmax')(x1)
    
    x2 = tf.keras.layers.Dropout(0.1)(x[0]) 
    x2 = tf.keras.layers.Conv1D(768, 2,padding='same')(x2)
    x2 = tf.keras.layers.LeakyReLU()(x2)
    x2 = tf.keras.layers.Conv1D(64, 2, padding='same')(x2)
    x2 = tf.keras.layers.Dense(1)(x2)
    x2 = tf.keras.layers.Flatten()(x2)
    x2 = tf.keras.layers.Activation('softmax')(x2)

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1,x2])
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5) 
    model.compile(loss=loss_fn, optimizer=optimizer)
    
    # this is required as `model.predict` needs a fixed size!
    x1_padded = tf.pad(x1, [[0, 0], [0, MAX_LEN - max_len]], constant_values=0.)
    x2_padded = tf.pad(x2, [[0, 0], [0, MAX_LEN - max_len]], constant_values=0.)
    
    padded_model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1_padded,x2_padded])
    return model, padded_model


# Cost Function

In [7]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    if (len(a)==0) & (len(b)==0): return 0.5
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

# Train RoBERT

In [8]:
jac = []; VER='v0'; DISPLAY=1 # USE display=1 FOR INTERACTIVE
oof_start = np.zeros((input_ids.shape[0],MAX_LEN))
oof_end = np.zeros((input_ids.shape[0],MAX_LEN))
preds_start = np.zeros((input_ids_t.shape[0],MAX_LEN))
preds_end = np.zeros((input_ids_t.shape[0],MAX_LEN))

skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=SEED)

for fold,(idxT,idxV) in enumerate(skf.split(input_ids,train.sentiment.values)):

    print('#'*25)
    print('### FOLD %i'%(fold+1))
    print('#'*25)
    
    K.clear_session()
    model, padded_model = build_model()
        
    inpT = [input_ids[idxT,], attention_mask[idxT,], token_type_ids[idxT,]]
    targetT = [start_tokens[idxT,], end_tokens[idxT,]]
    inpV = [input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]]
    targetV = [start_tokens[idxV,], end_tokens[idxV,]]
    # sort the validation data
    shuffleV = np.int32(sorted(range(len(inpV[0])), key=lambda k: (inpV[0][k] == PAD_ID).sum(), reverse=True))
    inpV = [arr[shuffleV] for arr in inpV]
    targetV = [arr[shuffleV] for arr in targetV]
    weight_fn = '%s-roberta-%i.h5'%(VER,fold)
    for epoch in range(1, EPOCHS + 1):
        # sort and shuffle: We add random numbers to not have the same order in each epoch
        shuffleT = np.int32(sorted(range(len(inpT[0])), key=lambda k: (inpT[0][k] == PAD_ID).sum() + np.random.randint(-3, 3), reverse=True))
        # shuffle in batches, otherwise short batches will always come in the beginning of each epoch
        num_batches = math.ceil(len(shuffleT) / BATCH_SIZE)
        batch_inds = np.random.permutation(num_batches)
        shuffleT_ = []
        for batch_ind in batch_inds:
            shuffleT_.append(shuffleT[batch_ind * BATCH_SIZE: (batch_ind + 1) * BATCH_SIZE])
        shuffleT = np.concatenate(shuffleT_)
        # reorder the input data
        inpT = [arr[shuffleT] for arr in inpT]
        targetT = [arr[shuffleT] for arr in targetT]
        model.fit(inpT, targetT, 
            epochs=epoch, initial_epoch=epoch - 1, batch_size=BATCH_SIZE, verbose=DISPLAY, callbacks=[],
            validation_data=(inpV, targetV), shuffle=False)  # don't shuffle in `fit`
        save_weights(model, weight_fn)

    print('Loading model...')
    # model.load_weights('%s-roberta-%i.h5'%(VER,fold))
    load_weights(model, weight_fn)

    print('Predicting OOF...')
    oof_start[idxV,],oof_end[idxV,] = padded_model.predict([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]],verbose=DISPLAY)
    
    print('Predicting Test...')
    preds = padded_model.predict([input_ids_t,attention_mask_t,token_type_ids_t],verbose=DISPLAY)
    preds_start += preds[0]/skf.n_splits
    preds_end += preds[1]/skf.n_splits
    
    # DISPLAY FOLD JACCARD
    all = []
    for k in idxV:
        a = np.argmax(oof_start[k,])
        b = np.argmax(oof_end[k,])
        if a>b: 
            st = train.loc[k,'text'] # IMPROVE CV/LB with better choice here
        else:
            text1 = " "+" ".join(train.loc[k,'text'].split())
            enc = tokenizer.encode(text1)
            st = tokenizer.decode(enc.ids[a-2:b-1])
        all.append(jaccard(st,train.loc[k,'selected_text']))
    jac.append(np.mean(all))
    print('>>>> FOLD %i Jaccard ='%(fold+1),np.mean(all))
    print()
########

#########################
### FOLD 1
#########################
Loading model...
Predicting Test...
#########################
### FOLD 2
#########################
Loading model...
Predicting Test...
#########################
### FOLD 3
#########################
Loading model...
Predicting Test...
#########################
### FOLD 4
#########################
Loading model...
Predicting Test...
#########################
### FOLD 5
#########################
Loading model...
Predicting Test...


# Predict Test set

In [9]:
all = []
for k in range(input_ids_t.shape[0]):
    a = np.argmax(preds_start[k,])
    b = np.argmax(preds_end[k,])
    if a>b: 
        st = test.loc[k,'text']
    else:
        text1 = " "+" ".join(test.loc[k,'text'].split())
        enc = tokenizer.encode(text1)
        st = tokenizer.decode(enc.ids[a-2:b-1])
    all.append(st)

In [10]:
test['selected_text'] = all
test[['textID','selected_text']].to_csv('submission.csv',index=False)
pd.set_option('max_colwidth', 60)
test.sample(25)


Unnamed: 0,textID,text,sentiment,selected_text
87,cde0016d6d,"http://twitpic.com/4wp8s - My ear hurts, and THIS is my ...",negative,"hurts,"
2701,acf7eddf2b,****. You could have just called or told me in person. ...,negative,****.
589,7315faa6ce,oh mannn i`m gonna be there tomorroww,neutral,oh mannn i`m gonna be there tomorroww
1822,5a8f396cb2,so who is in for bring at HK lounge tomorrow ? $12 all y...,positive,nice
1644,fc6128f4e5,"ok i just spent like ï¿½50 on soundtracks, a galaxy clas...",neutral,"ok i just spent like ï¿½50 on soundtracks, a galaxy cla..."
1839,c4393e4cba,HAPPY MOTHERS DAY! Tell ur mom that`s she an awesome m...,positive,happy
1596,6612e42b28,Kyle is Cody`s wee bro!,neutral,kyle is cody`s wee bro!
2687,4e05652b8c,Time to play the drums,neutral,time to play the drums
1952,63491134ab,i made a vid for you proving my skiLLs that you denied ...,negative,i made a vid for you proving my skiLLs that you denied ...
2539,ba7df25a0e,Chrystina Grace Timberlake has a ring to it!,neutral,chrystina grace timberlake has a ring to it!
