# Import

In [400]:
import os
import re
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import string
from tqdm import tqdm, tqdm_notebook

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import lr_scheduler

from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from scipy.optimize import minimize
from scipy.special import softmax

import transformers
import tokenizers
from transformers import AdamW, get_linear_schedule_with_warmup

from IPython.core.debugger import set_trace
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)



# Settings

In [401]:
class config:
    DATA_DIR = '../tweet_sentiment_extraction'
    ROBERTA_PATH = '../tweet_sentiment_extraction/roberta-base'
    OUTPUT_DIR = '../tweet_sentiment_extraction'
    TRAIN_FILE = 'train_folds.csv'
    TEST_FILE = 'test.csv'
    PREDICT_FILE = 'predict.csv'
    SAVE_MODEL_DIR = 'trained_model'
    PRETRAINED_MODELS = 'warm_up_steps_100'
    MAX_LEN = 128
    MAX_LEN_CHAR = 137
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 32
    LOGGING_STEPS = 100
    SEED = 1111
    DEVICE = torch.device('cuda:1')
    TOKENIZER = tokenizers.ByteLevelBPETokenizer(
        vocab_file=f"{ROBERTA_PATH}/vocab.json", 
        merges_file=f"{ROBERTA_PATH}/merges.txt", 
        lowercase=True,
        add_prefix_space=True
    )

In [402]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
seed_everything(config.SEED)

# Utils

In [403]:
class AverageMeter:
    """
    Computes and stores the average and current value
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

# Data Processing

In [404]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
    tweet = " " + " ".join(str(tweet).split())
    selected_text = " " + " ".join(str(selected_text).split())

    len_st = len(selected_text) - 1
    idx0 = None
    idx1 = None

    for ind in (i for i, e in enumerate(tweet) if e == selected_text[1]):
        if " " + tweet[ind: ind+len_st] == selected_text:
            idx0 = ind
            idx1 = ind + len_st - 1
            break

    char_targets = [0] * len(tweet)
    if idx0 != None and idx1 != None:
        for ct in range(idx0, idx1 + 1):
            char_targets[ct] = 1
    
    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids
    tweet_offsets = tok_tweet.offsets
    
    target_idx = []
    for j, (offset1, offset2) in enumerate(tweet_offsets):
        if sum(char_targets[offset1: offset2]) > 0:
            target_idx.append(j)
    
    targets_start = target_idx[0]
    targets_end = target_idx[-1]

    sentiment_id = {
        'positive': 1313,
        'negative': 2430,
        'neutral': 7974
    }
    
    input_ids = [0] + [sentiment_id[sentiment]] + [2] + [2] + input_ids_orig + [2]
    token_type_ids = [0, 0, 0, 0] + [0] * (len(input_ids_orig) + 1)
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 4 + tweet_offsets + [(0, 0)]
    targets_start += 4
    targets_end += 4

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([1] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)
    
    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets
    }

# Data Loader

In [405]:
train_df = pd.read_csv(os.path.join(config.DATA_DIR, config.TRAIN_FILE))
train_df = train_df.dropna(how='any', axis=0)
# test_df = pd.read_csv(os.path.join(config.DATA_DIR, config.TEST_FILE))

In [406]:
class TweetDataset :
    def __init__(self, tweet, sentiment, selected_text):
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN
    
    def __len__(self):
        return len(self.tweet)
    
    def __getitem__(self, item):
        data = process_data(
            self.tweet[item],
            self.selected_text[item],
            self.sentiment[item],
            self.tokenizer,
            self.max_len
        )
        
        return {
            'ids': torch.tensor(data['ids'], dtype=torch.long),
            'mask': torch.tensor(data['mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
            'targets_start': torch.tensor(data["targets_start"], dtype=torch.long),
            'targets_end': torch.tensor(data["targets_end"], dtype=torch.long),
            'orig_tweet': data["orig_tweet"],
            'orig_selected': data["orig_selected"],
            'sentiment': data["sentiment"],
            'offsets': torch.tensor(data["offsets"], dtype=torch.long)
        }

# Model

In [407]:
class TweetModel(transformers.BertPreTrainedModel):
    def __init__(self, conf):
        super(TweetModel, self).__init__(conf)
        self.roberta = transformers.RobertaModel.from_pretrained(config.ROBERTA_PATH, config=conf)
        self.drop_out = nn.Dropout(0.1)
        self.l0 = nn.Linear(768 * 2, 200)
        torch.nn.init.normal_(self.l0.weight, std=0.02)
        self.l1 = nn.Linear(200, 2)
        torch.nn.init.normal_(self.l1.weight, std=0.02)
    
    def forward(self, ids, mask, token_type_ids):
        _, _, out = self.roberta(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )

        out = torch.cat((out[-1], out[-2]), dim=-1)
        out = self.drop_out(out)
        logits = self.l0(out)
        logits = self.l1(logits)

        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [408]:
def calculate_jaccard_score(original_tweet, target_string, sentiment_val, idx_start, idx_end, offsets):
    if idx_end < idx_start:
        idx_end = idx_start
    
#     while idx_start > 4:
#         try:
#             if original_tweet[offsets[idx_start][0]] != ' ': #offsets[idx_start - 1][1] == offsets[idx_start][0]:
#                 idx_start = idx_start - 1
#             else:
#                 break
#         except:
#             print('Start Exception:', original_tweet)
#             break
            
#     while idx_end < len(offsets) - 2:
#         try:
#             if original_tweet[offsets[idx_end + 1][0]] != ' ': #offsets[idx_end][1] == offsets[idx_end + 1][0]:
#                 idx_end = idx_end + 1
#             else:
#                 break
#         except:
#             print('End Exception:', original_tweet)
#             break
    
    filtered_output  = ""
    for ix in range(idx_start, idx_end + 1):
        filtered_output += original_tweet[offsets[ix][0]: offsets[ix][1]]
        if (ix+1) < len(offsets) and offsets[ix][1] < offsets[ix+1][0]:
            filtered_output += " "
            
#     if sentiment_val == 'neutral' or len(original_tweet.split()) < 2:
    if len(original_tweet.split()) < 2:
        filtered_output = original_tweet
        
    jac = jaccard(target_string.strip(), filtered_output.strip())
        
    return jac, filtered_output

# Predict

In [409]:
def run_predict(pretrained_model):
    print('###########################')
    print('### Get meta data model {}'.format(pretrained_model))
    print('###########################')
    
    model_outputs_start = None
    model_outputs_end = None
    tweet_list = []
    selected_text_list = []
    sentiment_list = []
    final_output = []
        
    for fold in range(5):
        
        fold_train_df = train_df[train_df.kfold != fold].reset_index(drop=True)
        fold_valid_df = train_df[train_df.kfold == fold].reset_index(drop=True)
        
        valid_dataset = TweetDataset(
            tweet = fold_valid_df.text.values,
            sentiment = fold_valid_df.sentiment.values,
            selected_text = fold_valid_df.selected_text.values
        )

        valid_data_loader = torch.utils.data.DataLoader(
            valid_dataset,
            batch_size = config.VALID_BATCH_SIZE,
            shuffle = False,
            num_workers = 8
        )
        
        model_path = os.path.join(config.DATA_DIR, 
                                  config.SAVE_MODEL_DIR, 
                                  pretrained_model, 
                                  f'model_fold_{fold}.bin')
        
        model = TweetModel(conf=model_config)
        model.to(config.DEVICE)
        model.load_state_dict(torch.load(model_path))
        model.eval()
        
        with torch.no_grad():
            tk0 = tqdm_notebook(valid_data_loader, total=len(valid_data_loader), desc=f'Fold {fold}')
            for bi, d in enumerate(tk0):
                ids = d["ids"]
                token_type_ids = d["token_type_ids"]
                mask = d["mask"]
                sentiment = d["sentiment"]
                orig_selected = d["orig_selected"]
                orig_tweet = d["orig_tweet"]
                targets_start = d["targets_start"]
                targets_end = d["targets_end"]
                offsets = d["offsets"].numpy() # (32, 128, 2)
                
                ids = ids.to(config.DEVICE, dtype=torch.long)
                token_type_ids = token_type_ids.to(config.DEVICE, dtype=torch.long)
                mask = mask.to(config.DEVICE, dtype=torch.long)
                targets_start = targets_start.to(config.DEVICE, dtype=torch.long)
                targets_end = targets_end.to(config.DEVICE, dtype=torch.long)
                
                tweet_list.extend(orig_tweet)
                selected_text_list.extend(orig_selected)
                sentiment_list.extend(sentiment)

#                 outputs_start [32, 128]
                outputs_start, outputs_end = model(
                    ids=ids,
                    mask=mask,
                    token_type_ids=token_type_ids
                )
        
                outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
                outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

                for px, tweet in enumerate(orig_tweet):
                    selected_tweet = orig_selected[px]
                    tweet_sentiment = sentiment[px]
                    
                    start_index = np.argmax(outputs_start[px, :])
                    end_index = np.argmax(outputs_end[px, :])
                    
#                     next_i = 0
#                     while end_index < start_index:
#                         next_i += 1
#                         end_index = outputs_end[px, :].argsort()[::-1][next_i]

                    _, output_sentence = calculate_jaccard_score(
                        original_tweet=tweet,
                        target_string=selected_tweet,
                        sentiment_val=tweet_sentiment,
                        idx_start=start_index,
                        idx_end=end_index,
                        offsets=offsets[px]
                    )
                    
#                     if output_sentence.strip() not in selected_tweet.strip() and selected_tweet.strip() not in output_sentence.strip():
#                         print(tweet)
#                         print(selected_tweet)
#                         print(output_sentence)
#                         set_trace()

                    final_output.append(output_sentence)
    
    meta_train_df['tweet'] = tweet_list
    meta_train_df['selected_text'] = selected_text_list
    meta_train_df['sentiment'] = sentiment_list
    meta_train_df['predicted_text'] = final_output

In [410]:
model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
model_config.output_hidden_states = True

meta_train_df = pd.DataFrame()

In [411]:
run_predict(config.PRETRAINED_MODELS)

###########################
### Get meta data model warm_up_steps_100
###########################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, description='Fold 0', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 1', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 2', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 3', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 4', max=172.0, style=ProgressStyle(description_width…




In [412]:
def compare_text(s):
    if s.selected_text == s.predicted_text:
        return 0
    elif s.predicted_text in s.selected_text:
        return -1
    elif s.selected_text in s.predicted_text:
        return 1
    else:
        return 2
    
meta_train_df['result'] = meta_train_df.apply(compare_text, axis=1)

In [413]:
meta_train_df['result'].value_counts()

 0    15496
-1    5757 
 1    4121 
 2    2106 
Name: result, dtype: int64

In [414]:
def jaccard_row(s):
    return jaccard(s.selected_text, s.predicted_text)

meta_train_df['jaccard'] = meta_train_df.apply(jaccard_row, axis=1)
meta_train_df.jaccard.mean()

# Best 0.7108151638410456
# 0.7113087842265181

0.7107719675776074

In [16]:
meta_train_df[meta_train_df.jaccard == 0].sample(100)

Unnamed: 0,tweet,selected_text,sentiment,predicted_text,result,jaccard
284,you got it all done?? even the labs? thats all i have left to do. and i cant figure it out. FML,i cant figure it out.,negative,and,2,0.0
12006,haha that photo is too funny! I hope he wasn`t disturbed by passengers too much during the flight,hope,positive,funny!,2,0.0
5512,SWEEEEET - San Fran is awesome!!!! Love it there,Love it there,positive,awesome!!!!,2,0.0
9392,bruise on knee will make it hard to skate tomorrow.,bruise,negative,hard,2,0.0
1825,http://twitpic.com/4wf30 - EEEEEEEEEEE!!!!! IT CAME!!!!,EEEEEEEEEEE!!,positive,EEEEEEEEEEE!!!!!,1,0.0
3205,"will u give ur mother anything special? I`ll make a cake and make sure mine gets a stressfree day =p cheesy, but it works",stressfree,positive,special?,2,0.0
15771,I am so hungry! And there is no food for me to steal,no food,negative,I am so hungry!,2,0.0
14408,"i just saw a baby dove fall from a tree, break its neck and die i tried to save it and couldnt. what a cruel world",die,negative,cruel world,2,0.0
15874,are you are a RIOT!!,RIOT!,negative,RIOT!!,1,0.0
26675,wonder what white supremists have to say about this I abhor racists. http://bit.ly/Rd31F,wonder,negative,abhor,2,0.0


In [17]:
meta_train_df[meta_train_df['result'] == 2].sample(20)

Unnamed: 0,tweet,selected_text,sentiment,predicted_text,result,jaccard
2892,C-News wasn`t as bad as I expected... could have done better... but today we did a great show...,better..,positive,great,2,0.0
10795,It SUCKS when my playmate is MIA all weekend. Sad,Sad,negative,SUCKS,2,0.0
20520,"I hate different referenced assemblies. My FNH goes against castle 1.1, Caliburn against 1.03 .. not to mention dynamicproxy in NH",hate di,negative,I hate,2,0.333333
10096,"That poor girl on britains got talent, god love her forgot the words and cried but gets a second chance to perform again : ]",t gets a second chance to perform again,positive,god love her,2,0.0
9570,errr....I don`t want to go sit in the heat and watch a high school graduation.,heat,negative,I don`t want to go,2,0.0
1785,"Thought I`d check out home, that was 10 minutes ago, downloading update, now home square....that`s why I can`t be bothered with it Sony",that`s why I can`t be bothered with it Sony,positive,....that`s why I can`t be bothered with it Sony,2,0.8
11209,Think I`ll go enjoy the sun`s rays again...I LOVE being off work,I LOVE being off work,positive,enjoy the sun`s rays again...I LOVE,2,0.1
12348,i really should stop relying on my family when they say their going to get me lunch,should stop relying,negative,i really should stop,2,0.4
5311,() Oh snap! Just broke my windshield while replacing my wiper blades.,Oh snap!,negative,broke,2,0.0
26509,"My back is all kinds of messed up, and Strudel is going to live in another town until we move, I`m going to miss the little runt",miss,negative,"messed up,",2,0.0


In [204]:
def test_predict(s):
    tweet = s.tweet.strip()
    predicted_text = s.predicted_text.strip()
    
    if ' ' + predicted_text + ' ' in ' ' + tweet + ' ':
        return True
    else:
        return False
    
meta_train_df[~meta_train_df.apply(test_predict, axis=1)]

Unnamed: 0,tweet,selected_text,sentiment,predicted_text,result,jaccard
24043,.ï¿½ï¿½?????ï¿½ï¿½. i bet. man i wish i coulda went some whr. i didn`t even go out here. i was home all weekend. nyt life sux here,nd. nyt life sux,negative,,-1,0.0


## Predict 2

In [415]:
meta_train_df['predicted_text_2'] = meta_train_df['predicted_text'].apply(lambda x: x.replace('!!!!', '!') if len(x.split())==1 else x)
meta_train_df['predicted_text_2'] = meta_train_df['predicted_text_2'].apply(lambda x: x.replace('..', '.') if len(x.split())==1 else x)
meta_train_df['predicted_text_2'] = meta_train_df['predicted_text_2'].apply(lambda x: x.replace('...', '.') if len(x.split())==1 else x)

In [416]:
def jaccard_row_2(s):
    return jaccard(s.selected_text, s.predicted_text_2)

meta_train_df['jaccard_2'] = meta_train_df.apply(jaccard_row_2, axis=1)
meta_train_df.jaccard_2.mean()
# 0.7114070927056755
# 0.7118643229892556

0.7113638964422373

In [57]:
def post_process(text):
    remove_chars = [',', '.', '?', '!']
    new_text = text
    for char in remove_chars:
        if char == new_text[-1]:
            new_text = new_text.rstrip(char)
            new_text = new_text + char
            break
    
    text_set = set(text.split())
    new_text_set = set(new_text.split())
    return ' '.join(set.union(text_set, new_text_set))

meta_train_df['predicted_text_3'] = meta_train_df['predicted_text'].apply(post_process)

## Predict 3

In [58]:
def jaccard_row_3(s):
    return jaccard(s.selected_text, s.predicted_text_3)

meta_train_df['jaccard_3'] = meta_train_df.apply(jaccard_row_3, axis=1)

In [59]:
meta_train_df.jaccard_3.mean()

0.7064071440565611

In [53]:
meta_train_df.sample(20)

Unnamed: 0,tweet,selected_text,sentiment,predicted_text,result,jaccard,char_offsets,predicted_text_2,jaccard_2,predicted_text_3,jaccard_3
19885,Bummer. MacBook is bugging out Any Mac heads want to lend a helping hand.,Bummer.,negative,Bummer.,0,1.0,"(0, 6)",Bummer.,1.0,Bummer. Bummer,0.5
14798,"When youre kawawa, you make me kawawa cuz i hate seeing you so kawawa`s ohhhhh <3",hate,negative,hate,0,1.0,"(0, 3)",hate,1.0,hate,1.0
9030,I think I`m running a fever and not feeling so well!,I think I`m running a fever and not feeling so well!,negative,not feeling so well!,-1,0.363636,"(0, 19)",not feeling so well!,0.363636,well! feeling so not well,0.333333
4836,"watching shark boy and lava girl! on disney channel. hanging out with sarah jane, micah, & rachel","watching shark boy and lava girl! on disney channel. hanging out with sarah jane, micah, & rachel",neutral,"watching shark boy and lava girl! on disney channel. hanging out with sarah jane, micah, & rachel",0,1.0,"(0, 96)","watching shark boy and lava girl! on disney channel. hanging out with sarah jane, micah, & rachel",1.0,"with rachel jane, boy watching and girl! micah, & channel. lava sarah shark hanging on disney out",1.0
16206,"nyc placements have started to come out, but it looks like upgrades might have to wait yet another two weeks for placements! dreadfully",dreadfully,negative,dreadfully,0,1.0,"(0, 9)",dreadfully,1.0,dreadfully,1.0
16449,picat 21,picat 21,neutral,picat 21,0,1.0,"(0, 7)",picat 21,1.0,picat 21,1.0
19732,Got to reconnect with some dear friends tonight. I am so lucky to have so many great people in my life. I am blessed,I am so lucky to have so many great people in my life. I am blessed,positive,I am so lucky to have so many great people in my life. I am blessed,0,1.0,"(0, 66)",I am so lucky to have so many great people in my life. I am blessed,1.0,lucky to I so great many people life. blessed my have in am,1.0
19675,She is all I can ever think about. http://twitpic.com/4wg12,She is all I can ever think about. http://twitpic.com/4wg12,neutral,She is all I can ever think about. http://twitpic.com/4wg12,0,1.0,"(0, 58)",She is all I can ever think about. http://twitpic.com/4wg12,1.0,ever http://twitpic.com/4wg12 about. all I She can is think,1.0
5090,"you have been busy, have a lovely time","you have been busy, have a lovely time",positive,lovely time,-1,0.285714,"(0, 10)",lovely time,0.285714,time lovely,0.285714
19084,well we aren`t normal then because 1. we can`t drive in mexico (you have to be 18) and 2. there are no dairy queens here,well we aren`t normal then,negative,well we aren`t normal,-1,0.8,"(0, 20)",well we aren`t normal,0.8,aren`t we normal well,0.8


# PP Model

In [None]:
# predicted_text     I

In [253]:
meta_train_df = meta_train_df[meta_train_df.predicted_text != ''][meta_train_df.sentiment != 'neutral']
meta_train_df['selected_text'] = meta_train_df['selected_text'].apply(lambda x: x.strip())
meta_train_df['predicted_text'] = meta_train_df['predicted_text'].apply(lambda x: x.strip())

  """Entry point for launching an IPython kernel.


In [254]:
character_list = list(set(' '.join(meta_train_df.tweet.values)))

print(character_list)

['T', 'y', 'L', '1', ';', 'X', 'B', 'a', '½', '´', 'f', '[', ']', ')', 'V', 'A', '#', 'c', 'l', 'o', 'u', 'P', '&', 'x', '+', 'Z', 'E', '6', '_', 'ï', '>', 'h', '?', 'g', '0', ' ', 't', '2', '}', 'z', '<', 'M', 'G', '¿', 'F', 'U', 'r', 'p', '7', 'i', 'v', '|', '$', ',', '@', 'k', 'm', 'n', 'J', 'H', '3', 'j', '*', 'C', 'S', '8', '.', 'b', '9', '~', '^', '4', '-', '!', '%', 's', 'Y', 'O', 'K', 'e', 'W', 'w', 'q', '5', ':', "'", 'Â', '{', '`', 'R', 'N', 'I', 'd', 'D', 'Q', '=', '/', '\\', '(']


In [255]:
def one_hot(c):
    one_hot_c = torch.zeros((len(character_list), ))
    one_hot_c[character_list.index(c)] = 1
    return one_hot_c

def process_data_one_hot(selected_text, predicted_text):
    input_matrix = torch.zeros((config.MAX_LEN_CHAR, len(character_list)))
    for i, c in enumerate(predicted_text[:config.MAX_LEN_CHAR]):
        one_hot_c = one_hot(c)
        input_matrix[i] = one_hot_c
    
    return  input_matrix

def loss_fn(start_logits, end_logits, start_positions, end_positions):
    loss_fct = nn.CrossEntropyLoss()
    start_loss = loss_fct(start_logits, start_positions)
    end_loss = loss_fct(end_logits, end_positions)
    total_loss = start_loss + end_loss
    
    return total_loss

In [256]:
class PPDataset:
    def __init__(self, selected_text, predicted_text, char_offsets):
        self.selected_text = selected_text
        self.predicted_text = predicted_text
        self.char_offsets = char_offsets
        
    def __len__(self):
        return len(self.selected_text)
    
    def __getitem__(self, item):
        input_matrix = process_data_one_hot(self.selected_text[item], self.predicted_text[item])
        
        return self.selected_text[item], self.predicted_text[item], input_matrix, self.char_offsets[item][0], self.char_offsets[item][1]

class PPModel(nn.Module):
    def __init__(self, embed_dim, hidden_dim):
        super(PPModel, self).__init__()
        
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=1, batch_first=True)
        self.l1 = nn.Linear(hidden_dim, 50)
        self.m = nn.Sigmoid()
        self.l2 = nn.Linear(50, 2)
        
    def forward(self, input_):
        lstm_output, lstm_hidden = self.lstm(input_)
        logits = self.l1(lstm_output)
        logits = self.l2(self.m(logits))
        
        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [257]:
def test_index_finding(s):
    selected_text = s.selected_text.strip()
    predicted_text = s.predicted_text.strip()
    
    len_st = len(selected_text)
    len_pt = len(predicted_text)
    idx0 = None
    idx1 = None
    
    longest_sub_str = {}
    for head_index in range(len_st):
        for tail_index in range(len_st, head_index, -1):
            sub_str = selected_text[head_index: tail_index]
            idx0 = predicted_text.find(sub_str)
            if idx0 != -1:
                idx1 = idx0 + len(sub_str) - 1
                longest_sub_str[(idx0, idx1)] = len(sub_str)
                break
    
    if len(longest_sub_str) != 0:
        (idx0, idx1), _ = sorted(longest_sub_str.items(), key=lambda item: -item[1])[0]
    
    if idx0 == None or idx1 == None:
        return (None, None)
    
    return (idx0, idx1)

meta_train_df['char_offsets'] = meta_train_df.apply(test_index_finding, axis=1)

In [258]:
filter_train_df = meta_train_df[meta_train_df['char_offsets'] != (None, None)] #[meta_train_df.result != 2]
filter_train_df.sample(10)

Unnamed: 0,tweet,selected_text,sentiment,predicted_text,result,jaccard,char_offsets
25358,omg i didnt tweet all that much today.... *sadness*,. *sadness*,negative,*sadness*,-1,0.5,"(0, 8)"
5529,"She`s unassuming and unpretentious. She`s just, as. I suppose that`s why she`s so endearing--because we can relate to her",endearing-,positive,endearing--because,1,0.0,"(0, 9)"
19762,still at home goodness,still at home goodness,positive,goodness,-1,0.25,"(0, 7)"
418,_pearlman mow love mow,love,positive,love,0,1.0,"(0, 3)"
6976,Happy 11th I`m going to sleep now lolz,Happy,positive,Happy,0,1.0,"(0, 4)"
16407,received her first pressie. Happy happy.,Happy,positive,Happy happy.,1,0.5,"(0, 4)"
21101,Yeah. About **** time. My film SLR just sits there quietly being ignored. It`s time to step up!,ignored.,negative,ignored.,0,1.0,"(0, 7)"
2712,: experiencing the unique #Winnipeg tradition known as 'the social'. Trying to pace myself.,Trying to pace myself.,positive,unique,2,0.0,"(2, 2)"
26624,"Hi Susan, read your blog. It`s really good. I look forward for the tweet updates",y good,positive,It`s really good.,2,0.0,"(10, 15)"
16268,"epic writing fail, ew. going to try and sleep","fail,",negative,"fail,",0,1.0,"(0, 4)"


In [259]:
filter_train_df['predicted_text'].apply(lambda x: len(x)).describe()

count    16159.000000
mean     14.880376   
std      17.992678   
min      1.000000    
25%      5.000000    
50%      8.000000    
75%      16.000000   
max      137.000000  
Name: predicted_text, dtype: float64

In [260]:
train_selected_text, test_selected_text, train_predicted_text, test_predicted_text, train_char_offsets, test_char_offsets = train_test_split(filter_train_df.selected_text.values, 
                                                    filter_train_df.predicted_text.values,
                                                    filter_train_df.char_offsets.values,
                                                    test_size=0.33, 
                                                    random_state=config.SEED)
current_jaccards = AverageMeter()
for test_s_text, test_p_text in zip(test_selected_text, test_predicted_text):
    current_jaccards.update(jaccard(test_s_text, test_p_text))
    
print(f'Current Jaccard: {current_jaccards.avg}')

train_pp_dataset = PPDataset(train_selected_text, train_predicted_text, train_char_offsets)
test_pp_dataset = PPDataset(test_selected_text, test_predicted_text, test_char_offsets)

train_pp_loader = torch.utils.data.DataLoader(train_pp_dataset, batch_size=32, shuffle=False, num_workers=8)
test_pp_loader = torch.utils.data.DataLoader(test_pp_dataset, batch_size=32, shuffle=False, num_workers=8)

ppmodel = PPModel(len(character_list), 128)
ppmodel.to(config.DEVICE)
optimizer = AdamW(params=ppmodel.parameters(), lr=5e-3)

losses = AverageMeter()
jaccards = AverageMeter()
vl_losses = AverageMeter()
vl_jaccards = AverageMeter()

for epoch in range(10):
    print('###########################')
    print('### Train Epoch {}'.format(epoch))
    print('###########################')
    ppmodel.train()
    for step, (selected_text, predicted_text, input_matrix, idx0, idx1) in enumerate(train_pp_loader):
        input_matrix = input_matrix.to(config.DEVICE)
        idx0 = idx0.to(config.DEVICE)
        idx1 = idx1.to(config.DEVICE)
        
        start_logits, end_logits = ppmodel(input_matrix)
        loss = loss_fn(start_logits, end_logits, idx0, idx1)
        losses.update(loss)
        
        loss.backward()
        optimizer.step()
        ppmodel.zero_grad()
        
        outputs_start = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(end_logits, dim=1).cpu().detach().numpy()
        
        for i, s_text in enumerate(selected_text):
            p_text = predicted_text[i]
            start = np.argmax(outputs_start[i, :])
            end = np.argmax(outputs_end[i, :])
            if end < start: end = start
                
            jaccard_score = jaccard(s_text, p_text[start: end + 1])
            jaccards.update(jaccard_score)
        
        if step > 0 and step % 100 == 0:
            print(f'Train Jaccard {jaccards.avg} - Loss {losses.avg}')
            jaccards.reset()
            losses.reset()
    
    ppmodel.eval()
    for step, (selected_text, predicted_text, input_matrix, idx0, idx1) in enumerate(test_pp_loader):
        input_matrix = input_matrix.to(config.DEVICE)
        idx0 = idx0.to(config.DEVICE)
        idx1 = idx1.to(config.DEVICE)
        
        start_logits, end_logits = ppmodel(input_matrix)
        vl_loss = loss_fn(start_logits, end_logits, idx0, idx1)
        vl_losses.update(vl_loss)
        
        outputs_start = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(end_logits, dim=1).cpu().detach().numpy()
        
        for i, s_text in enumerate(selected_text):
            p_text = predicted_text[i]
            start = np.argmax(outputs_start[i, :])
            end = np.argmax(outputs_end[i, :])
            if end < start: end = start
            
            jaccard_score = jaccard(s_text, p_text[start: end + 1])
            jaccards.update(jaccard_score)
        
    print(f'Valid Jaccard {jaccards.avg} - Loss {vl_losses.avg}')
    jaccards.reset()
    vl_losses.reset()

Current Jaccard: 0.5394888288498124
###########################
### Train Epoch 0
###########################
Train Jaccard 0.09676860685930415 - Loss 6.061311721801758
Train Jaccard 0.4043427273701321 - Loss 3.152916431427002
Train Jaccard 0.49519100129934257 - Loss 2.19154953956604
Valid Jaccard 0.5111996037172074 - Loss 2.1274518966674805
###########################
### Train Epoch 1
###########################
Train Jaccard 0.48906253695881446 - Loss 2.147061347961426
Train Jaccard 0.5056493771460931 - Loss 1.96355140209198
Train Jaccard 0.5157325493586757 - Loss 1.9952610731124878
Valid Jaccard 0.5141059914773018 - Loss 1.9456785917282104
###########################
### Train Epoch 2
###########################
Train Jaccard 0.5045425860829879 - Loss 1.9744781255722046
Train Jaccard 0.5123964082439798 - Loss 1.8446475267410278
Train Jaccard 0.5232435309083542 - Loss 1.9149211645126343
Valid Jaccard 0.5164292949907129 - Loss 1.8919349908828735
###########################
### Train 