# Import

In [1]:
import os
import re
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import string
from tqdm import tqdm, tqdm_notebook

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import lr_scheduler

from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from scipy.optimize import minimize
from scipy.special import softmax

import transformers
import tokenizers
from transformers import AdamW, get_linear_schedule_with_warmup

from IPython.core.debugger import set_trace
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)



# Settings

In [38]:
class config:
    DATA_DIR = '../tweet_sentiment_extraction'
    ROBERTA_PATH = '../tweet_sentiment_extraction/roberta-base'
    OUTPUT_DIR = '../tweet_sentiment_extraction'
    TRAIN_FILE = 'train.csv'
    TEST_FILE = 'test.csv'
    FULL_DATA_FILE = 'tweet_dataset.csv'
    PREDICT_FILE = 'predict.csv'
    SAVE_MODEL_DIR = 'trained_model'
    PRETRAINED_MODELS = 'warm_up_steps_100'
    MAX_LEN = 128
    MAX_LEN_CHAR = 140
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 32
    LOGGING_STEPS = 100
    SEED = 1111
    DEVICE = torch.device('cuda:1')
    TOKENIZER = tokenizers.ByteLevelBPETokenizer(
        vocab_file=f"{ROBERTA_PATH}/vocab.json", 
        merges_file=f"{ROBERTA_PATH}/merges.txt", 
        lowercase=True,
        add_prefix_space=True
    )
    SENTIMENT_MAP = {'worry':'negative', 
                     'sadness':'negative', 
                     'hate':'negative', 
                     'boredom':'negative', 
                     'anger':'negative',
                     'relief':'positive', 
                     'happiness':'positive', 
                     'love':'positive', 
                     'surprise':'positive', 
                     'fun':'positive', 
                     'enthusiasm':'positive',
                     'empty':'neutral',
                     'neutral':'neutral',
                     'positive':'positive',
                     'negative':'negative'
                    }

In [3]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
seed_everything(config.SEED)

# Utils

In [4]:
class AverageMeter:
    """
    Computes and stores the average and current value
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

# Data Processing

In [5]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
    tweet = " " + " ".join(str(tweet).split())
    selected_text = " " + " ".join(str(selected_text).split())

    len_st = len(selected_text) - 1
    idx0 = None
    idx1 = None

    for ind in (i for i, e in enumerate(tweet) if e == selected_text[1]):
        if " " + tweet[ind: ind+len_st] == selected_text:
            idx0 = ind
            idx1 = ind + len_st - 1
            break

    char_targets = [0] * len(tweet)
    if idx0 != None and idx1 != None:
        for ct in range(idx0, idx1 + 1):
            char_targets[ct] = 1
    
    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids
    tweet_offsets = tok_tweet.offsets
    
    target_idx = []
    for j, (offset1, offset2) in enumerate(tweet_offsets):
        if sum(char_targets[offset1: offset2]) > 0:
            target_idx.append(j)
    
    targets_start = target_idx[0]
    targets_end = target_idx[-1]

    sentiment_id = {
        'positive': 1313,
        'negative': 2430,
        'neutral': 7974
    }
    
    input_ids = [0] + [sentiment_id[sentiment]] + [2] + [2] + input_ids_orig + [2]
    token_type_ids = [0, 0, 0, 0] + [0] * (len(input_ids_orig) + 1)
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 4 + tweet_offsets + [(0, 0)]
    targets_start += 4
    targets_end += 4

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([1] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)
    
    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets
    }

# Data Loader

In [6]:
train_df = pd.read_csv(os.path.join(config.DATA_DIR, config.FULL_DATA_FILE))
train_df = train_df[train_df['selected_text'].isna()]
train_df['new_sentiment'] = train_df['sentiment'].map(config.SENTIMENT_MAP)
train_df = train_df.drop(['sentiment', 'aux_id', 'author', 'old_text'], axis=1)
train_df = train_df.rename(columns={"new_sentiment": "sentiment"})
train_df.loc[:, 'selected_text'] = train_df.text.values

train_df = train_df.dropna(how='any', axis=0)

In [7]:
class TweetDataset :
    def __init__(self, tweet, sentiment, selected_text):
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN
    
    def __len__(self):
        return len(self.tweet)
    
    def __getitem__(self, item):
        data = process_data(
            self.tweet[item],
            self.selected_text[item],
            self.sentiment[item],
            self.tokenizer,
            self.max_len
        )
        
        return {
            'ids': torch.tensor(data['ids'], dtype=torch.long),
            'mask': torch.tensor(data['mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
            'targets_start': torch.tensor(data["targets_start"], dtype=torch.long),
            'targets_end': torch.tensor(data["targets_end"], dtype=torch.long),
            'orig_tweet': data["orig_tweet"],
            'orig_selected': data["orig_selected"],
            'sentiment': data["sentiment"],
            'offsets': torch.tensor(data["offsets"], dtype=torch.long)
        }

# Model

In [8]:
class TweetModel(transformers.BertPreTrainedModel):
    def __init__(self, conf):
        super(TweetModel, self).__init__(conf)
        self.roberta = transformers.RobertaModel.from_pretrained(config.ROBERTA_PATH, config=conf)
        self.drop_out = nn.Dropout(0.1)
        self.l0 = nn.Linear(768 * 2, 200)
        torch.nn.init.normal_(self.l0.weight, std=0.02)
        self.l1 = nn.Linear(200, 2)
        torch.nn.init.normal_(self.l1.weight, std=0.02)
    
    def forward(self, ids, mask, token_type_ids):
        _, _, out = self.roberta(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )

        out = torch.cat((out[-1], out[-2]), dim=-1)
        out = self.drop_out(out)
        logits = self.l0(out)
        logits = self.l1(logits)

        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [9]:
def calculate_jaccard_score(original_tweet, target_string, sentiment_val, idx_start, idx_end, offsets):
    if idx_end < idx_start:
        idx_end = idx_start
        
    filtered_output  = ""
    for ix in range(idx_start, idx_end + 1):
        filtered_output += original_tweet[offsets[ix][0]: offsets[ix][1]]
        if (ix+1) < len(offsets) and offsets[ix][1] < offsets[ix+1][0]:
            filtered_output += " "
            
#     if sentiment_val == 'neutral' or len(original_tweet.split()) < 2:
    if len(original_tweet.split()) < 2:
        filtered_output = original_tweet
        
    jac = jaccard(target_string.strip(), filtered_output.strip())
        
    return jac, filtered_output

# Predict

In [30]:
print('###########################')
print('### Get meta data model {}'.format(config.PRETRAINED_MODELS))
print('###########################')

meta_train_df = pd.DataFrame()
tweet_list = []
sentiment_list = []
final_output = []
max_probability_start_list = []
max_probability_end_list = []

dataset = TweetDataset(
    tweet = train_df.text.values,
    sentiment = train_df.sentiment.values,
    selected_text = train_df.selected_text.values
)

data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size = config.VALID_BATCH_SIZE,
    shuffle = False,
    num_workers = 8
)

model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
model_config.output_hidden_states = True

model_path1 = os.path.join(config.DATA_DIR, 
                          config.SAVE_MODEL_DIR, 
                          config.PRETRAINED_MODELS, 
                          f'model_fold_0.bin')
model_path2 = os.path.join(config.DATA_DIR, 
                          config.SAVE_MODEL_DIR, 
                          config.PRETRAINED_MODELS, 
                          f'model_fold_1.bin')
model_path3 = os.path.join(config.DATA_DIR, 
                          config.SAVE_MODEL_DIR, 
                          config.PRETRAINED_MODELS, 
                          f'model_fold_2.bin')
model_path4 = os.path.join(config.DATA_DIR, 
                          config.SAVE_MODEL_DIR, 
                          config.PRETRAINED_MODELS, 
                          f'model_fold_3.bin')
model_path5 = os.path.join(config.DATA_DIR, 
                          config.SAVE_MODEL_DIR, 
                          config.PRETRAINED_MODELS, 
                          f'model_fold_4.bin')

model1 = TweetModel(conf=model_config)
model1.to(config.DEVICE)
model1.load_state_dict(torch.load(model_path1))
model1.eval()

model2 = TweetModel(conf=model_config)
model2.to(config.DEVICE)
model2.load_state_dict(torch.load(model_path2))
model2.eval()

model3 = TweetModel(conf=model_config)
model3.to(config.DEVICE)
model3.load_state_dict(torch.load(model_path3))
model3.eval()

model4 = TweetModel(conf=model_config)
model4.to(config.DEVICE)
model4.load_state_dict(torch.load(model_path4))
model4.eval()

model5 = TweetModel(conf=model_config)
model5.to(config.DEVICE)
model5.load_state_dict(torch.load(model_path5))
model5.eval()

with torch.no_grad():
    tk0 = tqdm_notebook(data_loader, total=len(data_loader))
    for bi, d in enumerate(tk0):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        offsets = d["offsets"].numpy() # (32, 128, 2)

        ids = ids.to(config.DEVICE, dtype=torch.long)
        token_type_ids = token_type_ids.to(config.DEVICE, dtype=torch.long)
        mask = mask.to(config.DEVICE, dtype=torch.long)
        targets_start = targets_start.to(config.DEVICE, dtype=torch.long)
        targets_end = targets_end.to(config.DEVICE, dtype=torch.long)

#                 outputs_start [32, 128]
        outputs_start1, outputs_end1 = model1(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        outputs_start2, outputs_end2 = model2(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        outputs_start3, outputs_end3 = model3(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        outputs_start4, outputs_end4 = model4(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        outputs_start5, outputs_end5 = model5(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        outputs_start = (
            outputs_start1
            + outputs_start2
            + outputs_start3
            + outputs_start4
            + outputs_start5
        ) / 5

        outputs_end = (
            outputs_end1
            + outputs_end2
            + outputs_end3
            + outputs_end4
            + outputs_end5
        ) / 5

        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()

        for px, tweet in enumerate(orig_tweet):
            selected_tweet = orig_selected[px]
            tweet_sentiment = sentiment[px]

            item_start = outputs_start[px, :]
            item_end = outputs_end[px, :]

            max_probability_start = np.max(item_start)
            max_probability_end = np.max(item_end)

#             if max_probability_start >= 0.8 and max_probability_end >= 0.8:
            _, output_sentence = calculate_jaccard_score(
                original_tweet=tweet,
                target_string=selected_tweet,
                sentiment_val=tweet_sentiment,
                idx_start=np.argmax(item_start),
                idx_end=np.argmax(item_end),
                offsets=offsets[px]
            )
            
            max_probability_start_list.append(max_probability_start)
            max_probability_end_list.append(max_probability_end)
            tweet_list.append(tweet)
            sentiment_list.append(tweet_sentiment)
            final_output.append(output_sentence)

meta_train_df['text'] = tweet_list
meta_train_df['sentiment'] = sentiment_list
meta_train_df['selected_text'] = final_output
meta_train_df['start_max'] = max_probability_start_list
meta_train_df['end_max'] = max_probability_end_list

###########################
### Get meta data model warm_up_steps_100
###########################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=381.0), HTML(value='')))




In [44]:
meta_train_df[meta_train_df['start_max'] >= 0.7][meta_train_df['end_max'] >= 0.7]

  """Entry point for launching an IPython kernel.


Unnamed: 0,text,sentiment,selected_text,start_max,end_max
5,No Topic Maps talks at the Balisage Markup Conference 2009 Program online at http://tr.im/mL6Z (via ) #topicmaps,neutral,No Topic Maps talks at the Balisage Markup Conference 2009 Program online at http://tr.im/mL6Z (via ) #topicmaps,0.997821,0.950961
8,Screw you ! I only have 3 weeks...,neutral,Screw you ! I only have 3 weeks...,0.999639,0.983572
9,Aw you would not unfollow me would you? Then I would cry,neutral,Aw you would not unfollow me would you? Then I would cry,0.991137,0.999574
12,middle school and elem. High schools will remain open for those who need credits to graduate. Cali is broken,neutral,middle school and elem. High schools will remain open for those who need credits to graduate. Cali is broken,0.994568,0.999130
13,hey yu lil **** i textd yu,neutral,hey yu lil **** i textd yu,0.998491,0.991769
...,...,...,...,...,...
12162,"hey guys, if you have something to ask, just ask, okay? we`ll accept your critics and comments. thanks guys",neutral,"hey guys, if you have something to ask, just ask, okay? we`ll accept your critics and comments. thanks guys",0.986107,0.988612
12163,"not really just leaving flat now, on the lookout for lunch fancy having a wee stroll but dunno where... Oh well!",neutral,"not really just leaving flat now, on the lookout for lunch fancy having a wee stroll but dunno where... Oh well!",0.997334,0.974085
12164,I think the lesson of the day is not to have luggage,neutral,I think the lesson of the day is not to have luggage,0.999124,0.999350
12165,"haha, yeah. Twitter has many uses. For me it`s just to know what the ppl i care about are doing",neutral,"haha, yeah. Twitter has many uses. For me it`s just to know what the ppl i care about are doing",0.997363,0.993949


In [39]:
ori_train_df = pd.read_csv(os.path.join(config.DATA_DIR, config.TRAIN_FILE))

In [48]:
extend_df = ori_train_df[['text', 'selected_text', 'sentiment']].append(meta_train_df[meta_train_df['start_max'] >= 0.7][meta_train_df['end_max'] >= 0.7][['text', 'selected_text', 'sentiment']], ignore_index = True)

  """Entry point for launching an IPython kernel.


In [51]:
extend_df.to_csv(os.path.join(config.DATA_DIR, 'extend_data_pseudo_label.csv'), index=False)