# Import

In [1]:
import os
import re
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import string
from tqdm import tqdm, tqdm_notebook

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import lr_scheduler

from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from scipy.special import softmax

import transformers
import tokenizers
from transformers import AdamW, get_linear_schedule_with_warmup

from IPython.core.debugger import set_trace

# Settings

In [2]:
class config:
    DATA_DIR = '../tweet_sentiment_extraction'
    ROBERTA_PATH = '../tweet_sentiment_extraction/roberta-base'
    OUTPUT_DIR = '../tweet_sentiment_extraction'
    TRAIN_FILE = 'train_folds.csv'
    TEST_FILE = 'test.csv'
    PREDICT_FILE = 'predict.csv'
    SAVE_MODEL_DIR = 'trained_model'
    PRETRAINED_MODELS = ['warm_up_steps_100', 
                         'loss_0.45_0.55', 
                         'warm_up_steps_128', 
                         'distance_loss',
                         'label_smoothing_0.1_epoch_3',
                         'label_smoothing_0.15',
#                          'conv1d_last_output',
                         'tweat_loss_function'
                        ]
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 32
    LOGGING_STEPS = 100
    SEED = 1111
    DEVICE = torch.device('cuda:1')
    TOKENIZER = tokenizers.ByteLevelBPETokenizer(
        vocab_file=f"{ROBERTA_PATH}/vocab.json", 
        merges_file=f"{ROBERTA_PATH}/merges.txt", 
        lowercase=True,
        add_prefix_space=True
    )

In [3]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
seed_everything(config.SEED)

# Utils

In [4]:
class AverageMeter:
    """
    Computes and stores the average and current value
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [5]:
# def jaccard_score(weights):
#     jaccards = AverageMeter()
    
#     for _, row in meta_train_df.iterrows():
#         outputs_start = torch.zeros((config.MAX_LEN,)).to(config.DEVICE)
#         outputs_end = torch.zeros((config.MAX_LEN,)).to(config.DEVICE)
        
#         for ii, pretrained_model in enumerate(config.PRETRAINED_MODELS):
#             outputs_start += row[f'{pretrained_model}_start'] * weights[ii]
#             outputs_end += row[f'{pretrained_model}_end'] * weights[ii]
    
#         outputs_start = torch.softmax(outputs_start, dim=0).cpu().detach().numpy()
#         outputs_end = torch.softmax(outputs_end, dim=0).cpu().detach().numpy()
        
#         jaccard_score, _ = calculate_jaccard_score(
#             original_tweet=row['tweet'],
#             target_string=row['selected_text'],
#             sentiment_val=row['sentiment'],
#             idx_start=np.argmax(outputs_start),
#             idx_end=np.argmax(outputs_end),
#             offsets=row['offsets']
#         )
    
#         jaccards.update(jaccard_score)
        
#     return -jaccards.avg

def jaccard_score(weights):
    weights = torch.tensor(weights, dtype=torch.float32).to(config.DEVICE)

    start_softmax = torch.softmax(torch.matmul(start_model_predict, weights), dim=1).cpu().numpy()
    end_softmax = torch.softmax(torch.matmul(end_model_predict, weights), dim=1).cpu().numpy()
    index_start_max = np.argmax(start_softmax, 1)
    index_end_max = np.argmax(end_softmax, 1)

    jaccards = AverageMeter()

    for i, row in meta_train_df.iterrows():
        index_start = index_start_max[i]
        index_end = index_end_max[i]

        jaccard_s, _ = calculate_jaccard_score(
            original_tweet=row['tweet'],
            target_string=row['selected_text'],
            sentiment_val=row['sentiment'],
            idx_start=index_start,
            idx_end=index_end,
            offsets=row['offsets']
        )

        jaccards.update(jaccard_s)

    return -jaccards.avg

# Data Processing

In [6]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
    tweet = " " + " ".join(str(tweet).split())
    selected_text = " " + " ".join(str(selected_text).split())

    len_st = len(selected_text) - 1
    idx0 = None
    idx1 = None

    for ind in (i for i, e in enumerate(tweet) if e == selected_text[1]):
        if " " + tweet[ind: ind+len_st] == selected_text:
            idx0 = ind
            idx1 = ind + len_st - 1
            break

    char_targets = [0] * len(tweet)
    if idx0 != None and idx1 != None:
        for ct in range(idx0, idx1 + 1):
            char_targets[ct] = 1
    
    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids
    tweet_offsets = tok_tweet.offsets
    
    target_idx = []
    for j, (offset1, offset2) in enumerate(tweet_offsets):
        if sum(char_targets[offset1: offset2]) > 0:
            target_idx.append(j)
    
    targets_start = target_idx[0]
    targets_end = target_idx[-1]

    sentiment_id = {
        'positive': 1313,
        'negative': 2430,
        'neutral': 7974
    }
    
    input_ids = [0] + [sentiment_id[sentiment]] + [2] + [2] + input_ids_orig + [2]
    token_type_ids = [0, 0, 0, 0] + [0] * (len(input_ids_orig) + 1)
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 4 + tweet_offsets + [(0, 0)]
    targets_start += 4
    targets_end += 4

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([1] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)
    
    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets
    }

# Data Loader

In [7]:
train_df = pd.read_csv(os.path.join(config.DATA_DIR, config.TRAIN_FILE))
train_df = train_df.dropna(how='any', axis=0)
# test_df = pd.read_csv(os.path.join(config.DATA_DIR, config.TEST_FILE))

In [8]:
class TweetDataset :
    def __init__(self, tweet, sentiment, selected_text):
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN
    
    def __len__(self):
        return len(self.tweet)
    
    def __getitem__(self, item):
        data = process_data(
            self.tweet[item],
            self.selected_text[item],
            self.sentiment[item],
            self.tokenizer,
            self.max_len
        )
        
        return {
            'ids': torch.tensor(data['ids'], dtype=torch.long),
            'mask': torch.tensor(data['mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
            'targets_start': torch.tensor(data["targets_start"], dtype=torch.long),
            'targets_end': torch.tensor(data["targets_end"], dtype=torch.long),
            'orig_tweet': data["orig_tweet"],
            'orig_selected': data["orig_selected"],
            'sentiment': data["sentiment"],
            'offsets': torch.tensor(data["offsets"], dtype=torch.long)
        }

# Model

In [9]:
class TweetModel(transformers.BertPreTrainedModel):
    def __init__(self, conf):
        super(TweetModel, self).__init__(conf)
        self.roberta = transformers.RobertaModel.from_pretrained(config.ROBERTA_PATH, config=conf)
        self.drop_out = nn.Dropout(0.1)
        self.l0 = nn.Linear(768 * 2, 200)
        torch.nn.init.normal_(self.l0.weight, std=0.02)
        self.l1 = nn.Linear(200, 2)
        torch.nn.init.normal_(self.l1.weight, std=0.02)
    
    def forward(self, ids, mask, token_type_ids):
        _, _, out = self.roberta(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )

        out = torch.cat((out[-1], out[-2]), dim=-1)
        out = self.drop_out(out)
        logits = self.l0(out)
        logits = self.l1(logits)

        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits
    
# class TweetModel(transformers.BertPreTrainedModel):
#     def __init__(self, conf):
#         super(TweetModel, self).__init__(conf)
#         self.roberta = transformers.RobertaModel.from_pretrained(config.ROBERTA_PATH, config=conf)
#         self.dropout = nn.Dropout(0.1)
        
#         self.qa_outputs1c = torch.nn.Conv1d(768, config.MAX_LEN, 2)
#         self.qa_outputs2c = torch.nn.Conv1d(768, config.MAX_LEN, 2)

#         self.qa_outputs1 = nn.Linear(config.MAX_LEN, 1)
#         self.qa_outputs2 = nn.Linear(config.MAX_LEN, 1)
    
#     def forward(self, ids, mask, token_type_ids):
#         _, _, out = self.roberta(
#             ids,
#             attention_mask=mask,
#             token_type_ids=token_type_ids
#         )

# #         out = torch.cat((out[-1], out[-2]), dim=-1)
#         s_out = self.dropout(out[-1])
#         s_out = torch.nn.functional.pad(s_out.transpose(1,2), (1, 0))

#         out1 = self.qa_outputs1c(s_out).transpose(1,2)
#         out2 = self.qa_outputs2c(s_out).transpose(1,2)

#         start_logits = self.qa_outputs1(self.dropout(out1)).squeeze(-1)
#         end_logits = self.qa_outputs2(self.dropout(out2)).squeeze(-1)

#         return start_logits, end_logits

In [10]:
def calculate_jaccard_score(original_tweet, target_string, sentiment_val, idx_start, idx_end, offsets):
    if idx_end < idx_start:
        idx_end = idx_start
        
    filtered_output  = ""
    for ix in range(idx_start, idx_end + 1):
        filtered_output += original_tweet[offsets[ix][0]: offsets[ix][1]]
        if (ix+1) < len(offsets) and offsets[ix][1] < offsets[ix+1][0]:
            filtered_output += " "
            
#     if sentiment_val == 'neutral' or len(original_tweet.split()) < 2:
    if len(original_tweet.split()) < 2:
        filtered_output = original_tweet
        
    jac = jaccard(target_string.strip(), filtered_output.strip())
        
    return jac, filtered_output

# Predict

In [11]:
def run_predict(i, pretrained_model):
    print('###########################')
    print('### Get meta data model {}'.format(pretrained_model))
    print('###########################')
    
    model_outputs_start = None
    model_outputs_end = None
    if i == 0:
        tweet_list = []
        selected_text_list = []
        sentiment_list = []
        offsets_full = None
        
    for fold in range(5):
        
        fold_train_df = train_df[train_df.kfold != fold].reset_index(drop=True)
        fold_valid_df = train_df[train_df.kfold == fold].reset_index(drop=True)
        
        valid_dataset = TweetDataset(
            tweet = fold_valid_df.text.values,
            sentiment = fold_valid_df.sentiment.values,
            selected_text = fold_valid_df.selected_text.values
        )

        valid_data_loader = torch.utils.data.DataLoader(
            valid_dataset,
            batch_size = config.VALID_BATCH_SIZE,
            shuffle = False,
            num_workers = 8
        )
        
        model_path = os.path.join(config.DATA_DIR, 
                                  config.SAVE_MODEL_DIR, 
                                  pretrained_model, 
                                  f'model_fold_{fold}.bin')
        
        model = TweetModel(conf=model_config)
        model.to(config.DEVICE)
        model.load_state_dict(torch.load(model_path))
        model.eval()
        
        with torch.no_grad():
            tk0 = tqdm_notebook(valid_data_loader, total=len(valid_data_loader), desc=f'Fold {fold}')
            for bi, d in enumerate(tk0):
                ids = d["ids"]
                token_type_ids = d["token_type_ids"]
                mask = d["mask"]
                sentiment = d["sentiment"]
                orig_selected = d["orig_selected"]
                orig_tweet = d["orig_tweet"]
                targets_start = d["targets_start"]
                targets_end = d["targets_end"]
                offsets = d["offsets"].numpy() # (32, 128, 2)
                
                ids = ids.to(config.DEVICE, dtype=torch.long)
                token_type_ids = token_type_ids.to(config.DEVICE, dtype=torch.long)
                mask = mask.to(config.DEVICE, dtype=torch.long)
                targets_start = targets_start.to(config.DEVICE, dtype=torch.long)
                targets_end = targets_end.to(config.DEVICE, dtype=torch.long)

#                 outputs_start [32, 128]
                outputs_start, outputs_end = model(
                    ids=ids,
                    mask=mask,
                    token_type_ids=token_type_ids
                )
                
                if i == 0:
                    tweet_list.extend(orig_tweet)
                    selected_text_list.extend(orig_selected)
                    sentiment_list.extend(sentiment)
                    
                    if offsets_full is None:
                        offsets_full = offsets
                    else:
                        offsets_full = np.vstack((offsets_full, offsets))
        
        
                if model_outputs_start is None:
                    model_outputs_start = outputs_start
                else:
                    model_outputs_start = torch.cat([model_outputs_start, outputs_start], dim=0)
                    
                if model_outputs_end is None:
                    model_outputs_end = outputs_end
                else:
                    model_outputs_end = torch.cat([model_outputs_end, outputs_end], dim=0)
    
    if i == 0:
        meta_train_df['tweet'] = tweet_list
        meta_train_df['selected_text'] = selected_text_list
        meta_train_df['sentiment'] = sentiment_list
        meta_train_df['offsets'] = [offsets_full[i] for i in range(offsets_full.shape[0])]
        
        
    meta_train_df[f'{pretrained_model}_start'] = [model_outputs_start[i] for i in range(model_outputs_start.shape[0])]
    meta_train_df[f'{pretrained_model}_end'] = [model_outputs_end[i] for i in range(model_outputs_end.shape[0])]

In [12]:
model_config = transformers.RobertaConfig.from_pretrained(config.ROBERTA_PATH)
model_config.output_hidden_states = True

meta_train_df = pd.DataFrame()

In [13]:
for i, pretrained_model in enumerate(config.PRETRAINED_MODELS):
    run_predict(i, pretrained_model)

###########################
### Get meta data model warm_up_steps_100
###########################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, description='Fold 0', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 1', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 2', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 3', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 4', max=172.0, style=ProgressStyle(description_width…


###########################
### Get meta data model loss_0.45_0.55
###########################


HBox(children=(FloatProgress(value=0.0, description='Fold 0', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 1', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 2', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 3', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 4', max=172.0, style=ProgressStyle(description_width…


###########################
### Get meta data model warm_up_steps_128
###########################


HBox(children=(FloatProgress(value=0.0, description='Fold 0', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 1', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 2', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 3', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 4', max=172.0, style=ProgressStyle(description_width…


###########################
### Get meta data model distance_loss
###########################


HBox(children=(FloatProgress(value=0.0, description='Fold 0', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 1', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 2', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 3', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 4', max=172.0, style=ProgressStyle(description_width…


###########################
### Get meta data model label_smoothing_0.1_epoch_3
###########################


HBox(children=(FloatProgress(value=0.0, description='Fold 0', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 1', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 2', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 3', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 4', max=172.0, style=ProgressStyle(description_width…


###########################
### Get meta data model label_smoothing_0.15
###########################


HBox(children=(FloatProgress(value=0.0, description='Fold 0', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 1', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 2', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 3', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 4', max=172.0, style=ProgressStyle(description_width…


###########################
### Get meta data model tweat_loss_function
###########################


HBox(children=(FloatProgress(value=0.0, description='Fold 0', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 1', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 2', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 3', max=172.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Fold 4', max=172.0, style=ProgressStyle(description_width…




In [14]:
# run_predict(1, 'conv1d_last_output')

In [15]:
for model_name in config.PRETRAINED_MODELS:
    meta_train_df[f'{model_name}_start_max'] = meta_train_df[f'{model_name}_start'].apply(lambda x: np.argmax(torch.softmax(x, dim=0).cpu().detach().numpy()))
    meta_train_df[f'{model_name}_end_max'] = meta_train_df[f'{model_name}_end'].apply(lambda x: np.argmax(torch.softmax(x, dim=0).cpu().detach().numpy()))

In [16]:
meta_train_df.head()

Unnamed: 0,tweet,selected_text,sentiment,offsets,warm_up_steps_100_start,warm_up_steps_100_end,loss_0.45_0.55_start,loss_0.45_0.55_end,warm_up_steps_128_start,warm_up_steps_128_end,...,warm_up_steps_128_start_max,warm_up_steps_128_end_max,distance_loss_start_max,distance_loss_end_max,label_smoothing_0.1_epoch_3_start_max,label_smoothing_0.1_epoch_3_end_max,label_smoothing_0.15_start_max,label_smoothing_0.15_end_max,tweat_loss_function_start_max,tweat_loss_function_end_max
0,http://www.dothebouncy.com/smf - some shamele...,http://www.dothebouncy.com/smf - some shamele...,neutral,"[[0, 0], [0, 0], [0, 0], [0, 0], [0, 5], [5, 8...","[tensor(-4.1104, device='cuda:1'), tensor(-7.1...","[tensor(-4.2847, device='cuda:1'), tensor(-9.1...","[tensor(-5.4201, device='cuda:1'), tensor(-7.2...","[tensor(-6.6316, device='cuda:1'), tensor(-8.2...","[tensor(-5.4729, device='cuda:1'), tensor(-6.4...","[tensor(-8.3191, device='cuda:1'), tensor(-8.8...",...,19,30,4,30,19,30,19,30,4,30
1,Soooo high,Soooo high,neutral,"[[0, 0], [0, 0], [0, 0], [0, 0], [0, 3], [3, 6...","[tensor(-3.9394, device='cuda:1'), tensor(-7.1...","[tensor(-4.2483, device='cuda:1'), tensor(-9.3...","[tensor(-5.3600, device='cuda:1'), tensor(-6.9...","[tensor(-6.1883, device='cuda:1'), tensor(-8.5...","[tensor(-5.5256, device='cuda:1'), tensor(-6.3...","[tensor(-8.3316, device='cuda:1'), tensor(-8.8...",...,4,6,4,6,4,6,4,6,4,6
2,Both of you,Both of you,neutral,"[[0, 0], [0, 0], [0, 0], [0, 0], [0, 5], [5, 8...","[tensor(-3.9214, device='cuda:1'), tensor(-7.1...","[tensor(-4.1463, device='cuda:1'), tensor(-9.3...","[tensor(-5.2640, device='cuda:1'), tensor(-6.9...","[tensor(-6.4536, device='cuda:1'), tensor(-8.4...","[tensor(-5.5179, device='cuda:1'), tensor(-6.4...","[tensor(-8.2840, device='cuda:1'), tensor(-8.8...",...,4,6,4,6,4,6,4,6,4,6
3,i want to go to music tonight but i lost my v...,lost,negative,"[[0, 0], [0, 0], [0, 0], [0, 0], [0, 2], [2, 7...","[tensor(-4.4005, device='cuda:1'), tensor(-6.4...","[tensor(-6.5967, device='cuda:1'), tensor(-8.9...","[tensor(-5.9398, device='cuda:1'), tensor(-7.1...","[tensor(-7.0927, device='cuda:1'), tensor(-7.4...","[tensor(-5.4113, device='cuda:1'), tensor(-5.8...","[tensor(-7.5815, device='cuda:1'), tensor(-7.6...",...,13,16,13,13,13,13,13,13,13,13
4,Hes just not that into you,Hes just not that into you,neutral,"[[0, 0], [0, 0], [0, 0], [0, 0], [0, 4], [4, 9...","[tensor(-3.9571, device='cuda:1'), tensor(-7.1...","[tensor(-4.2409, device='cuda:1'), tensor(-9.3...","[tensor(-5.3277, device='cuda:1'), tensor(-7.0...","[tensor(-6.4537, device='cuda:1'), tensor(-8.4...","[tensor(-5.5039, device='cuda:1'), tensor(-6.2...","[tensor(-8.3873, device='cuda:1'), tensor(-8.9...",...,4,9,4,9,4,9,4,9,4,9


In [None]:
# print('Simple averaging:', jaccard_score([0.333333333, 0.333333333, 0.333333333]))

# Correlation

In [17]:
# def correlation(df):
#     base_model = config.PRETRAINED_MODELS[0]

#     for model_name in config.PRETRAINED_MODELS[1:]:
#         print(f"### Model {base_model} - {model_name}")
#         print("Start correlation")
#         print("Correlation: {}".format(df[f'{base_model}_start_max'].corr(df[f'{model_name}_start_max'], method='pearson')))

#         print("\nEnd correlation")
#         print("Correlation: {}".format(df[f'{base_model}_end_max'].corr(df[f'{model_name}_end_max'], method='pearson')))
        
def correlation(df):
    cm = sns.light_palette("green", as_cmap=True) 
    
    corr = df[[f'{model_name}_start_max' for model_name in config.PRETRAINED_MODELS]].corr()
    display(corr.style.background_gradient(cmap=cm).set_precision(3))

    corr = df[[f'{model_name}_end_max' for model_name in config.PRETRAINED_MODELS]].corr()
    display(corr.style.background_gradient(cmap=cm).set_precision(3))

In [18]:
correlation(meta_train_df)

Unnamed: 0,warm_up_steps_100_start_max,loss_0.45_0.55_start_max,warm_up_steps_128_start_max,distance_loss_start_max,label_smoothing_0.1_epoch_3_start_max,label_smoothing_0.15_start_max,tweat_loss_function_start_max
warm_up_steps_100_start_max,1.0,0.953,0.954,0.953,0.945,0.953,0.942
loss_0.45_0.55_start_max,0.953,1.0,0.951,0.956,0.949,0.955,0.947
warm_up_steps_128_start_max,0.954,0.951,1.0,0.951,0.947,0.951,0.946
distance_loss_start_max,0.953,0.956,0.951,1.0,0.946,0.951,0.948
label_smoothing_0.1_epoch_3_start_max,0.945,0.949,0.947,0.946,1.0,0.947,0.94
label_smoothing_0.15_start_max,0.953,0.955,0.951,0.951,0.947,1.0,0.943
tweat_loss_function_start_max,0.942,0.947,0.946,0.948,0.94,0.943,1.0


Unnamed: 0,warm_up_steps_100_end_max,loss_0.45_0.55_end_max,warm_up_steps_128_end_max,distance_loss_end_max,label_smoothing_0.1_epoch_3_end_max,label_smoothing_0.15_end_max,tweat_loss_function_end_max
warm_up_steps_100_end_max,1.0,0.961,0.964,0.964,0.955,0.963,0.95
loss_0.45_0.55_end_max,0.961,1.0,0.965,0.963,0.96,0.962,0.954
warm_up_steps_128_end_max,0.964,0.965,1.0,0.965,0.958,0.965,0.956
distance_loss_end_max,0.964,0.963,0.965,1.0,0.958,0.963,0.953
label_smoothing_0.1_epoch_3_end_max,0.955,0.96,0.958,0.958,1.0,0.956,0.951
label_smoothing_0.15_end_max,0.963,0.962,0.965,0.963,0.956,1.0,0.953
tweat_loss_function_end_max,0.95,0.954,0.956,0.953,0.951,0.953,1.0


In [19]:
print('### Positive')
positive_df = meta_train_df[meta_train_df['sentiment'] == 'positive']
correlation(positive_df)

### Positive


Unnamed: 0,warm_up_steps_100_start_max,loss_0.45_0.55_start_max,warm_up_steps_128_start_max,distance_loss_start_max,label_smoothing_0.1_epoch_3_start_max,label_smoothing_0.15_start_max,tweat_loss_function_start_max
warm_up_steps_100_start_max,1.0,0.952,0.953,0.951,0.945,0.95,0.937
loss_0.45_0.55_start_max,0.952,1.0,0.949,0.952,0.948,0.953,0.941
warm_up_steps_128_start_max,0.953,0.949,1.0,0.949,0.943,0.95,0.946
distance_loss_start_max,0.951,0.952,0.949,1.0,0.946,0.951,0.937
label_smoothing_0.1_epoch_3_start_max,0.945,0.948,0.943,0.946,1.0,0.948,0.935
label_smoothing_0.15_start_max,0.95,0.953,0.95,0.951,0.948,1.0,0.94
tweat_loss_function_start_max,0.937,0.941,0.946,0.937,0.935,0.94,1.0


Unnamed: 0,warm_up_steps_100_end_max,loss_0.45_0.55_end_max,warm_up_steps_128_end_max,distance_loss_end_max,label_smoothing_0.1_epoch_3_end_max,label_smoothing_0.15_end_max,tweat_loss_function_end_max
warm_up_steps_100_end_max,1.0,0.919,0.926,0.929,0.91,0.924,0.902
loss_0.45_0.55_end_max,0.919,1.0,0.932,0.926,0.924,0.92,0.914
warm_up_steps_128_end_max,0.926,0.932,1.0,0.927,0.92,0.929,0.917
distance_loss_end_max,0.929,0.926,0.927,1.0,0.918,0.928,0.911
label_smoothing_0.1_epoch_3_end_max,0.91,0.924,0.92,0.918,1.0,0.917,0.908
label_smoothing_0.15_end_max,0.924,0.92,0.929,0.928,0.917,1.0,0.914
tweat_loss_function_end_max,0.902,0.914,0.917,0.911,0.908,0.914,1.0


In [20]:
print('### Negative')
negative_df = meta_train_df[meta_train_df['sentiment'] == 'negative']
correlation(negative_df)

### Negative


Unnamed: 0,warm_up_steps_100_start_max,loss_0.45_0.55_start_max,warm_up_steps_128_start_max,distance_loss_start_max,label_smoothing_0.1_epoch_3_start_max,label_smoothing_0.15_start_max,tweat_loss_function_start_max
warm_up_steps_100_start_max,1.0,0.928,0.93,0.928,0.916,0.929,0.914
loss_0.45_0.55_start_max,0.928,1.0,0.928,0.938,0.922,0.932,0.923
warm_up_steps_128_start_max,0.93,0.928,1.0,0.928,0.922,0.925,0.919
distance_loss_start_max,0.928,0.938,0.928,1.0,0.917,0.924,0.929
label_smoothing_0.1_epoch_3_start_max,0.916,0.922,0.922,0.917,1.0,0.918,0.91
label_smoothing_0.15_start_max,0.929,0.932,0.925,0.924,0.918,1.0,0.915
tweat_loss_function_start_max,0.914,0.923,0.919,0.929,0.91,0.915,1.0


Unnamed: 0,warm_up_steps_100_end_max,loss_0.45_0.55_end_max,warm_up_steps_128_end_max,distance_loss_end_max,label_smoothing_0.1_epoch_3_end_max,label_smoothing_0.15_end_max,tweat_loss_function_end_max
warm_up_steps_100_end_max,1.0,0.929,0.934,0.932,0.92,0.935,0.907
loss_0.45_0.55_end_max,0.929,1.0,0.931,0.932,0.926,0.937,0.906
warm_up_steps_128_end_max,0.934,0.931,1.0,0.935,0.922,0.936,0.913
distance_loss_end_max,0.932,0.932,0.935,1.0,0.928,0.931,0.912
label_smoothing_0.1_epoch_3_end_max,0.92,0.926,0.922,0.928,1.0,0.918,0.903
label_smoothing_0.15_end_max,0.935,0.937,0.936,0.931,0.918,1.0,0.907
tweat_loss_function_end_max,0.907,0.906,0.913,0.912,0.903,0.907,1.0


In [21]:
print('### Neutral')
neutral_df = meta_train_df[meta_train_df['sentiment'] == 'neutral']
correlation(neutral_df)

### Neutral


Unnamed: 0,warm_up_steps_100_start_max,loss_0.45_0.55_start_max,warm_up_steps_128_start_max,distance_loss_start_max,label_smoothing_0.1_epoch_3_start_max,label_smoothing_0.15_start_max,tweat_loss_function_start_max
warm_up_steps_100_start_max,1.0,0.723,0.731,0.714,0.644,0.75,0.631
loss_0.45_0.55_start_max,0.723,1.0,0.694,0.625,0.696,0.743,0.7
warm_up_steps_128_start_max,0.731,0.694,1.0,0.613,0.647,0.784,0.605
distance_loss_start_max,0.714,0.625,0.613,1.0,0.567,0.636,0.724
label_smoothing_0.1_epoch_3_start_max,0.644,0.696,0.647,0.567,1.0,0.687,0.644
label_smoothing_0.15_start_max,0.75,0.743,0.784,0.636,0.687,1.0,0.662
tweat_loss_function_start_max,0.631,0.7,0.605,0.724,0.644,0.662,1.0


Unnamed: 0,warm_up_steps_100_end_max,loss_0.45_0.55_end_max,warm_up_steps_128_end_max,distance_loss_end_max,label_smoothing_0.1_epoch_3_end_max,label_smoothing_0.15_end_max,tweat_loss_function_end_max
warm_up_steps_100_end_max,1.0,0.99,0.992,0.991,0.987,0.99,0.989
loss_0.45_0.55_end_max,0.99,1.0,0.993,0.992,0.988,0.989,0.991
warm_up_steps_128_end_max,0.992,0.993,1.0,0.993,0.989,0.991,0.99
distance_loss_end_max,0.991,0.992,0.993,1.0,0.986,0.99,0.988
label_smoothing_0.1_epoch_3_end_max,0.987,0.988,0.989,0.986,1.0,0.986,0.988
label_smoothing_0.15_end_max,0.99,0.989,0.991,0.99,0.986,1.0,0.988
tweat_loss_function_end_max,0.989,0.991,0.99,0.988,0.988,0.988,1.0


# Ensemble

## Blending

In [22]:
start_model_predict_list = []
end_model_predict_list = []
blend_models = ['warm_up_steps_100',
                'loss_0.45_0.55',
                'label_smoothing_0.1_epoch_3',
               ]

for ii, pretrained_model in enumerate(blend_models):
#     meta_train_df[f'{pretrained_model}_start'] = meta_train_df[f'{pretrained_model}_start'].apply(lambda x: x.to(config.DEVICE))
#     meta_train_df[f'{pretrained_model}_end'] = meta_train_df[f'{pretrained_model}_end'].apply(lambda x: x.to(config.DEVICE))
    
    start_tensor = torch.stack(list(meta_train_df[f'{pretrained_model}_start'].values))
    end_tensor = torch.stack(list(meta_train_df[f'{pretrained_model}_end'].values))
    
    start_model_predict_list.append(start_tensor)
    end_model_predict_list.append(end_tensor)

In [23]:
start_model_predict = torch.stack(start_model_predict_list).permute(1, 2, 0)
end_model_predict = torch.stack(end_model_predict_list).permute(1, 2, 0)

In [24]:
jaccard_score([1.0/len(blend_models), ] * len(blend_models))

-0.7110215311851128

### Minimization

In [25]:
lls = []
wghts = []

for i in range(100):
    starting_values = np.random.uniform(size=len(blend_models))
    print(f'#Step: {i} Starting Values: {starting_values}')
    cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
    bounds = [(0, 1)] * len(config.PRETRAINED_MODELS)

    res = minimize(jaccard_score, starting_values, method='COBYLA', bounds=bounds, 
                   options={'disp': True, 'maxiter': 1000})
    
    lls.append(res['fun'])
    wghts.append(res['x'])
    print('Weights: {weights}  Score: {score}'.format(weights=res['x'], score=res['fun']))

bestSC = np.min(lls)
bestWght = wghts[np.argmin(lls)]

print('\n Ensemble Score: {best_score}'.format(best_score=bestSC))
print('\n Best Weights: {weights}'.format(weights=bestWght))

#Step: 0 Starting Values: [0.0955492  0.9250037  0.34357342 0.31047694]




Weights: [1.04512689 1.40910564 1.35407344 0.15515397]  Score: -0.7126583944417036
#Step: 1 Starting Values: [0.00200984 0.23559472 0.23779172 0.73591587]
Weights: [ 2.07977179  1.22186579  1.64163017 -0.86513152]  Score: -0.712497338977579
#Step: 2 Starting Values: [0.49546808 0.78442535 0.12650631 0.60664932]
Weights: [ 1.38689171  1.8994657   1.5165766  -0.43794839]  Score: -0.712408533257906
#Step: 3 Starting Values: [0.46612097 0.23713212 0.43515918 0.24367151]
Weights: [0.57169908 0.52947369 0.36736703 0.14307611]  Score: -0.7124922379537034
#Step: 4 Starting Values: [0.38383991 0.83839369 0.65518473 0.14844667]
Weights: [0.27345664 0.90909357 0.93252944 0.22201299]  Score: -0.712458693157514
#Step: 5 Starting Values: [0.63914517 0.63737456 0.61087429 0.93001855]
Weights: [2.18172396 1.8651856  0.4975133  0.27119367]  Score: -0.7124528461192287
#Step: 6 Starting Values: [0.81649992 0.76942493 0.08540093 0.66500273]
Weights: [ 2.07878427  1.64709179  1.72596875 -0.31943768]  Score

KeyboardInterrupt: 

### MCMC

In [177]:
n = 1000
counter = 0
result={}
num = len(config.PRETRAINED_MODELS)
weight = np.array([1.0/num,]*num)
old_score = 0.0
best_score = 0.0
best_weight = None

for i in range(0, n):
    new_weights = weight + np.array([0.005,] * num) * np.random.normal(loc=0.0, scale=1.0, size=num)

    new_score = -jaccard_score(new_weights)
#     print(new_score)
    if best_score < new_score:
        best_score = new_score
        best_weight = new_weights
        
    diff = new_score - old_score
    prob = min(1, np.exp(-diff/.3))
    random_prob = np.random.rand()
    
    if random_prob < prob:
        result[i] = (new_score, old_score, prob, random_prob, new_weights)
        weight = new_weights
        old_score = new_score
        counter += 1

print(f'Best score {best_score}')
print(f'Best weights {best_weight}')

Best score 0.7133585204720817
Best weights [0.34684806 0.21034593 0.17571597 0.02993298 0.10681787 0.22442206]


## Stacking