In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tweet-sentiment-extraction/test.csv
/kaggle/input/tweet-sentiment-extraction/sample_submission.csv
/kaggle/input/tweet-sentiment-extraction/train.csv
/kaggle/input/glove-twitter/glove.twitter.27B.100d.txt
/kaggle/input/glove-twitter/glove.twitter.27B.200d.txt
/kaggle/input/glove-twitter/glove.twitter.27B.50d.txt
/kaggle/input/glove-twitter/glove.twitter.27B.25d.txt


In [2]:
# Load the competetion datasets

class KaggleReader(object):
    def __init__(self):
        self.df_objects = {}        
    def read_kaggle_df(self, df_name, df_path):
        self.df_objects[df_name] = pd.read_csv(df_path)
        print()
        print(df_name + ' loaded')
        print("Shape => " + str(self.df_objects[df_name].shape))
    
df_objs = KaggleReader()
df_objs.read_kaggle_df('train', "../input/tweet-sentiment-extraction/train.csv")
df_objs.read_kaggle_df('test', "../input/tweet-sentiment-extraction/test.csv")
df_objs.read_kaggle_df('submission', "../input/tweet-sentiment-extraction/sample_submission.csv")


train loaded
Shape => (27481, 4)

test loaded
Shape => (3534, 3)

submission loaded
Shape => (3534, 2)


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from numpy import hstack, vstack
# Tokenize text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence
from keras.utils import to_categorical
from nltk.tokenize import WordPunctTokenizer

# Model
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, LSTM, Embedding, Dropout, Input, Bidirectional, TimeDistributed, BatchNormalization
from keras.initializers import Constant
from keras.layers.merge import concatenate
import keras

from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [5]:
class Handlers(object):
    def __init__(self):
        self.run_tests()
        pass
    
    def draw_pr_curve_plt(self, Y_valid, y_pred, x_range=1.0):
    #     (precision, recall, x_range=1.0):

            precision, recall, thresholds_pr = precision_recall_curve(Y_valid, y_pred)

            # import dependencies
            import matplotlib.pyplot as plt

            plt.step(recall, precision, color='b', alpha=0.2,
                     where='post')
            plt.fill_between(recall, precision, alpha=0.2, color='b')
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.ylim([0.0, 1.05])
            plt.xlim([0.0, x_range])
            plt.show()
     
    def draw_roc_curve_plt(self, Y_valid, y_pred):
#     (fpr, tpr, auc):
        # import dependencies
        import matplotlib.pyplot as plt
        fpr, tpr, thresholds_roc = roc_curve(Y_valid, y_pred)
        auc_roc = auc(fpr, tpr)
        
        plt.figure(1)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.plot(fpr, tpr, label='(area = {:.3f})'.format(auc_roc))
        # plt.plot(fpr_rf, tpr_rf, label='RF (area = {:.3f})'.format(auc_rf))
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.title('ROC curve')
        plt.legend(loc='best')
        plt.show()

    # Optimize using KMP
    def return_start_end_indices(self, big, small):
        if len(small) <= 0 or len(big) <= 0:
            return (-1,-1)
        i = 0
        j = 0
        started = False
        while i < len(big):
            if big[i] == small[j]:
                if started == False:
                    started = True
                    i_start = i

                if j == len(small) - 1:
                    return (i-len(small)+1, i)
                j += 1

            else:
                if started == True:
                    started = False
                    i = i_start + 1
                j = 0
            i += 1
        return (-1,-1)
    
    def run_tests(self):
        
        assert self.return_start_end_indices(['i', '`', 'd', 'have', 'responded', ',', 'if', 'i', 'were', 'going'], ['have', 'responded', ',']) == (3,5)
        assert self.return_start_end_indices('abc', '') == (-1,-1)
        assert self.return_start_end_indices('abc', 'a') == (0,0)
        assert self.return_start_end_indices('abc', 'b') == (1,1)
        assert self.return_start_end_indices('abc', 'c') == (2,2)
        assert self.return_start_end_indices('abc', 'ab') == (0,1)
        assert self.return_start_end_indices('abc', 'bc') == (1,2)
        assert self.return_start_end_indices('abc', 'ac') == (-1,-1)
        assert self.return_start_end_indices('abc', 'abc') == (0,2)
        assert self.return_start_end_indices('abcabcabc', 'abc') == (0,2)
        assert self.return_start_end_indices('ababcc', 'abc') == (2,4)
    
    def jaccard(self, str1, str2): 
        a = set(str1.lower().split()) 
        b = set(str2.lower().split())
        c = a.intersection(b)
        return float(len(c)) / (len(a) + len(b) - len(c))
    
    def load_gloVe_embeddings(self):
        # Boilerplate taken from here - https://www.kaggle.com/stacykurnikova/using-glove-embedding
        embeddings_index = {}
        f = open('/kaggle/input/glove-twitter/glove.twitter.27B.25d.txt')
        for line in f:
            values = line.split(' ')
            word = values[0] ## The first entry is the word
            coefs = np.asarray(values[1:], dtype='float32') ## These are the vecotrs representing the embedding for the word
            embeddings_index[word] = coefs
        f.close()
        print('GloVe data loaded')
        return embeddings_index
    
    def load_embeddings_matrix(self, embeddings_index, index_tokenizer):
        # https://www.kaggle.com/stacykurnikova/using-glove-embedding
        # Create an embedding matrix with embedding vectors for the tokens recognized in the vocab of tweets
        
        EMBEDDING_DIM = embeddings_index.get('a').shape[0]
        # num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
        num_words = len(index_tokenizer.word_index) + 1

        # To Do: constrain the vocab size
        embedding_matrix = np.random.uniform(-1,+1,(num_words, EMBEDDING_DIM))
        count_in_embedding_vocab = 0
        for word, i in index_tokenizer.word_index.items():
            embedding_vector = embeddings_index.get(word) ## This references the loaded embeddings dictionary
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                embedding_matrix[i] = embedding_vector
                count_in_embedding_vocab += 1
#         TO DO:
#             else:
#                 PASS EMBEDDINGS OF SYNONYMS (!!! == !)
        return embedding_matrix
        # print(count_in_embedding_vocab)
        # print(len(embedding_matrix))
    
    def get_token_indices(self, x):
        span_generator = WordPunctTokenizer().span_tokenize(x)
        spans = [span for span in span_generator]
        return spans
    
    # TO DO: compensate for end and start on top of padding
    def get_pred_text_span(self, x):
        start_token_index = int(x['pred_start'])
        end_token_index = int(x['pred_end'])
        token_indices_list = x['tokens_indices']
        text = x['text']

        # start token ind > end token ind (could change logic to also have either 0:end)
        if start_token_index > end_token_index:
            return text

        # start token ind and end token ind within bounds
        elif start_token_index < len(token_indices_list) and end_token_index < len(token_indices_list):
            return text[token_indices_list[start_token_index][0]: token_indices_list[end_token_index][1]]


        # start token ind after bounds (could change logic to also have either 0:end or sth similar)
        elif start_token_index >= len(token_indices_list):
            return text

        # only end token ind out of bounds
        elif start_token_index < len(token_indices_list) and end_token_index >= len(token_indices_list):
            return text[token_indices_list[start_token_index][0]: len(text)-1]
        
    def get_preds_out(self, preds_indexes, test_df):
        temp_df = pd.concat([test_df, pd.DataFrame(preds_indexes, columns=['pred_start', 'pred_end'])], axis=1)
        temp_df['tokens_indices'] = temp_df['trans_text'].apply(handlers.get_token_indices)
        # print((temp_df['tokens_indices'].apply(len) == temp_df['tokens'].apply(len)).value_counts())

        # Get the predictions
        return temp_df.apply(handlers.get_pred_text_span, axis = 1)
    
    def get_indexes_from_argmax(self, preds, test_df):
        start_preds = preds[0]
        end_preds = preds[1]
        start_preds = start_preds.reshape((start_preds.shape[0], start_preds.shape[1]))
        end_preds = end_preds.reshape((end_preds.shape[0], end_preds.shape[1]))
        start_inds = start_preds.argmax(axis=1)
        end_inds = end_preds.argmax(axis=1)
        return vstack([start_inds, end_inds]).transpose()
        
        
    def batch_jaccard(self, preds, test_df):
        temp_df = pd.concat([test_df, pd.DataFrame(preds, columns=['pred_start', 'pred_end'])], axis=1)
        temp_df['tokens_indices'] = temp_df['trans_text'].apply(handlers.get_token_indices)
        # print((temp_df['tokens_indices'].apply(len) == temp_df['tokens'].apply(len)).value_counts())

        # Get the predictions
        temp_df['out_pred_span'] = temp_df.apply(handlers.get_pred_text_span, axis = 1)
        return temp_df.apply(lambda x: handlers.jaccard(x['selected_text'], x['out_pred_span']), axis=1).mean()
    
    
    def batch_jaccard_from_argmax(self, preds, test_df):
        start_preds = preds[0]
        end_preds = preds[1]
        start_preds = start_preds.reshape((start_preds.shape[0], start_preds.shape[1]))
        end_preds = end_preds.reshape((end_preds.shape[0], end_preds.shape[1]))
        start_inds = start_preds.argmax(axis=1)
        end_inds = end_preds.argmax(axis=1)
        return self.batch_jaccard(vstack([start_inds, end_inds]).transpose(), test_df)    

handlers = Handlers()


# To add Jaccard Similarity Value on Validation set after each epoch
class Metrics(keras.callbacks.Callback):
    # https://stackoverflow.com/questions/37657260/how-to-implement-custom-metric-in-keras
    def on_train_begin(self, logs={}):
        self._data = []

    def on_epoch_end(self, batch, logs={}):
        X_val, X_sentiment, y_val = self.validation_data[0], self.validation_data[1], self.validation_data[2]
        y_predict = np.asarray(self.model.predict([X_val, X_sentiment]))
        print('Val Jaccard Similarity: {}'.format(handlers.batch_jaccard(y_predict, test_df))) 
        return

    def get_data(self):
        return self._data
    
class MetricsCategorical(keras.callbacks.Callback):
    # https://stackoverflow.com/questions/37657260/how-to-implement-custom-metric-in-keras
    def on_train_begin(self, logs={}):
        self._data = []

    def on_epoch_end(self, batch, logs={}):
        X_val, X_numerics, y_val = self.validation_data[0], self.validation_data[1], self.validation_data[2]
        y_predict = np.asarray(self.model.predict([X_val, X_numerics]))
        print('Val Jaccard Similarity: {}'.format(handlers.batch_jaccard_from_argmax(y_predict, test_df))) 
        return

    def get_data(self):
        return self._data    
    
metrics = Metrics()
metrics_categorical = MetricsCategorical()

In [6]:
MAX_SEQUENCE_LENGTH = 40

class DataManipulationPipeline(object):
    def __init__(self):
        # Load dependencies
        try:
            self.handlers = handlers
        except NameError:
            self.handlers = Handlers()
            
        # init vars
        self.vars = {
            'MAX_SEQUENCE_LENGTH' : MAX_SEQUENCE_LENGTH
        }

    def pre_fit(self, X):
        
        # Strip the text
        X['text'] = X['text'].str.strip()
        # Lower case the text
        X['trans_text'] = X['text'].apply(str.lower)
        return X

    def create_sequences(self, X):
        # Tokenize the word tokens to word_indexes
        sequences = self.vars['keras_index_tokenizer'].texts_to_sequences(X['tokens'])
        # Pad the sequences to be fed to NN [Note that this will effectively change the start, end index if padded on post]
        sequences_padded = pad_sequences(sequences, maxlen=self.vars['MAX_SEQUENCE_LENGTH'], padding='post', truncating='post')        

        return sequences_padded
    
    def create_Y(self, X):
        X['trans_selected_text'] = X['selected_text'].apply(str.lower)
        X = self.punct_tokenize(X, 'trans_selected_text', 'tokens_selected_text', self.vars['punct_tokenizer'])
        X['start_end_indices'] = X.apply(lambda x: handlers.return_start_end_indices(x['tokens'], x['tokens_selected_text']), axis=1)
        # truncate the start end indices to end of MAX LEN of sequence
        X['start_end_indices'] = X['start_end_indices'].apply(lambda x: (x[0], (MAX_SEQUENCE_LENGTH-1) if x[1]>= MAX_SEQUENCE_LENGTH else x[1]))
        # if start is ahead of end, put stamp
        X['start_end_indices'] = X['start_end_indices'].apply(lambda x: (x[0],x[1]) if x[0] <= x[1] else (-1,-1))
        
        X = X[(X['start_end_indices'] != (-1,-1))]
        X = X.reset_index(drop=True)
        X['start_ind'] = X['start_end_indices'].apply(lambda x: x[0])
        X['end_ind'] = X['start_end_indices'].apply(lambda x: x[1])
        Y = hstack(
            (
                X['start_ind'].values.reshape(X.shape[0],1),
                X['end_ind'].values.reshape(X.shape[0],1)
            )
        )
        return Y,X
    
    def create_Y_Categorical(self, Y):
        Y_categorical = np.array([[0]*MAX_SEQUENCE_LENGTH for j in range(Y.shape[0])])
        for i in range(Y.shape[0]):
            for j in range(Y[i][0], Y[i][1]+1):
                Y_categorical[i][j] = 1
        
        return keras.utils.to_categorical(Y_categorical, 2)
    
    def create_Y_Categorical_two_outs(self, Y):
        Y_categorical_start = np.array([[0]*MAX_SEQUENCE_LENGTH for j in range(Y.shape[0])])
        for i in range(Y.shape[0]): # for each example
            # print(Y[i])
            Y_categorical_start[i][Y[i][0]] = 1
        Y_categorical_start = Y_categorical_start.reshape(Y_categorical_start.shape[0], Y_categorical_start.shape[1], 1)

        Y_categorical_end = np.array([[0]*MAX_SEQUENCE_LENGTH for j in range(Y.shape[0])])
        for i in range(Y.shape[0]): # for each example
            Y_categorical_end[i][Y[i][1]] = 1
        Y_categorical_end = Y_categorical_end.reshape(Y_categorical_end.shape[0], Y_categorical_end.shape[1], 1)            

#         return (keras.utils.to_categorical(Y_categorical_start, 2), keras.utils.to_categorical(Y_categorical_end, 2))
        return (Y_categorical_start, Y_categorical_end)
        
    
    def handle_sentiment_feature_eng(self, X):
        sentiment_transform = self.vars['sentiment_one_hot'].transform(X['sentiment'].values.reshape((X['sentiment'].shape[0],1)))
        # Copy over each data row for MAX_SEQUENCE_LENGTH times to send it inside each LSTM sequence
#         sentiment_transform_repeated = np.array([([sentiment_transform[i] for x in range(MAX_SEQUENCE_LENGTH)]) for i in range(X.shape[0])])
        
        # To Do: Scale?
        return sentiment_transform
    
    def text_stats(self, X):
        X['char_len'] = X['trans_text'].apply(len)
        X['word_len'] = X['tokens'].apply(len)
        X['char_word_ratio'] = X['char_len']/X['word_len']
            
        text_stats_singular = X[['char_len', 'word_len', 'char_word_ratio']].values
        
        # Standardize
#         text_stats_singular        

#         text_stats_repeated = np.array([([text_stats_singular[i] for x in range(MAX_SEQUENCE_LENGTH)]) for i in range(X.shape[0])])
        return text_stats_singular

    def repeater(self, X):
        return np.array([([X[i] for x in range(MAX_SEQUENCE_LENGTH)]) for i in range(X.shape[0])])
    
        
    
    # Fits the parameters on train
    def fit_transform(self, X):
        
        # {x}
        # Remove nulls from train
        X = X.dropna()
        
        X = self.pre_fit(X)
        
        # Create word tokens from sentences using NTLK
        self.vars['punct_tokenizer'] = WordPunctTokenizer()
        # put sth else inseatd of X foe expanded vocab?
        X = self.punct_tokenize(X, 'trans_text', 'tokens', self.vars['punct_tokenizer'])

        # Fit Index-Tokenizer the vocab using Keras
        self.vars['keras_index_tokenizer'] = Tokenizer()
        # Use all words for extended voacb
        self.vars['keras_index_tokenizer'].fit_on_texts(
            pd.concat(
            [
                df_objs.df_objects['train']['text'].dropna().str.strip().apply(str.lower),
                # To Do: Make an option statement on test in case not available
                # Increase Vocab
                df_objs.df_objects['test']['text'].dropna().str.strip().apply(str.lower)
            ]
        ).reset_index(drop=True).apply(WordPunctTokenizer().tokenize)
        )
        # len(keras_tokenizer.word_index)
        
        # load glove embeddings
        self.vars['embeddings_index'] = self.handlers.load_gloVe_embeddings() 
        self.vars['embeddings_matrix'] = self.handlers.load_embeddings_matrix(self.vars['embeddings_index'], self.vars['keras_index_tokenizer'])

        # {y}
        # Create label column - Word Tokenize selected_text, and create label indices
        Y,X = self.create_Y(X)
        Y_categorical = self.create_Y_Categorical_two_outs(Y)
        # Run final X transform after Y since Y transform filters out some X
        sequences_padded = self.create_sequences(X)
        
        # Fit one hot encoder for sentiment (Try standard scalar downstream as well?)
        self.vars['sentiment_one_hot'] = OneHotEncoder(handle_unknown='ignore', sparse=False)
        self.vars['sentiment_one_hot'].fit(X['sentiment'].values.reshape((X['sentiment'].shape[0],1)))
        # print(enc.get_feature_names())
        X_sentiment_one_hot = self.handle_sentiment_feature_eng(X)
        
        # Create text stats
        X_text_stats = self.text_stats(X)
        self.vars['X_text_stats_standard_scaler'] = StandardScaler()
        self.vars['X_text_stats_standard_scaler'].fit(X_text_stats)
        X_text_stats_scaled = self.vars['X_text_stats_standard_scaler'].transform(X_text_stats)
        X_agg_numerics_all = hstack([X_sentiment_one_hot, X_text_stats_scaled])
        
        X_numerics_repeated = self.repeater(X_agg_numerics_all)
        
        return (sequences_padded, X_numerics_repeated, Y, Y_categorical, X)


    # Transforms using the parameters on train
    def transform(self, X):
        
        X = self.pre_fit(X)
        
        # Create word tokens
        X = self.punct_tokenize(X, 'trans_text', 'tokens', self.vars['punct_tokenizer'])
        
        Y = None
        Y_categorical = None
        if 'selected_text' in X.columns:
            Y,X = self.create_Y(X)
            Y_categorical = self.create_Y_Categorical_two_outs(Y)
        sequences_padded = self.create_sequences(X)
        X_sentiment_one_hot = self.handle_sentiment_feature_eng(X)        
        # Create text stats
        X_text_stats = self.text_stats(X)
        X_text_stats_scaled = self.vars['X_text_stats_standard_scaler'].transform(X_text_stats)
        
        X_agg_numerics_all = hstack([X_sentiment_one_hot, X_text_stats_scaled])
        X_numerics_repeated = self.repeater(X_agg_numerics_all)
    
        
        return (sequences_padded, X_numerics_repeated, Y, Y_categorical, X)
        
        
    def punct_tokenize(self, X,old_col, new_col, punct_tokenizer):
        X[new_col] = X[old_col].apply(punct_tokenizer.tokenize)
        return X

In [7]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_objs.df_objects['train'], test_size=0.15, random_state=42)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
# X_train, X_test, Y_train, Y_test, idx1, idx2 = train_test_split(df_objs.df_objects['train'], Y, np.arange(Y.shape[0]), test_size=0.15, random_state=42)


In [8]:
data_mainpulation_pipeline = DataManipulationPipeline()
(train_manip_X, train_numerics_repeated, train_manip_Y, train_manip_Y_categorical, train_df) = data_mainpulation_pipeline.fit_transform(train)
(test_manip_X, test_numerics_repeated, test_manip_Y, test_manip_Y_categorical, test_df) = data_mainpulation_pipeline.transform(test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


GloVe data loaded


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [10]:
VOCAB_SIZE = len(data_mainpulation_pipeline.vars['keras_index_tokenizer'].word_index)+1
EMBEDDING_DIM = len(data_mainpulation_pipeline.vars['embeddings_index']['a'])

In [14]:
# Model
# ----------------------------------------------------
inputLayer_words = Input(shape=(MAX_SEQUENCE_LENGTH,))
inputLayer_agg_numerics = Input(shape=(
    MAX_SEQUENCE_LENGTH,
    train_numerics_repeated.shape[2]
))

# Embedding layer for the tap names
wordEmbeddings = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(inputLayer_words)
wordEmbeddings = Embedding(
    input_dim=VOCAB_SIZE, 
    output_dim=EMBEDDING_DIM, 
    input_length=MAX_SEQUENCE_LENGTH, 
    weights=[data_mainpulation_pipeline.vars['embeddings_matrix']],
    trainable=False)(inputLayer_words)
# 
merged_Input = concatenate([wordEmbeddings, inputLayer_agg_numerics])
# inputLayer_agg_indices
# # LSTM
lstm_1 = Bidirectional(LSTM(50, 
                            return_sequences = True,
                            dropout=0.2,
                            recurrent_dropout=0.2,
                           ))(merged_Input)
# b_norm_1 = BatchNormalization(5)(lstm_1)
# input_shape=(25,(EMBEDDING_DIM))
td = TimeDistributed(Dense(25))(lstm_1)


# drp1 = Dropout(0.2)(lstm_1)

# # Dense
# dense_0 = Dense(75, activation='relu')(drp1)
# drp2 = Dropout(0.2)(dense_0)
# dense_1 = Dense(30, activation='relu')(drp2)
outputLayer_start = Dense(1, activation='sigmoid')(td)
outputLayer_end = Dense(1, activation='sigmoid')(td)

model_2 = Model(inputs=[inputLayer_words, inputLayer_agg_numerics], outputs=[outputLayer_start, outputLayer_end])

model_2.compile(loss='binary_crossentropy', optimizer='adam')
# model.compile(loss='mse', optimizer='adam', metrics=[jaccard])
model_2.summary()
# ----------------------------------------------------

model_2.fit(
    [train_manip_X, train_numerics_repeated], 
    [train_manip_Y_categorical[0], train_manip_Y_categorical[1]], 
    validation_data=(
        [test_manip_X, test_numerics_repeated], 
        [test_manip_Y_categorical[0], test_manip_Y_categorical[1]]), 
    batch_size=100, 
    epochs=200,
    callbacks=[metrics_categorical]
)

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 40, 25)       728175      input_5[0][0]                    
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 40, 6)        0                                            
__________________________________________________________________________________________________
concatenate_3 (Concatenate)     (None, 40, 31)       0           embedding_6[0][0]                
                                                                 input_6[0][0]              

Val Jaccard Similarity: 0.6999335772331678
Epoch 25/200
Val Jaccard Similarity: 0.6985261175492206
Epoch 26/200
Val Jaccard Similarity: 0.7040279113986752
Epoch 27/200
Val Jaccard Similarity: 0.7022302134495331
Epoch 28/200
Val Jaccard Similarity: 0.7056393732950444
Epoch 29/200
Val Jaccard Similarity: 0.707658616135887
Epoch 30/200
Val Jaccard Similarity: 0.706937412732956
Epoch 31/200
Val Jaccard Similarity: 0.7073677204319032
Epoch 32/200
Val Jaccard Similarity: 0.7054536817665295
Epoch 33/200
Val Jaccard Similarity: 0.7017417986156712
Epoch 34/200
Val Jaccard Similarity: 0.7063670887950548
Epoch 35/200
Val Jaccard Similarity: 0.7062747622132618
Epoch 36/200
Val Jaccard Similarity: 0.7017553392328134
Epoch 37/200
Val Jaccard Similarity: 0.7062191606711566
Epoch 38/200
Val Jaccard Similarity: 0.7088347486749469
Epoch 39/200
Val Jaccard Similarity: 0.708928938290315
Epoch 40/200
Val Jaccard Similarity: 0.708291013838653
Epoch 41/200
Val Jaccard Similarity: 0.7094375911815918
Epoch 42/

Val Jaccard Similarity: 0.7137328283836303
Epoch 58/200
Val Jaccard Similarity: 0.7122871008207629
Epoch 59/200
Val Jaccard Similarity: 0.7147027245898943
Epoch 60/200
Val Jaccard Similarity: 0.7138314684981322
Epoch 61/200
Val Jaccard Similarity: 0.7182355495386693
Epoch 62/200
Val Jaccard Similarity: 0.7115314944612122
Epoch 63/200
Val Jaccard Similarity: 0.7160720723268715
Epoch 64/200
Val Jaccard Similarity: 0.7100748763970584
Epoch 65/200
Val Jaccard Similarity: 0.7145249034625285
Epoch 66/200
Val Jaccard Similarity: 0.7143422278791056
Epoch 67/200
Val Jaccard Similarity: 0.7128618486767591
Epoch 68/200
Val Jaccard Similarity: 0.7182555707387595
Epoch 69/200
Val Jaccard Similarity: 0.7114120133684352
Epoch 70/200
Val Jaccard Similarity: 0.7112487490095745
Epoch 71/200
Val Jaccard Similarity: 0.7161565542845509
Epoch 72/200
Val Jaccard Similarity: 0.7131063208950769
Epoch 73/200
Val Jaccard Similarity: 0.713196100617974
Epoch 74/200
Val Jaccard Similarity: 0.7130291995595655
Epoch 

Val Jaccard Similarity: 0.7153834683385509
Epoch 90/200
Val Jaccard Similarity: 0.7164511385496671
Epoch 91/200
Val Jaccard Similarity: 0.7161386179158155
Epoch 92/200
Val Jaccard Similarity: 0.7158923272246154
Epoch 93/200
Val Jaccard Similarity: 0.7160930515375298
Epoch 94/200
Val Jaccard Similarity: 0.7143250961010886
Epoch 95/200
Val Jaccard Similarity: 0.7145610719160169
Epoch 96/200
Val Jaccard Similarity: 0.713577389531533
Epoch 97/200
Val Jaccard Similarity: 0.7152241245601766
Epoch 98/200
Val Jaccard Similarity: 0.7189710462659988
Epoch 99/200
Val Jaccard Similarity: 0.7129870732248234
Epoch 100/200
Val Jaccard Similarity: 0.7147577395850742
Epoch 101/200
Val Jaccard Similarity: 0.7190000491531402
Epoch 102/200
Val Jaccard Similarity: 0.7168247747235433
Epoch 103/200
Val Jaccard Similarity: 0.7164463950930882
Epoch 104/200
Val Jaccard Similarity: 0.7159911862666182
Epoch 105/200
Val Jaccard Similarity: 0.7163903358029033
Epoch 106/200
Val Jaccard Similarity: 0.7154717371917556

Val Jaccard Similarity: 0.7186887668361355
Epoch 123/200
Val Jaccard Similarity: 0.7192805846165078
Epoch 124/200
Val Jaccard Similarity: 0.7201806663750604
Epoch 125/200
Val Jaccard Similarity: 0.7147511168563762
Epoch 126/200
Val Jaccard Similarity: 0.7150648729210697
Epoch 127/200
Val Jaccard Similarity: 0.7167839289839235
Epoch 128/200
Val Jaccard Similarity: 0.7149175198069211
Epoch 129/200
Val Jaccard Similarity: 0.7123354089684097
Epoch 130/200
Val Jaccard Similarity: 0.714800250133067
Epoch 131/200
Val Jaccard Similarity: 0.7177292808008535
Epoch 132/200
Val Jaccard Similarity: 0.7159224999128311
Epoch 133/200
Val Jaccard Similarity: 0.7186905371828256
Epoch 134/200
Val Jaccard Similarity: 0.71633970225451
Epoch 135/200
Val Jaccard Similarity: 0.7178524048140479
Epoch 136/200
Val Jaccard Similarity: 0.7151681072609453
Epoch 137/200
Val Jaccard Similarity: 0.7174990240771251
Epoch 138/200
Val Jaccard Similarity: 0.7167793041474525
Epoch 139/200
Val Jaccard Similarity: 0.71663766

KeyboardInterrupt: 

Epoch 98/100
21561/21561 [==============================] - 8s 357us/step - loss: 26.4123 - val_loss: 32.8301
Val Jaccard Similarity: 0.5266293328859964

Epoch 99/100
21561/21561 [==============================] - 8s 356us/step - loss: 26.0379 - val_loss: 32.5830
Val Jaccard Similarity: 0.5592650588400309

Epoch 100/100
21561/21561 [==============================] - 8s 352us/step - loss: 25.8462 - val_loss: 32.7917
Val Jaccard Similarity: 0.5504962474510444

In [None]:
model_2.fit(
    [train_manip_X, train_numerics_repeated], 
    [train_manip_Y_categorical[0], train_manip_Y_categorical[1]], 
    validation_data=(
        [test_manip_X, test_numerics_repeated], 
        [test_manip_Y_categorical[0], test_manip_Y_categorical[1]]), 
    batch_size=10000, 
    epochs=1000,
    callbacks=[metrics_categorical]
)

Train on 21560 samples, validate on 3804 samples
Epoch 1/1000
Val Jaccard Similarity: 0.7218930148985786
Epoch 2/1000
Val Jaccard Similarity: 0.7216997479714511
Epoch 3/1000
Val Jaccard Similarity: 0.7222091212281733
Epoch 4/1000
Val Jaccard Similarity: 0.7215352694190896
Epoch 5/1000
Val Jaccard Similarity: 0.7202440199837691
Epoch 6/1000
Val Jaccard Similarity: 0.7189238558366413
Epoch 7/1000
Val Jaccard Similarity: 0.7197147116811191
Epoch 8/1000
Val Jaccard Similarity: 0.7205790734836478
Epoch 9/1000
Val Jaccard Similarity: 0.7198999451487073
Epoch 10/1000
Val Jaccard Similarity: 0.7202045468351949
Epoch 11/1000
Val Jaccard Similarity: 0.7200264286837191
Epoch 12/1000
Val Jaccard Similarity: 0.719228378729558
Epoch 13/1000
Val Jaccard Similarity: 0.7191460648863222
Epoch 14/1000
Val Jaccard Similarity: 0.7194397831076721
Epoch 15/1000
Val Jaccard Similarity: 0.7205620071042355
Epoch 16/1000
Val Jaccard Similarity: 0.7207346198453054
Epoch 17/1000
Val Jaccard Similarity: 0.722021577

Val Jaccard Similarity: 0.7204436273367022
Epoch 34/1000
Val Jaccard Similarity: 0.7204316707178481
Epoch 35/1000
Val Jaccard Similarity: 0.7196071438683778
Epoch 36/1000
Val Jaccard Similarity: 0.7190405458589565
Epoch 37/1000
Val Jaccard Similarity: 0.7199897992013156
Epoch 38/1000
Val Jaccard Similarity: 0.7201336625695838
Epoch 39/1000
Val Jaccard Similarity: 0.7205217450182007
Epoch 40/1000
Val Jaccard Similarity: 0.7210647943671408
Epoch 41/1000
Val Jaccard Similarity: 0.720315015985189
Epoch 42/1000
Val Jaccard Similarity: 0.7200125628364538
Epoch 43/1000
Val Jaccard Similarity: 0.7194684079326565
Epoch 44/1000
Val Jaccard Similarity: 0.72024087514577
Epoch 45/1000
Val Jaccard Similarity: 0.7198373911781909
Epoch 46/1000
Val Jaccard Similarity: 0.7197221726922172
Epoch 47/1000
Val Jaccard Similarity: 0.7204898684673211
Epoch 48/1000
Val Jaccard Similarity: 0.7193018259077993
Epoch 49/1000
Val Jaccard Similarity: 0.7190857663511361
Epoch 50/1000
Val Jaccard Similarity: 0.71989613

Val Jaccard Similarity: 0.7191655118886265
Epoch 66/1000
Val Jaccard Similarity: 0.7190854199426114
Epoch 67/1000
Val Jaccard Similarity: 0.7184766584505693
Epoch 68/1000
Val Jaccard Similarity: 0.7188523561376191
Epoch 69/1000
Val Jaccard Similarity: 0.719889373169572
Epoch 70/1000
Val Jaccard Similarity: 0.7203971089113882
Epoch 71/1000
Val Jaccard Similarity: 0.7192904568746009
Epoch 72/1000
Val Jaccard Similarity: 0.7187149328711753
Epoch 73/1000
Val Jaccard Similarity: 0.7179127706829721
Epoch 74/1000
Val Jaccard Similarity: 0.7186254323770065
Epoch 75/1000
Val Jaccard Similarity: 0.718482086106315
Epoch 76/1000
Val Jaccard Similarity: 0.7190222961466567
Epoch 77/1000
Val Jaccard Similarity: 0.7181809270710577
Epoch 78/1000
Val Jaccard Similarity: 0.7193144152562193
Epoch 79/1000
Val Jaccard Similarity: 0.7193870084508888
Epoch 80/1000
Val Jaccard Similarity: 0.7188872555569676
Epoch 81/1000
Val Jaccard Similarity: 0.7201667886542836
Epoch 82/1000
Val Jaccard Similarity: 0.7192692

Val Jaccard Similarity: 0.7184431227844158
Epoch 99/1000
Val Jaccard Similarity: 0.7180052261602964
Epoch 100/1000
Val Jaccard Similarity: 0.7190393207368677
Epoch 101/1000
Val Jaccard Similarity: 0.7198568581079412
Epoch 102/1000
Val Jaccard Similarity: 0.7201817144944265
Epoch 103/1000
Val Jaccard Similarity: 0.719985739771638
Epoch 104/1000
Val Jaccard Similarity: 0.7200454373471084
Epoch 105/1000
Val Jaccard Similarity: 0.719654869833454
Epoch 106/1000
Val Jaccard Similarity: 0.718975438934967
Epoch 107/1000
Val Jaccard Similarity: 0.7196136605650237
Epoch 108/1000
Val Jaccard Similarity: 0.719244110934241
Epoch 109/1000
Val Jaccard Similarity: 0.7195377479635896
Epoch 110/1000
Val Jaccard Similarity: 0.7197234653933459
Epoch 111/1000
Val Jaccard Similarity: 0.7190088400402762
Epoch 112/1000
Val Jaccard Similarity: 0.7192422466616956
Epoch 113/1000
Val Jaccard Similarity: 0.7172541645862823
Epoch 114/1000
Val Jaccard Similarity: 0.7172129795952709
Epoch 115/1000
Val Jaccard Similar

Val Jaccard Similarity: 0.7210539788952826
Epoch 131/1000
Val Jaccard Similarity: 0.7206163759041526
Epoch 132/1000
Val Jaccard Similarity: 0.7188011727101089
Epoch 133/1000
Val Jaccard Similarity: 0.7181742200519107
Epoch 134/1000
Val Jaccard Similarity: 0.718831617469365
Epoch 135/1000
Val Jaccard Similarity: 0.7182054401326026
Epoch 136/1000
Val Jaccard Similarity: 0.7191411012143343
Epoch 137/1000
Val Jaccard Similarity: 0.7199343400964843
Epoch 138/1000
Val Jaccard Similarity: 0.7198985986546377
Epoch 139/1000
Val Jaccard Similarity: 0.7208292155469846
Epoch 140/1000
Val Jaccard Similarity: 0.7216592640518208
Epoch 141/1000
Val Jaccard Similarity: 0.7207073690749342
Epoch 142/1000
Val Jaccard Similarity: 0.7203137463483787
Epoch 143/1000
Val Jaccard Similarity: 0.7203855294682324
Epoch 144/1000
Val Jaccard Similarity: 0.7206380410580512
Epoch 145/1000
Val Jaccard Similarity: 0.7204987507601647
Epoch 146/1000
Val Jaccard Similarity: 0.7196801797113305
Epoch 147/1000
Val Jaccard Sim

Val Jaccard Similarity: 0.7217389886107783
Epoch 163/1000
Val Jaccard Similarity: 0.7197736219091733
Epoch 164/1000
Val Jaccard Similarity: 0.7189055033547549
Epoch 165/1000
Val Jaccard Similarity: 0.720073337201936
Epoch 166/1000
Val Jaccard Similarity: 0.7192297705177779
Epoch 167/1000
Val Jaccard Similarity: 0.7192614583042876
Epoch 168/1000
Val Jaccard Similarity: 0.7185886926571527
Epoch 169/1000
Val Jaccard Similarity: 0.7197920430769527
Epoch 170/1000
Val Jaccard Similarity: 0.7208538941728951
Epoch 171/1000
Val Jaccard Similarity: 0.7205873866431571
Epoch 172/1000
Val Jaccard Similarity: 0.7204095114527831
Epoch 173/1000
Val Jaccard Similarity: 0.7186616438349216
Epoch 174/1000
Val Jaccard Similarity: 0.7176736220921115
Epoch 175/1000
Val Jaccard Similarity: 0.7174752048450769
Epoch 176/1000
Val Jaccard Similarity: 0.717490237204352
Epoch 177/1000
Val Jaccard Similarity: 0.7187268002718539
Epoch 178/1000
Val Jaccard Similarity: 0.7198659368726951
Epoch 179/1000
Val Jaccard Simi

Val Jaccard Similarity: 0.7201178653631746
Epoch 195/1000
Val Jaccard Similarity: 0.7203799177098987
Epoch 196/1000
Val Jaccard Similarity: 0.7203106257010421
Epoch 197/1000
Val Jaccard Similarity: 0.7210543073796513
Epoch 198/1000
Val Jaccard Similarity: 0.7201000336307144
Epoch 199/1000
Val Jaccard Similarity: 0.7187758636716587
Epoch 200/1000
Val Jaccard Similarity: 0.7193771629955629
Epoch 201/1000
Val Jaccard Similarity: 0.7201816392872973
Epoch 202/1000
Val Jaccard Similarity: 0.7196794732282848
Epoch 203/1000
Val Jaccard Similarity: 0.7192336627511361
Epoch 204/1000
Val Jaccard Similarity: 0.7191029202707548
Epoch 205/1000
Val Jaccard Similarity: 0.7193480992231447
Epoch 206/1000
Val Jaccard Similarity: 0.7185820962894099
Epoch 207/1000
Val Jaccard Similarity: 0.7190405910317956
Epoch 208/1000
Val Jaccard Similarity: 0.718112518947842
Epoch 209/1000
Val Jaccard Similarity: 0.717749283962118
Epoch 210/1000
Val Jaccard Similarity: 0.719248365198768
Epoch 211/1000
Val Jaccard Simil

Val Jaccard Similarity: 0.7205995415159073
Epoch 227/1000
Val Jaccard Similarity: 0.7191039037465432
Epoch 228/1000
Val Jaccard Similarity: 0.7191595643487545
Epoch 229/1000
Val Jaccard Similarity: 0.7210052445142761
Epoch 230/1000
Val Jaccard Similarity: 0.721925551232692
Epoch 231/1000
Val Jaccard Similarity: 0.7202595166013577
Epoch 232/1000
Val Jaccard Similarity: 0.718931614434236
Epoch 233/1000
Val Jaccard Similarity: 0.7186340150751042
Epoch 234/1000
Val Jaccard Similarity: 0.7194792856973267
Epoch 235/1000
Val Jaccard Similarity: 0.7199929604960339
Epoch 236/1000
Val Jaccard Similarity: 0.7184087355772997
Epoch 237/1000
Val Jaccard Similarity: 0.7195931862237778
Epoch 238/1000
Val Jaccard Similarity: 0.719515541990582
Epoch 239/1000
Val Jaccard Similarity: 0.7205078788216239
Epoch 240/1000
Val Jaccard Similarity: 0.7181944143204466
Epoch 241/1000
Val Jaccard Similarity: 0.7187227545001602
Epoch 242/1000
Val Jaccard Similarity: 0.7203971522937564
Epoch 243/1000
Val Jaccard Simil

Val Jaccard Similarity: 0.7191067425697113
Epoch 259/1000
Val Jaccard Similarity: 0.7207119456048495
Epoch 260/1000
Val Jaccard Similarity: 0.7214693392048491
Epoch 261/1000
Val Jaccard Similarity: 0.7205863937327434
Epoch 262/1000
Val Jaccard Similarity: 0.722121689588768
Epoch 263/1000
Val Jaccard Similarity: 0.7196378604108485
Epoch 264/1000
Val Jaccard Similarity: 0.7177363518025738
Epoch 265/1000
Val Jaccard Similarity: 0.7181631405352885
Epoch 266/1000
Val Jaccard Similarity: 0.7197924046064784
Epoch 267/1000
Val Jaccard Similarity: 0.719522301501959
Epoch 268/1000
Val Jaccard Similarity: 0.7194036223072986
Epoch 269/1000
Val Jaccard Similarity: 0.7183442159726486
Epoch 270/1000
Val Jaccard Similarity: 0.7181219796014425
Epoch 271/1000
Val Jaccard Similarity: 0.7190654887805693
Epoch 272/1000
Val Jaccard Similarity: 0.7173332602495883
Epoch 273/1000
Val Jaccard Similarity: 0.718011431028875
Epoch 274/1000
Val Jaccard Similarity: 0.7189602617032758
Epoch 275/1000
Val Jaccard Simil

Val Jaccard Similarity: 0.7210718182423491
Epoch 291/1000
Val Jaccard Similarity: 0.7211089816236075
Epoch 292/1000
Val Jaccard Similarity: 0.7212195681103768
Epoch 293/1000
Val Jaccard Similarity: 0.7212380458792489
Epoch 294/1000
Val Jaccard Similarity: 0.7216016177523223
Epoch 295/1000
Val Jaccard Similarity: 0.721280960126854
Epoch 296/1000
Val Jaccard Similarity: 0.7221261111634429
Epoch 297/1000
Val Jaccard Similarity: 0.7212370126719243
Epoch 298/1000
Val Jaccard Similarity: 0.7203776003673227
Epoch 299/1000
Val Jaccard Similarity: 0.7198148059875928
Epoch 300/1000
Val Jaccard Similarity: 0.7196232752644747
Epoch 301/1000
Val Jaccard Similarity: 0.7197980234982403
Epoch 302/1000
Val Jaccard Similarity: 0.7187248337168792
Epoch 303/1000
Val Jaccard Similarity: 0.7190945085848621
Epoch 304/1000
Val Jaccard Similarity: 0.7194121320646044
Epoch 305/1000
Val Jaccard Similarity: 0.7208313768919818
Epoch 306/1000
Val Jaccard Similarity: 0.718916007148324
Epoch 307/1000
Val Jaccard Simi

Val Jaccard Similarity: 0.7214891750276365
Epoch 323/1000
Val Jaccard Similarity: 0.7208804241324835
Epoch 324/1000
Val Jaccard Similarity: 0.7204523605918638
Epoch 325/1000
Val Jaccard Similarity: 0.7190712719462082
Epoch 326/1000
Val Jaccard Similarity: 0.7171746263570016
Epoch 327/1000
Val Jaccard Similarity: 0.7175959599013636
Epoch 328/1000
Val Jaccard Similarity: 0.7182525640807383
Epoch 329/1000
Val Jaccard Similarity: 0.7201090798682337
Epoch 330/1000
Val Jaccard Similarity: 0.7208341789397531
Epoch 331/1000
Val Jaccard Similarity: 0.7202924717488935
Epoch 332/1000
Val Jaccard Similarity: 0.7194415377811577
Epoch 333/1000
Val Jaccard Similarity: 0.7208179635956748
Epoch 334/1000
Val Jaccard Similarity: 0.7209506516801487
Epoch 335/1000
Val Jaccard Similarity: 0.7205721470234242
Epoch 336/1000
Val Jaccard Similarity: 0.7199477046075969
Epoch 337/1000
Val Jaccard Similarity: 0.7197965545242184
Epoch 338/1000
Val Jaccard Similarity: 0.7204926497989085
Epoch 339/1000
Val Jaccard Si

Val Jaccard Similarity: 0.7189545891672415
Epoch 355/1000
Val Jaccard Similarity: 0.7200597786485111
Epoch 356/1000
Val Jaccard Similarity: 0.718647199841614
Epoch 357/1000
Val Jaccard Similarity: 0.7180979917005117
Epoch 358/1000
Val Jaccard Similarity: 0.717684560421544
Epoch 359/1000
Val Jaccard Similarity: 0.7166559504187381
Epoch 360/1000
Val Jaccard Similarity: 0.7175059548136549
Epoch 361/1000
Val Jaccard Similarity: 0.7179012035505552
Epoch 362/1000
Val Jaccard Similarity: 0.7178329883331241
Epoch 363/1000
Val Jaccard Similarity: 0.718413868192283
Epoch 364/1000
Val Jaccard Similarity: 0.7193749102765723
Epoch 365/1000
Val Jaccard Similarity: 0.7198002104042808
Epoch 366/1000
Val Jaccard Similarity: 0.7186450569296322
Epoch 367/1000
Val Jaccard Similarity: 0.7186805648369889
Epoch 368/1000
Val Jaccard Similarity: 0.7196759395853368
Epoch 369/1000
Val Jaccard Similarity: 0.7178910141500717
Epoch 370/1000
Val Jaccard Similarity: 0.7187765377080593
Epoch 371/1000
Val Jaccard Simil

Val Jaccard Similarity: 0.7192928682702053
Epoch 387/1000
Val Jaccard Similarity: 0.7207309896036643
Epoch 388/1000
Val Jaccard Similarity: 0.7195605897615287
Epoch 389/1000
Val Jaccard Similarity: 0.718665607149647
Epoch 390/1000
Val Jaccard Similarity: 0.7182816643709565
Epoch 391/1000
Val Jaccard Similarity: 0.7181881631705465
Epoch 392/1000
Val Jaccard Similarity: 0.718254993650603
Epoch 393/1000
Val Jaccard Similarity: 0.7188140250101582
Epoch 394/1000
Val Jaccard Similarity: 0.7184180838075537
Epoch 395/1000
Val Jaccard Similarity: 0.7184264333307838
Epoch 396/1000
Val Jaccard Similarity: 0.7189746429883455
Epoch 397/1000
Val Jaccard Similarity: 0.7185475570422287
Epoch 398/1000
Val Jaccard Similarity: 0.7182211513530469
Epoch 399/1000
Val Jaccard Similarity: 0.7176813302923731
Epoch 400/1000
Val Jaccard Similarity: 0.7182140763417699
Epoch 401/1000
Val Jaccard Similarity: 0.719319827193531
Epoch 402/1000
Val Jaccard Similarity: 0.7222001331054824
Epoch 403/1000
Val Jaccard Simil

Val Jaccard Similarity: 0.7199684421355531
Epoch 419/1000
Val Jaccard Similarity: 0.7198070658270792
Epoch 420/1000
Val Jaccard Similarity: 0.7187744070221613
Epoch 421/1000
Val Jaccard Similarity: 0.7192646513308929
Epoch 422/1000
Val Jaccard Similarity: 0.7204228363810342
Epoch 423/1000
Val Jaccard Similarity: 0.720193436382001
Epoch 424/1000
Val Jaccard Similarity: 0.7187502493398599
Epoch 425/1000
Val Jaccard Similarity: 0.7189297494254188
Epoch 426/1000
Val Jaccard Similarity: 0.7205871154558132
Epoch 427/1000
Val Jaccard Similarity: 0.7201238028339003
Epoch 428/1000
Val Jaccard Similarity: 0.7200103082247891
Epoch 429/1000
Val Jaccard Similarity: 0.7204683666325818
Epoch 430/1000
Val Jaccard Similarity: 0.7197950242929866
Epoch 431/1000
Val Jaccard Similarity: 0.7191339995191858
Epoch 432/1000
Val Jaccard Similarity: 0.7201696809958759
Epoch 433/1000
Val Jaccard Similarity: 0.7210079232096356
Epoch 434/1000
Val Jaccard Similarity: 0.7204639911257611
Epoch 435/1000
Val Jaccard Sim

Val Jaccard Similarity: 0.7181731606998976
Epoch 451/1000
Val Jaccard Similarity: 0.7191058400810875
Epoch 452/1000
Val Jaccard Similarity: 0.720172348890109
Epoch 453/1000
Val Jaccard Similarity: 0.7192230312776207
Epoch 454/1000
Val Jaccard Similarity: 0.7182583572351896
Epoch 455/1000
Val Jaccard Similarity: 0.7188527778909823
Epoch 456/1000
Val Jaccard Similarity: 0.7189097804470367
Epoch 457/1000
Val Jaccard Similarity: 0.7190789319434677
Epoch 458/1000
Val Jaccard Similarity: 0.7191530730322163
Epoch 459/1000
Val Jaccard Similarity: 0.7189965111668615
Epoch 460/1000
Val Jaccard Similarity: 0.7172703772204048
Epoch 461/1000
Val Jaccard Similarity: 0.7183554705768138
Epoch 462/1000
Val Jaccard Similarity: 0.7185665465795563
Epoch 463/1000
Val Jaccard Similarity: 0.7180905044293252
Epoch 464/1000
Val Jaccard Similarity: 0.7179282467292346
Epoch 465/1000
Val Jaccard Similarity: 0.7188631011020054
Epoch 466/1000
Val Jaccard Similarity: 0.7182204110832111
Epoch 467/1000
Val Jaccard Sim

Val Jaccard Similarity: 0.7190898485827038
Epoch 483/1000
Val Jaccard Similarity: 0.7201767810180641
Epoch 484/1000
Val Jaccard Similarity: 0.7203188502367347
Epoch 485/1000
Val Jaccard Similarity: 0.7196724962300731
Epoch 486/1000
Val Jaccard Similarity: 0.7199312931975693
Epoch 487/1000
Val Jaccard Similarity: 0.7205359218569057
Epoch 488/1000
Val Jaccard Similarity: 0.7197547392440985
Epoch 489/1000
Val Jaccard Similarity: 0.721296218522266
Epoch 490/1000
Val Jaccard Similarity: 0.7206748150362309
Epoch 491/1000
Val Jaccard Similarity: 0.7213516250667682
Epoch 492/1000
Val Jaccard Similarity: 0.7201640331665817
Epoch 493/1000
Val Jaccard Similarity: 0.7194958831298346
Epoch 494/1000
Val Jaccard Similarity: 0.7204088748306396
Epoch 495/1000
Val Jaccard Similarity: 0.720421663958005
Epoch 496/1000
Val Jaccard Similarity: 0.7200807801086593
Epoch 497/1000
Val Jaccard Similarity: 0.7208126890410081
Epoch 498/1000
Val Jaccard Similarity: 0.7182617470788726
Epoch 499/1000
Val Jaccard Simi

Val Jaccard Similarity: 0.7160967325386823
Epoch 515/1000
Val Jaccard Similarity: 0.7170263075541713
Epoch 516/1000
Val Jaccard Similarity: 0.7184489244330416
Epoch 517/1000
Val Jaccard Similarity: 0.7185919136730341
Epoch 518/1000
Val Jaccard Similarity: 0.7171460394409167
Epoch 519/1000
Val Jaccard Similarity: 0.7177625559048727
Epoch 520/1000
Val Jaccard Similarity: 0.7172258101538749
Epoch 521/1000
Val Jaccard Similarity: 0.7196994610507987
Epoch 522/1000
Val Jaccard Similarity: 0.720528995023077
Epoch 523/1000
Val Jaccard Similarity: 0.7193974640435673
Epoch 524/1000
Val Jaccard Similarity: 0.7196516530404496
Epoch 525/1000
Val Jaccard Similarity: 0.719475112513378
Epoch 526/1000
Val Jaccard Similarity: 0.7185443116035788
Epoch 527/1000
Val Jaccard Similarity: 0.7181173208546767
Epoch 528/1000
Val Jaccard Similarity: 0.718548484504802
Epoch 529/1000
Val Jaccard Similarity: 0.7186135648460422
Epoch 530/1000
Val Jaccard Similarity: 0.7169085671920601
Epoch 531/1000
Val Jaccard Simil

Val Jaccard Similarity: 0.7201500162667661
Epoch 547/1000
Val Jaccard Similarity: 0.7201724589280843
Epoch 548/1000
Val Jaccard Similarity: 0.7188303349018377
Epoch 549/1000
Val Jaccard Similarity: 0.7191477915056437
Epoch 550/1000
Val Jaccard Similarity: 0.7199561053190374
Epoch 551/1000
Val Jaccard Similarity: 0.7202873828779621
Epoch 552/1000
Val Jaccard Similarity: 0.7175074520808291
Epoch 553/1000
Val Jaccard Similarity: 0.7173081013431621
Epoch 554/1000
Val Jaccard Similarity: 0.7175345979896812
Epoch 555/1000
Val Jaccard Similarity: 0.7181361870818175
Epoch 556/1000
Val Jaccard Similarity: 0.7193642334638345
Epoch 557/1000
Val Jaccard Similarity: 0.7199516693322364
Epoch 558/1000
Val Jaccard Similarity: 0.7203533320304748
Epoch 559/1000
Val Jaccard Similarity: 0.7191364062933457
Epoch 560/1000
Val Jaccard Similarity: 0.7204985145436554
Epoch 561/1000
Val Jaccard Similarity: 0.720160000188288
Epoch 562/1000
Val Jaccard Similarity: 0.718643640096329
Epoch 563/1000
Val Jaccard Simi

### Predictions on local test 

In [None]:
# preds = model_2.predict([test_manip_X, test_manip_X_sentiment_one_hot_repeated])
preds = model_2.predict(
    [
        test_manip_X, test_numerics_repeated
    ]
)
temp_out = pd.concat([
    handlers.get_preds_out(handlers.get_indexes_from_argmax(preds, test_df), test_df), 
    test_df['selected_text']],
    axis=1
)
temp_out.columns = ['predicted_text', 'selected_text']
# temp_out['predicted_text']
temp_out.apply(lambda x: handlers.jaccard(x['predicted_text'], x['selected_text']), axis=1).mean()

- Basic Config with Dropout No Stage: 0.7176681846688269
- Add 200 epochs at 1000 batch: 0.7215144500264917

In [None]:
# Two Stage Result
neuts = test_df[test_df['sentiment']=='neutral'].reset_index(drop=True)[['text', 'selected_text']]
neuts.columns = ['predicted_text', 'selected_text']

pd.concat(
    [
        temp_out[test_df['sentiment']!='neutral'].reset_index(drop=True),
        neuts
    ]
).reset_index(drop=True).apply(lambda x: handlers.jaccard(x['predicted_text'], x['selected_text']), axis=1).mean()

- Basic Confif with Dropout Static First Stage: 0.7174145968259453
- Add 200 epochs at 1000 batch: 0.7213928829363386






- Check if embedding matrix is correctly created

In [None]:
# print('Overall Jaccard from Model:')
# print(test_df_1[['text', 'selected_text', 'out_pred_span']].apply(lambda x: handlers.jaccard(x['selected_text'], x['out_pred_span']), axis=1).mean())

# print('Overall Jaccard baseline from predicting the complete text:')
# print(test_df_1[['text', 'selected_text', 'out_pred_span']].apply(lambda x: handlers.jaccard(x['selected_text'], x['text']), axis=1).mean())

# print('Overall Jaccard using model for NOT neutral and baseline for neutral:')
# print(pd.concat([
#     test_df_1[test_df_1['sentiment'] == 'neutral'].apply(lambda x: handlers.jaccard(x['selected_text'], x['text']), axis=1),
#     test_df_1[test_df_1['sentiment'] != 'neutral'].apply(lambda x: handlers.jaccard(x['selected_text'], x['out_pred_span']), axis=1)
# ]).mean())


# print('Model Jaccard for neutral')
# print(test_df_1[test_df_1['sentiment'] == 'neutral'].apply(lambda x: handlers.jaccard(x['selected_text'], x['out_pred_span']), axis=1).mean())

# print('Baseline Jaccard for neutral from predicting complete text')
# print(test_df_1[test_df_1['sentiment'] == 'neutral'].apply(lambda x: handlers.jaccard(x['selected_text'], x['text']), axis=1).mean())

# print('Model Jaccard for NOT neutral')
# print(test_df_1[test_df_1['sentiment'] != 'neutral'].apply(lambda x: handlers.jaccard(x['selected_text'], x['out_pred_span']), axis=1).mean())

# print('Baseline Jaccard for NOT neutral from predicting complete text')
# print(test_df_1[test_df_1['sentiment'] != 'neutral'].apply(lambda x: handlers.jaccard(x['selected_text'], x['text']), axis=1).mean())



### Predictions on Test-Test Data

In [16]:
# (valid_manip_X, valid_numerics_repeated, valid_manip_Y, valid_manip_Y_categorical, valid_df) = data_mainpulation_pipeline.transform(df_objs.df_objects['test'])

In [17]:
# preds = model_2.predict(
#     [
#         valid_manip_X, valid_numerics_repeated
#     ]
# )
# df_objs.df_objects['test']['out_pred_span'] = handlers.get_preds_out(handlers.get_indexes_from_argmax(preds, df_objs.df_objects['test']), df_objs.df_objects['test'])
# final_out = df_objs.df_objects['test'][['textID','out_pred_span']]
# final_out.columns = ['textID', 'selected_text']

In [41]:
# final_out.head(5)

In [42]:
# final_out.to_csv('submission.csv', index=False)