In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/glove-twitter/glove.twitter.27B.100d.txt
/kaggle/input/glove-twitter/glove.twitter.27B.200d.txt
/kaggle/input/glove-twitter/glove.twitter.27B.25d.txt
/kaggle/input/glove-twitter/glove.twitter.27B.50d.txt
/kaggle/input/tweet-sentiment-extraction/train.csv
/kaggle/input/tweet-sentiment-extraction/test.csv
/kaggle/input/tweet-sentiment-extraction/sample_submission.csv


In [2]:
# Load the competetion datasets

class KaggleReader(object):
    def __init__(self):
        self.df_objects = {}        
    def read_kaggle_df(self, df_name, df_path):
        self.df_objects[df_name] = pd.read_csv(df_path)
        print()
        print(df_name + ' loaded')
        print("Shape => " + str(self.df_objects[df_name].shape))
    
df_objs = KaggleReader()
df_objs.read_kaggle_df('train', "../input/tweet-sentiment-extraction/train.csv")
df_objs.read_kaggle_df('test', "../input/tweet-sentiment-extraction/test.csv")
df_objs.read_kaggle_df('submission', "../input/tweet-sentiment-extraction/sample_submission.csv")


train loaded
Shape => (27481, 4)

test loaded
Shape => (3534, 3)

submission loaded
Shape => (3534, 2)


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from numpy import hstack, vstack
# Tokenize text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence
from keras.utils import to_categorical
from nltk.tokenize import WordPunctTokenizer

# Model
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, LSTM, Embedding, Dropout, Input, Bidirectional
from keras.initializers import Constant
import keras
from keras.layers.merge import concatenate

from sklearn.preprocessing import OneHotEncoder, StandardScaler

Using TensorFlow backend.


In [None]:
MAX_SEQUENCE_LENGTH = 50

In [5]:
class Handlers(object):
    def __init__(self):
        self.run_tests()
        pass
    
    def draw_pr_curve_plt(self, Y_valid, y_pred, x_range=1.0):
    #     (precision, recall, x_range=1.0):

            precision, recall, thresholds_pr = precision_recall_curve(Y_valid, y_pred)

            # import dependencies
            import matplotlib.pyplot as plt

            plt.step(recall, precision, color='b', alpha=0.2,
                     where='post')
            plt.fill_between(recall, precision, alpha=0.2, color='b')
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.ylim([0.0, 1.05])
            plt.xlim([0.0, x_range])
            plt.show()
     
    def draw_roc_curve_plt(self, Y_valid, y_pred):
#     (fpr, tpr, auc):
        # import dependencies
        import matplotlib.pyplot as plt
        fpr, tpr, thresholds_roc = roc_curve(Y_valid, y_pred)
        auc_roc = auc(fpr, tpr)
        
        plt.figure(1)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.plot(fpr, tpr, label='(area = {:.3f})'.format(auc_roc))
        # plt.plot(fpr_rf, tpr_rf, label='RF (area = {:.3f})'.format(auc_rf))
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.title('ROC curve')
        plt.legend(loc='best')
        plt.show()

    # Optimize using KMP
    def return_start_end_indices(self, big, small):
        if len(small) <= 0 or len(big) <= 0:
            return (-1,-1)
        i = 0
        j = 0
        started = False
        while i < len(big):
            if big[i] == small[j]:
                if started == False:
                    started = True
                    i_start = i

                if j == len(small) - 1:
                    return (i-len(small)+1, i)
                j += 1

            else:
                if started == True:
                    started = False
                    i = i_start + 1
                j = 0
            i += 1
        return (-1,-1)
    
    def run_tests(self):
        
        assert self.return_start_end_indices(['i', '`', 'd', 'have', 'responded', ',', 'if', 'i', 'were', 'going'], ['have', 'responded', ',']) == (3,5)
        assert self.return_start_end_indices('abc', '') == (-1,-1)
        assert self.return_start_end_indices('abc', 'a') == (0,0)
        assert self.return_start_end_indices('abc', 'b') == (1,1)
        assert self.return_start_end_indices('abc', 'c') == (2,2)
        assert self.return_start_end_indices('abc', 'ab') == (0,1)
        assert self.return_start_end_indices('abc', 'bc') == (1,2)
        assert self.return_start_end_indices('abc', 'ac') == (-1,-1)
        assert self.return_start_end_indices('abc', 'abc') == (0,2)
        assert self.return_start_end_indices('abcabcabc', 'abc') == (0,2)
        assert self.return_start_end_indices('ababcc', 'abc') == (2,4)
        
    
    def jaccard(self, str1, str2): 
        a = set(str1.lower().split()) 
        b = set(str2.lower().split())
        c = a.intersection(b)
        return float(len(c)) / (len(a) + len(b) - len(c))
    
    def load_gloVe_embeddings(self):
        # Boilerplate taken from here - https://www.kaggle.com/stacykurnikova/using-glove-embedding
        embeddings_index = {}
        f = open('/kaggle/input/glove-twitter/glove.twitter.27B.25d.txt')
        for line in f:
            values = line.split(' ')
            word = values[0] ## The first entry is the word
            coefs = np.asarray(values[1:], dtype='float32') ## These are the vecotrs representing the embedding for the word
            embeddings_index[word] = coefs
        f.close()
        print('GloVe data loaded')
        return embeddings_index
    
    def load_embeddings_matrix(self, embeddings_index, index_tokenizer):
        # https://www.kaggle.com/stacykurnikova/using-glove-embedding
        # Create an embedding matrix with embedding vectors for the tokens recognized in the vocab of tweets
        
        EMBEDDING_DIM = embeddings_index.get('a').shape[0]
        # num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
        num_words = len(index_tokenizer.word_index) + 1

        # To Do: constrain the vocab size
        embedding_matrix = np.random.uniform(-1,+1,(num_words, EMBEDDING_DIM))
        count_in_embedding_vocab = 0
        for word, i in index_tokenizer.word_index.items():
            embedding_vector = embeddings_index.get(word) ## This references the loaded embeddings dictionary
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                embedding_matrix[i] = embedding_vector
                count_in_embedding_vocab += 1
#         TO DO:
#             else:
#                 PASS EMBEDDINGS OF SYNONYMS (!!! == !)
        return embedding_matrix
        # print(count_in_embedding_vocab)
        # print(len(embedding_matrix))
    
    def get_token_indices(self, x):
        span_generator = WordPunctTokenizer().span_tokenize(x)
        spans = [span for span in span_generator]
        return spans
    
    # TO DO: compensate for end and start on top of padding
    def get_pred_text_span(self, x):
        start_token_index = int(x['pred_start'])
        end_token_index = int(x['pred_end'])
        token_indices_list = x['tokens_indices']
        text = x['text']

        # start token ind > end token ind (could change logic to also have either 0:end)
        if start_token_index > end_token_index:
            return text

        # start token ind and end token ind within bounds
        elif start_token_index < len(token_indices_list) and end_token_index < len(token_indices_list):
            return text[token_indices_list[start_token_index][0]: token_indices_list[end_token_index][1]]


        # start token ind after bounds (could change logic to also have either 0:end or sth similar)
        elif start_token_index >= len(token_indices_list):
            return text

        # only end token ind out of bounds
        elif start_token_index < len(token_indices_list) and end_token_index >= len(token_indices_list):
            return text[token_indices_list[start_token_index][0]: len(text)-1]
        
    def batch_jaccard(self, preds, test_df):
        temp_df = pd.concat([test_df, pd.DataFrame(preds, columns=['pred_start', 'pred_end'])], axis=1)
        temp_df['tokens_indices'] = temp_df['trans_text'].apply(handlers.get_token_indices)
        # print((temp_df['tokens_indices'].apply(len) == temp_df['tokens'].apply(len)).value_counts())

        # Get the predictions
        temp_df['out_pred_span'] = temp_df.apply(handlers.get_pred_text_span, axis = 1)
        return temp_df.apply(lambda x: handlers.jaccard(x['selected_text'], x['out_pred_span']), axis=1).mean()

handlers = Handlers()


# To add Jaccard Similarity Value on Validation set after each epoch
class Metrics(keras.callbacks.Callback):
    # https://stackoverflow.com/questions/37657260/how-to-implement-custom-metric-in-keras
    def on_train_begin(self, logs={}):
        self._data = []

    def on_epoch_end(self, batch, logs={}):
        X_val, X_sentiment, y_val = self.validation_data[0], self.validation_data[1], self.validation_data[2]
        y_predict = np.asarray(self.model.predict([X_val, X_sentiment]))
        print('Val Jaccard Similarity: {}'.format(handlers.batch_jaccard(y_predict, test_df))) 
        return

    def get_data(self):
        return self._data
    
metrics = Metrics()

In [46]:


class DataManipulationPipeline(object):
    def __init__(self):
        # Load dependencies
        try:
            self.handlers = handlers
        except NameError:
            self.handlers = Handlers()
            
        # init vars
        self.vars = {
            'MAX_SEQUENCE_LENGTH' : MAX_SEQUENCE_LENGTH
        }

    def pre_fit(self, X):
        
        # Strip the text
        X['text'] = X['text'].str.strip()
        # Lower case the text
        X['trans_text'] = X['text'].apply(str.lower)
        return X

    def create_sequences(self, X):
        # Tokenize the word tokens to word_indexes
        sequences = self.vars['keras_index_tokenizer'].texts_to_sequences(X['tokens'])
        # Pad the sequences to be fed to NN [Note that this will effectively change the start, end index if padded on post]
        sequences_padded = pad_sequences(sequences, maxlen=self.vars['MAX_SEQUENCE_LENGTH'], padding='post', truncating='post')        

        return sequences_padded
    
    def create_Y(self, X):
        X['trans_selected_text'] = X['selected_text'].apply(str.lower)
        X = self.punct_tokenize(X, 'trans_selected_text', 'tokens_selected_text', self.vars['punct_tokenizer'])
        X['start_end_indices'] = X.apply(lambda x: handlers.return_start_end_indices(x['tokens'], x['tokens_selected_text']), axis=1)
        # truncate the start end indices to end of MAX LEN of sequence
        X['start_end_indices'] = X['start_end_indices'].apply(lambda x: (x[0], (MAX_SEQUENCE_LENGTH-1) if x[1]>= MAX_SEQUENCE_LENGTH else x[1]))
        
        X = X[(X['start_end_indices'] != (-1,-1))]
        X = X.reset_index(drop=True)
        X['start_ind'] = X['start_end_indices'].apply(lambda x: x[0])
        X['end_ind'] = X['start_end_indices'].apply(lambda x: x[1])
        Y = hstack(
            (
                X['start_ind'].values.reshape(X.shape[0],1),
                X['end_ind'].values.reshape(X.shape[0],1)
            )
        )
        return Y,X
    
    def handle_sentiment_feature_eng(self, X):
        sentiment_transform = self.vars['sentiment_one_hot'].transform(X['sentiment'].values.reshape((X['sentiment'].shape[0],1)))
        # Copy over each data row for MAX_SEQUENCE_LENGTH times to send it inside each LSTM sequence
#         sentiment_transform_repeated = np.array([([sentiment_transform[i] for x in range(MAX_SEQUENCE_LENGTH)]) for i in range(X.shape[0])])
        
        # To Do: Scale?
        return sentiment_transform
    
    def text_stats(self, X):
        X['char_len'] = X['trans_text'].apply(len)
        X['word_len'] = X['tokens'].apply(len)
        X['char_word_ratio'] = X['char_len']/X['word_len']
            
        text_stats_singular = X[['char_len', 'word_len', 'char_word_ratio']].values
        
        # Standardize
#         text_stats_singular        

#         text_stats_repeated = np.array([([text_stats_singular[i] for x in range(MAX_SEQUENCE_LENGTH)]) for i in range(X.shape[0])])
        return text_stats_singular

    def repeater(self, X):
        return np.array([([X[i] for x in range(MAX_SEQUENCE_LENGTH)]) for i in range(X.shape[0])])
    
        
    
    # Fits the parameters on train
    def fit_transform(self, X):
        
        # {x}
        # Remove nulls from train
        X = X.dropna()
        
        X = self.pre_fit(X)
        
        # Create word tokens from sentences using NTLK
        self.vars['punct_tokenizer'] = WordPunctTokenizer()
        # put sth else inseatd of X foe expanded vocab?
        X = self.punct_tokenize(X, 'trans_text', 'tokens', self.vars['punct_tokenizer'])

        # Fit Index-Tokenizer the vocab using Keras
        self.vars['keras_index_tokenizer'] = Tokenizer()
        # Use all words for extended voacb
        self.vars['keras_index_tokenizer'].fit_on_texts(
            pd.concat(
            [
                df_objs.df_objects['train']['text'].dropna().str.strip().apply(str.lower),
                df_objs.df_objects['test']['text'].dropna().str.strip().apply(str.lower)
            ]
        ).reset_index(drop=True).apply(WordPunctTokenizer().tokenize)
        )
        # len(keras_tokenizer.word_index)
        
        # load glove embeddings
        self.vars['embeddings_index'] = self.handlers.load_gloVe_embeddings() 
        self.vars['embeddings_matrix'] = self.handlers.load_embeddings_matrix(self.vars['embeddings_index'], self.vars['keras_index_tokenizer'])

        # {y}
        # Create label column - Word Tokenize selected_text, and create label indices
        Y,X = self.create_Y(X)
        # Run final X transform after Y since Y transform filters out some X
        sequences_padded = self.create_sequences(X)
        
        # Fit one hot encoder for sentiment (Try standard scalar downstream as well?)
        self.vars['sentiment_one_hot'] = OneHotEncoder(handle_unknown='ignore', sparse=False)
        self.vars['sentiment_one_hot'].fit(X['sentiment'].values.reshape((X['sentiment'].shape[0],1)))
        # print(enc.get_feature_names())
        X_sentiment_one_hot = self.handle_sentiment_feature_eng(X)
        
        # Create text stats
        X_text_stats = self.text_stats(X)
        self.vars['X_text_stats_standard_scaler'] = StandardScaler()
        self.vars['X_text_stats_standard_scaler'].fit(X_text_stats)
        X_text_stats_scaled = self.vars['X_text_stats_standard_scaler'].transform(X_text_stats)
        X_agg_numerics_all = hstack([X_sentiment_one_hot, X_text_stats_scaled])
        
        X_numerics_repeated = self.repeater(X_agg_numerics_all)
        
        return (sequences_padded, X_numerics_repeated, Y, X)


    # Transforms using the parameters on train
    def transform(self, X):
        
        X = self.pre_fit(X)
        
        # Create word tokens
        X = self.punct_tokenize(X, 'trans_text', 'tokens', self.vars['punct_tokenizer'])
        
        Y = None
        if 'selected_text' in X.columns:
            Y,X = self.create_Y(X)
        sequences_padded = self.create_sequences(X)
        X_sentiment_one_hot = self.handle_sentiment_feature_eng(X)        
        # Create text stats
        X_text_stats = self.text_stats(X)
        X_text_stats_scaled = self.vars['X_text_stats_standard_scaler'].transform(X_text_stats)
        
        X_agg_numerics_all = hstack([X_sentiment_one_hot, X_text_stats_scaled])
        X_numerics_repeated = self.repeater(X_agg_numerics_all)
    
        
        return (sequences_padded, X_numerics_repeated, Y, X)
        
        
    def punct_tokenize(self, X,old_col, new_col, punct_tokenizer):
        X[new_col] = X[old_col].apply(punct_tokenizer.tokenize)
        return X
        
        



# Get the optimum number of word length for sequence model - 40
# plt.hist(df_objs.df_objects['train']['tokens'].apply(len), bins=100)
# plt.show()
    
        
        

In [47]:
# tempy = df_objs.df_objects['train'][df_objs.df_objects['train']['sentiment'] != 'neutral'].reset_index(drop=True)
# tempy.shape
# df_objs.df_objects['train']

In [48]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_objs.df_objects['train'], test_size=0.15, random_state=42)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
# X_train, X_test, Y_train, Y_test, idx1, idx2 = train_test_split(df_objs.df_objects['train'], Y, np.arange(Y.shape[0]), test_size=0.15, random_state=42)


In [49]:
data_mainpulation_pipeline = DataManipulationPipeline()
(train_manip_X, train_numerics_repeated, train_manip_Y, train_df) = data_mainpulation_pipeline.fit_transform(train)
(test_manip_X, test_numerics_repeated, test_manip_Y, test_df) = data_mainpulation_pipeline.transform(test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


GloVe data loaded


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [56]:
# print(train_numerics_repeated.shape)
# print(test_numerics_repeated.shape)
# # print(train_manip_X_sentiment_one_hot_repeated.shape)
# # train_stats_repeated
# # train_manip_X_sentiment_one_hot_repeated
# # hstack([train_stats_repeated, train_manip_X_sentiment_one_hot_repeated]).shape

# # train_numerics_repeated[120]
# import matplotlib.pyplot as plt
# plt.hist(train_df.tokens.apply(len), bins=100)
# plt.show()

# pd.concat([train_df['end_ind'], tempp['end_ind']])[(train_df['end_ind'] != tempp['end_ind'])]
# train_manip_X[19132]
# tempp = train_df.copy()


19421    49
19421    60
Name: end_ind, dtype: int64

In [None]:
VOCAB_SIZE = len(data_mainpulation_pipeline.vars['keras_index_tokenizer'].word_index)+1
EMBEDDING_DIM = len(data_mainpulation_pipeline.vars['embeddings_index']['a'])

In [14]:
# Model
# ----------------------------------------------------
inputLayer_words = Input(shape=(MAX_SEQUENCE_LENGTH,))
inputLayer_agg_numerics = Input(shape=(
    MAX_SEQUENCE_LENGTH,
    train_numerics_repeated.shape[2]
))

# Embedding layer for the tap names
wordEmbeddings = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(inputLayer_words)
wordEmbeddings = Embedding(
    input_dim=VOCAB_SIZE, 
    output_dim=EMBEDDING_DIM, 
    input_length=MAX_SEQUENCE_LENGTH, 
    weights=[data_mainpulation_pipeline.vars['embeddings_matrix']],
    trainable=False)(inputLayer_words)
# 
merged_Input = concatenate([wordEmbeddings, inputLayer_agg_numerics])
# inputLayer_agg_indices
# # LSTM
lstm_1 = Bidirectional(LSTM(50, return_sequences = False))(merged_Input)
# input_shape=(25,(EMBEDDING_DIM))


drp1 = Dropout(0.2)(lstm_1)

# # Dense
dense_0 = Dense(75, activation='relu')(drp1)
drp2 = Dropout(0.2)(dense_0)
dense_1 = Dense(30, activation='relu')(drp2)
outputLayer = Dense(2, activation='relu')(dense_1)

model_2 = Model(inputs=[inputLayer_words, inputLayer_agg_numerics], outputs=outputLayer)

model_2.compile(loss='mse', optimizer='adam')
# model.compile(loss='mse', optimizer='adam', metrics=[jaccard])
model_2.summary()
# ----------------------------------------------------

model_2.fit([train_manip_X, train_numerics_repeated], train_manip_Y, validation_data=([test_manip_X, test_numerics_repeated], test_manip_Y), batch_size=100, epochs=200, callbacks=[metrics])

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 50, 25)       728175      input_3[0][0]                    
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 50, 6)        0                                            
__________________________________________________________________________________________________
concatenate_2 (Concatenate)     (None, 50, 31)       0           embedding_4[0][0]                
                                                                 input_4[0][0]              

KeyboardInterrupt: 

Epoch 98/100
21561/21561 [==============================] - 8s 357us/step - loss: 26.4123 - val_loss: 32.8301
Val Jaccard Similarity: 0.5266293328859964

Epoch 99/100
21561/21561 [==============================] - 8s 356us/step - loss: 26.0379 - val_loss: 32.5830
Val Jaccard Similarity: 0.5592650588400309

Epoch 100/100
21561/21561 [==============================] - 8s 352us/step - loss: 25.8462 - val_loss: 32.7917
Val Jaccard Similarity: 0.5504962474510444

In [13]:
model_2.fit([train_manip_X, train_manip_X_sentiment_one_hot_repeated], train_manip_Y, validation_data=([test_manip_X, test_manip_X_sentiment_one_hot_repeated], test_manip_Y), batch_size=500, epochs=100, callbacks=[metrics])

NameError: name 'train_manip_X_sentiment_one_hot_repeated' is not defined

In [None]:
# model_2.fit([train_manip_X, train_manip_X_sentiment_one_hot_repeated], train_manip_Y, validation_data=([test_manip_X, test_manip_X_sentiment_one_hot_repeated], test_manip_Y), batch_size=5000, epochs=200, callbacks=[metrics])

In [None]:
# model_2.fit([train_manip_X, train_manip_X_sentiment_one_hot_repeated], train_manip_Y, validation_data=([test_manip_X, test_manip_X_sentiment_one_hot_repeated], test_manip_Y), batch_size=2000, epochs=200, callbacks=[metrics])

In [None]:
preds = model_2.predict([test_manip_X, test_manip_X_sentiment_one_hot_repeated])

# test_X_df = df_objs.df_objects['train'].iloc[idx2].reset_index(drop=True)
test_df_1 = pd.concat([test_df, pd.DataFrame(preds, columns=['pred_start', 'pred_end'])], axis=1)
test_df_1['tokens_indices'] = test_df_1['trans_text'].apply(handlers.get_token_indices)
print((test_df_1['tokens_indices'].apply(len) == test_df_1['tokens'].apply(len)).value_counts())

# Get the predictions
test_df_1['out_pred_span'] = test_df_1.apply(handlers.get_pred_text_span, axis = 1)

- Check if embedding matrix is correctly created

In [None]:
print('Overall Jaccard from Model:')
print(test_df_1[['text', 'selected_text', 'out_pred_span']].apply(lambda x: handlers.jaccard(x['selected_text'], x['out_pred_span']), axis=1).mean())

print('Overall Jaccard baseline from predicting the complete text:')
print(test_df_1[['text', 'selected_text', 'out_pred_span']].apply(lambda x: handlers.jaccard(x['selected_text'], x['text']), axis=1).mean())

print('Overall Jaccard using model for NOT neutral and baseline for neutral:')
print(pd.concat([
    test_df_1[test_df_1['sentiment'] == 'neutral'].apply(lambda x: handlers.jaccard(x['selected_text'], x['text']), axis=1),
    test_df_1[test_df_1['sentiment'] != 'neutral'].apply(lambda x: handlers.jaccard(x['selected_text'], x['out_pred_span']), axis=1)
]).mean())


print('Model Jaccard for neutral')
print(test_df_1[test_df_1['sentiment'] == 'neutral'].apply(lambda x: handlers.jaccard(x['selected_text'], x['out_pred_span']), axis=1).mean())

print('Baseline Jaccard for neutral from predicting complete text')
print(test_df_1[test_df_1['sentiment'] == 'neutral'].apply(lambda x: handlers.jaccard(x['selected_text'], x['text']), axis=1).mean())

print('Model Jaccard for NOT neutral')
print(test_df_1[test_df_1['sentiment'] != 'neutral'].apply(lambda x: handlers.jaccard(x['selected_text'], x['out_pred_span']), axis=1).mean())

print('Baseline Jaccard for NOT neutral from predicting complete text')
print(test_df_1[test_df_1['sentiment'] != 'neutral'].apply(lambda x: handlers.jaccard(x['selected_text'], x['text']), axis=1).mean())



In [None]:
# test_df_1.apply(lambda x: handlers.jaccard(x['selected_text'], x['out_pred_span']), axis=1).mean()

In [None]:
# test_df_1[['text', 'selected_text', 'out_pred_span']]

In [None]:
# test_df_1['selected_text'].values