In [None]:
import fasttext
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Load the tab-separated SICK file into a DataFrame.
df = pd.read_csv("SICK.txt", sep="\t")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# One-hot encode the `entailment_label` column.
# NOTE(review): `y` is not referenced again in the visible part of the notebook.
y = pd.get_dummies(df['entailment_label'])

In [None]:
import re

def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Matching is case-insensitive so that capitalised forms such as
    "Won't" are expanded like their lowercase counterparts; the replacement
    text itself is always lowercase.

    Parameters
    ----------
    phrase : any
        Text (or any object convertible with ``str``) to expand.

    Returns
    -------
    str
        The text with contractions replaced by their expanded forms.
    """
    # Order matters: the two irregular forms must run before the generic
    # "n't" rule, and "n't" must run before the bare "'t" rule.
    rules = (
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    text = str(phrase)
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    return text

In [None]:
def striphtml(data):
    """Replace every ``<...>`` tag in ``data`` with a single space.

    ``data`` is converted with ``str()`` before matching; the match is
    non-greedy, so each tag is replaced separately.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub(' ', str(data))

In [None]:
def stripunc(data):
    """Collapse every run of non-letter characters in ``data`` into one space.

    Only ASCII letters (A-Z, a-z) are kept; digits, punctuation and
    whitespace runs each become a single space.
    """
    non_letters = re.compile('[^A-Za-z]+', re.MULTILINE | re.DOTALL)
    text = str(data)
    return non_letters.sub(' ', text)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

# English stop-word list held as a set, and the English Snowball stemmer;
# both are used by `compute` below.
stop_words = set(stopwords.words('english')) 
stemmer = SnowballStemmer("english") 

from tqdm import tqdm

In [None]:
def compute(sent):
    """Clean one sentence and return it in stemmed and unstemmed form.

    The sentence is run through `decontracted`, `striphtml` and `stripunc`,
    lowercased and tokenized. Stop words and single-character tokens are
    dropped.

    Returns
    -------
    tuple of (str, str)
        ``(stemmed, plain)``: the surviving tokens joined by single spaces,
        first with each token stemmed, then unchanged.
    """
    cleaned = stripunc(striphtml(decontracted(sent)))
    tokens = word_tokenize(str(cleaned.lower()))

    # Drop stop words and single-letter tokens once, then reuse the result.
    kept = [tok for tok in tokens if tok not in stop_words and len(tok) != 1]

    stemmed_text = ' '.join(str(stemmer.stem(tok)) for tok in kept)
    plain_text = ' '.join(str(tok) for tok in kept)
    return stemmed_text, plain_text

In [None]:
# Clean both sentences of every row, keeping stemmed and unstemmed variants
# plus the two stemmed sentences joined into one string.
clean_stemmed_q1 = []
clean_stemmed_q2 = []
clean_q1 = []
clean_q2 = []
combined_stemmed_text = []
# iterrows() is a generator with no length, so pass `total` explicitly;
# otherwise tqdm cannot show a percentage or an ETA.
for _, row in tqdm(df.iterrows(), total=len(df)):
    csq1, cq1 = compute(row['sentence_A'])
    csq2, cq2 = compute(row['sentence_B'])
    clean_stemmed_q1.append(csq1)
    clean_q1.append(cq1)
    clean_stemmed_q2.append(csq2)
    clean_q2.append(cq2)
    combined_stemmed_text.append(csq1+" "+csq2)

In [None]:
# Sanity check: print the length of each of the five lists built above.
for cleaned_list in (clean_stemmed_q1, clean_stemmed_q2, clean_q1, clean_q2, combined_stemmed_text):
    print(len(cleaned_list))

In [None]:
# Preview the frame before the cleaned columns are attached.
df.head()

In [None]:
# Attach the cleaned text lists to the frame as new columns.
for column_name, column_values in (
    ('clean_stemmed_q1', clean_stemmed_q1),
    ('clean_stemmed_q2', clean_stemmed_q2),
    ('clean_q1', clean_q1),
    ('clean_q2', clean_q2),
    ('combined_stemmed_text', combined_stemmed_text),
):
    df[column_name] = column_values

In [None]:
# Preview the frame with the cleaned columns attached.
df.head()

In [None]:
# (rows, columns) of the frame after adding the cleaned columns.
df.shape

In [None]:
# Load the fastText model from the local .bin file; it is indexed by word
# further down to fill the embedding matrix.
fastext_model = fasttext.load_model("crawl-300d-2M-subword.bin")

In [None]:
# `data` is a second name for the same DataFrame object, not a copy:
# changes made through either name are visible through both.
data = df

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 35% of the rows as the test set. Features are the two unstemmed
# cleaned sentences; the target is the relatedness score.
X_temp, X_test, y_temp, y_test = train_test_split(
    data[['clean_q1', 'clean_q2']],
    data['relatedness_score'],
    test_size=0.35,
    random_state=21,
    shuffle=True,
)

In [None]:
# Split the remaining rows again: 20% of them become the validation set,
# using the same random seed as the first split.
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.20, random_state=21)

In [None]:
# Print the shape of every split, features then target, in train/val/test order.
for split_part in (X_train, y_train, X_val, y_val, X_test, y_test):
    print(split_part.shape)

In [None]:
# Preview the training features.
X_train.head()

In [None]:
# Build one string per row containing both cleaned sentences; the tokenizer
# is fitted on this column.
# Vectorised concatenation replaces the row-wise apply, whose `x[0]` / `x[1]`
# positional lookups on a label-indexed Series are deprecated in pandas 2.x.
# `.assign` returns a new frame, so this does not write into a slice produced
# by train_test_split (avoids SettingWithCopyWarning), and re-running the
# cell gives the same result.
X_train = X_train.assign(
    text=X_train['clean_q1'].astype(str) + " " + X_train['clean_q2'].astype(str)
)

In [None]:
import tensorflow as tf

In [None]:
import keras
import keras.backend as K

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Concatenate, Conv2D, Flatten, Dense, Embedding, LSTM
from keras.models import Model

In [None]:
# Fit the tokenizer vocabulary on the training rows only, using the `text`
# column that holds both cleaned sentences of each pair.
t = Tokenizer()
t.fit_on_texts(X_train['text'].values)

In [None]:
# Force both sentence columns of the training frame to str.
for sentence_col in ('clean_q1', 'clean_q2'):
    X_train[sentence_col] = X_train[sentence_col].astype(str)

In [None]:
# Force both sentence columns of the validation and test frames to str.
for split_frame in (X_val, X_test):
    for sentence_col in ('clean_q1', 'clean_q2'):
        split_frame[sentence_col] = split_frame[sentence_col].astype(str)

In [None]:
#X_train

In [None]:
# Encode the first training sentence of each pair as a list of word indices.
train_q1_seq = t.texts_to_sequences(X_train['clean_q1'].values)

In [None]:
# Encode the remaining sentence columns with the same tokenizer, which was
# fitted on the training text only.
train_q2_seq = t.texts_to_sequences(X_train['clean_q2'].values)
val_q1_seq = t.texts_to_sequences(X_val['clean_q1'].values)
val_q2_seq = t.texts_to_sequences(X_val['clean_q2'].values)
test_q1_seq = t.texts_to_sequences(X_test['clean_q1'].values)
test_q2_seq = t.texts_to_sequences(X_test['clean_q2'].values)

In [None]:
#train_q1_seq

In [None]:
# Token count of every encoded first training sentence.
len_vec = [len(sent_vec) for sent_vec in train_q1_seq]

In [None]:
# Longest encoded first training sentence, in tokens.
np.max(len_vec)

In [None]:
# Token count of every encoded second training sentence
# (overwrites the `len_vec` computed for the first sentences).
len_vec = [len(sent_vec) for sent_vec in train_q2_seq]

In [None]:
# Longest encoded second training sentence, in tokens.
np.max(len_vec)

In [None]:
# Fixed length every sequence is padded or truncated to.
# NOTE(review): this is a constant and does not use the maximum lengths
# computed in the cells above — confirm 300 is intended, since every extra
# time step is processed by each LSTM layer.
max_len = 300

In [None]:
# Left-pad (or truncate) every encoded sentence to `max_len` tokens.
# The Keras Tokenizer numbers words from 1 and reserves index 0, so padding
# must use 0: the previous value of 1 filled each sequence with the index of
# a real vocabulary word instead of a neutral padding token.
PAD_INDEX = 0
train_q1_seq = pad_sequences(train_q1_seq, maxlen=max_len, padding='pre', value=PAD_INDEX)
train_q2_seq = pad_sequences(train_q2_seq, maxlen=max_len, padding='pre', value=PAD_INDEX)
val_q1_seq = pad_sequences(val_q1_seq, maxlen=max_len, padding='pre', value=PAD_INDEX)
val_q2_seq = pad_sequences(val_q2_seq, maxlen=max_len, padding='pre', value=PAD_INDEX)
test_q1_seq = pad_sequences(test_q1_seq, maxlen=max_len, padding='pre', value=PAD_INDEX)
test_q2_seq = pad_sequences(test_q2_seq, maxlen=max_len, padding='pre', value=PAD_INDEX)

In [None]:
# Sanity check: every padded array should have rows of length `max_len`.
# The last line previously printed test_q1_seq a second time, so
# test_q2_seq was never checked.
print(len(train_q1_seq[0]))
print(len(train_q2_seq[0]))
print(len(val_q1_seq[0]))
print(len(val_q2_seq[0]))
print(len(test_q1_seq[0]))
print(len(test_q2_seq[0]))

In [None]:
# Display the padded first-sentence training array.
train_q1_seq

In [None]:
# Build the embedding weight matrix: row i holds the fastText vector of the
# word whose tokenizer index is i.
not_present_list = []  # words for which the model returned no vector
vocab_size = len(t.word_index) + 1  # +1 so the largest word index is a valid row
# Take the vector width from the model rather than hard-coding 300, so the
# matrix stays consistent if a different fastText model is loaded.
embedding_dim = len(fastext_model['no'])
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in t.word_index.items():
    embedding_vector = fastext_model[word]

    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        # The row is already all zeros from np.zeros above; just record the word.
        not_present_list.append(word)

In [None]:
# Display the embedding matrix.
embedding_matrix

In [None]:
# (vocab_size, embedding width) of the matrix built above.
embedding_matrix.shape

In [None]:
# Number of distinct words the tokenizer learned (one less than vocab_size).
len(t.word_index) 

In [None]:
from keras.regularizers import l2
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import Conv2D, ZeroPadding2D, Activation, Input, concatenate
from keras.models import Model

from keras.layers.normalization import BatchNormalization
from keras.layers.pooling import MaxPooling2D
from keras.layers.merge import Concatenate
from keras.layers.core import Lambda, Flatten, Dense
from keras.initializers import glorot_uniform
from keras.layers import Input, Dense, Flatten, GlobalMaxPool2D, GlobalAvgPool2D, Concatenate, Multiply, Dropout, Subtract, Add, Conv2D

In [None]:
def exponent_neg_manhattan_distance(vects):
    ''' Similarity of two tensors as exp(-L1 distance).

    `vects` is unpacked into exactly two tensors. The absolute differences
    are summed along axis 1 (dimension kept), so identical inputs give 1.
    '''
    left, right = vects
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

def exponent_neg_cosine_distance(x, hidden_size=50):
    ''' exp of the cosine similarity between the two halves of `x`.

    The first `hidden_size` columns of `x` are treated as the left vector and
    the remaining columns as the right vector. Each is L2-normalised, then the
    element-wise product is summed along axis 1 (dimension kept).

    NOTE(review): despite the name, no negation is applied before the
    exponential — confirm whether exp(+similarity) is intended.
    '''
    left_unit = K.l2_normalize(x[:, :hidden_size], axis=-1)
    right_unit = K.l2_normalize(x[:, hidden_size:], axis=-1)
    elementwise = K.prod([left_unit, right_unit], axis=0)
    cosine = K.sum(elementwise, axis=1, keepdims=True)
    return K.exp(cosine)

In [None]:
# from keras.utils import to_categorical
# y_train = pd.get_dummies(y_train)
# y_val = pd.get_dummies(y_val)
# y_test = pd.get_dummies(y_test)

In [None]:
# Divide the training and validation targets by 5.
# NOTE(review): this rebinds the same names, so running the cell twice
# divides twice; `y_test` is not scaled here.
y_train, y_val = y_train / 5.0, y_val / 5.0

In [None]:
def build_base_network(input_shape):
    ''' Build the sentence encoder applied to each side of the pair.

    Architecture: an Embedding layer initialised from `embedding_matrix`
    (trainable), six stacked LSTM layers (64-128-256-256-128-64 units,
    sigmoid activation, only the last one returning a single vector), then
    Dense(128, sigmoid) and Dense(64, relu).

    Parameters
    ----------
    input_shape : int
        Length of the padded input sequences. It is passed to the Embedding
        layer as `input_length`; previously the argument was ignored and the
        global `train_q1_seq.shape[1]` was read instead.

    Returns
    -------
    keras.models.Sequential
        The uncompiled encoder model.
    '''
    model = Sequential()
    model.add(Embedding(name="synopsis_embedd", input_dim=len(t.word_index)+1,
                        output_dim=len(fastext_model['no']), weights=[embedding_matrix],
                        input_length=input_shape, trainable=True))
    # All LSTM layers except the last return the full sequence so the next
    # LSTM receives one vector per time step.
    for units in (64, 128, 256, 256, 128):
        model.add(LSTM(units, return_sequences=True, activation="sigmoid"))
    model.add(LSTM(64, return_sequences=False, activation="sigmoid"))
    model.add(Dense(128, activation='sigmoid'))
    model.add(Dense(64, activation='relu'))
    return model


In [None]:
# Both model inputs take one padded sequence of `input_dim` word indices,
# where `input_dim` is the width of the padded training array.
input_dim = train_q1_seq.shape[1]
sent_a = Input(shape=(input_dim,))
sent_b = Input(shape=(input_dim,))
print('input_dim',input_dim)

In [None]:
# A single encoder instance is applied to both inputs, so the two branches
# share the same weights.
base_network = build_base_network(input_dim)
feat_vecs_a = base_network(sent_a)
feat_vecs_b = base_network(sent_b)

In [None]:
# Show the shape of the encoded second sentence. `get_shape` is a method;
# without the call the cell only displays the bound method object.
feat_vecs_b.get_shape()

In [None]:
# Model output: exp(-L1 distance) between the two encoded sentences, one
# value per pair.
distance = Lambda(exponent_neg_manhattan_distance, output_shape=(1,))([feat_vecs_a, feat_vecs_b])

In [None]:
# Concatenate the two encoded sentences along the last axis.
# NOTE(review): in the visible notebook this only feeds `output_layer`,
# which the compiled model does not use.
concats = concatenate([feat_vecs_a, feat_vecs_b], axis=-1)

In [None]:
# Alternative head on the concatenated features. Softmax over a single unit
# always outputs exactly 1.0, so sigmoid is used to get a usable score in
# (0, 1).
# NOTE(review): the Model built further down uses `distance` as its output,
# not this layer.
output_layer = Dense(1, activation='sigmoid')(concats)

In [None]:
from keras import optimizers
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau, TensorBoard
# Training configuration.
epochs = 20
# NOTE: despite its name, `rms` is an Adam optimizer (RMSprop is only
# mentioned in the trailing comment). This is the optimizer passed to
# model.compile below.
rms = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)#RMSprop()
# NOTE(review): `adadelta` and `adagrad` are not referenced elsewhere in the
# visible notebook.
adadelta = optimizers.Adadelta(lr=0.001, rho=0.95, epsilon=1e-07)
adagrad = optimizers.Adagrad(lr=0.0001, epsilon=1e-02)

# Stop when val_loss has not improved for 5 epochs and restore the best weights.
earlyStopping = EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=5,
                              verbose=1,
                              restore_best_weights=True)
# NOTE: the list holds only the early-stopping callback; no
# ReduceLROnPlateau is added despite the variable name.
callback_early_stop_reduceLROnPlateau=[earlyStopping]

In [None]:
def eucl_dist_output_shape(shapes):
    '''Return ``(batch, 1)`` given a pair of input shapes.

    `shapes` must unpack into exactly two shapes; only the first element of
    the first shape is used.
    '''
    first_shape, _ = shapes
    batch_dim = first_shape[0]
    return (batch_dim, 1)

def contrastive_loss(y_true, y_pred):
    '''Contrastive loss with a margin of 1.

    Mean over all elements of
    ``y_true * y_pred**2 + (1 - y_true) * max(margin - y_pred, 0)**2``.
    '''
    margin = 1
    true_term = y_true * K.square(y_pred)
    false_term = (1 - y_true) * K.square(K.maximum(margin - y_pred, 0))
    return K.mean(true_term + false_term)

def compute_accuracy(predictions, labels):
    '''Mean of the labels whose flattened prediction is below 0.5.
    '''
    below_threshold = predictions.ravel() < 0.5
    selected_labels = labels[below_threshold]
    return selected_labels.mean()

def accuracy(y_true, y_pred):
    '''Fraction of elements where ``y_true`` equals ``(y_pred < 0.5)``.

    The thresholded prediction is cast to the dtype of `y_true` before the
    comparison.
    '''
    thresholded = K.cast(y_pred < 0.5, y_true.dtype)
    matches = K.equal(y_true, thresholded)
    return K.mean(matches)

In [None]:
# Siamese model: two sentence inputs, one similarity output in (0, 1].
# `inputs=` / `outputs=` are the Keras 2 keyword names; the singular
# `input=` / `output=` forms are legacy Keras 1 spellings.
model = Model(inputs=[sent_a, sent_b], outputs=distance)
# Train on mean absolute error and report mean squared error as a metric.
model.compile(loss='mean_absolute_error', optimizer=rms, metrics=['mse'])
model.summary()

In [None]:
# Train with early stopping on val_loss. The epoch count comes from the
# `epochs` variable defined with the optimizers instead of a second literal 20,
# so there is a single place to change it.
history = model.fit([train_q1_seq,train_q2_seq], y_train, validation_data=([val_q1_seq, val_q2_seq], y_val),
                        epochs=epochs, batch_size=32, verbose=1, shuffle = True, callbacks=callback_early_stop_reduceLROnPlateau)

In [None]:
# The model was compiled with metrics=['mse']; depending on the Keras
# release the history entry is named 'mean_squared_error' or 'mse'.
mse_key = 'mean_squared_error' if 'mean_squared_error' in history.history else 'mse'

# Mean squared error per epoch. This is an error metric, not an accuracy,
# so it is labelled as such (lower is better).
plt.figure(1)
plt.plot(history.history[mse_key])
plt.plot(history.history['val_' + mse_key])
plt.title('Model MSE')
plt.ylabel('mean squared error')
plt.xlabel('epoch')
plt.legend(['Training', 'Validation'], loc='upper left')
plt.show()

# Loss (mean absolute error) per epoch
plt.figure(2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['Training', 'Validation'], loc='upper left')
plt.show()

In [None]:
# Loss and metric values on the training set (scaled targets).
scores = model.evaluate([train_q1_seq,train_q2_seq], y_train, verbose=0)

In [None]:
# Display the training-set evaluation result.
scores

In [None]:
# Evaluate on the held-out test split. This cell previously re-evaluated the
# validation arrays under the name `scores_test`, so the test split was never
# scored. The test targets are divided by 5 inline to match the scaling
# applied to the training and validation targets, without rebinding `y_test`.
scores_test = model.evaluate([test_q1_seq, test_q2_seq], y_test / 5.0, verbose=1, batch_size=32)

In [None]:
# Display the evaluation result stored in `scores_test`.
scores_test