# Testing various models with word embeddings

In [None]:
import os
import time
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
# Read the competition CSVs and report how many rows/columns each one has.
TRAIN_CSV = "../input/train.csv"
TEST_CSV = "../input/test.csv"

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

for shape_label, frame in (("Train shape : ", train_df), ("Test shape : ", test_df)):
    print(shape_label, frame.shape)

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
from gensim.models import KeyedVectors

# Load the GoogleNews word2vec vectors; the file is in the binary word2vec format.
news_path = '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(
    news_path,
    binary=True,
)

In [None]:
# Analogy-style query: closest to 'King' and 'woman', away from 'man' (top hit only).
embeddings_index.most_similar(
    negative=['man'],
    positive=['King', 'woman'],
    topn=1,
)

In [None]:
# Analogy-style query: closest to 'Australia' and 'pizza', away from 'Italy' (top hit only).
embeddings_index.most_similar(
    negative=['Italy'],
    positive=['Australia', 'pizza'],
    topn=1,
)

In [None]:
# Analogy-style query: closest to 'biked' and 'today', away from 'yesterday' (top hit only).
embeddings_index.most_similar(
    negative=['yesterday'],
    positive=['biked', 'today'],
    topn=1,
)

In [None]:
# Analogy-style query: closest to 'Africa' and 'pizza', away from 'Italy' (top hit only).
embeddings_index.most_similar(
    negative=['Italy'],
    positive=['Africa', 'pizza'],
    topn=1,
)

In [None]:
# Analogy-style query: closest to 'Brisbane' and 'UK', away from 'Australia' (top hit only).
embeddings_index.most_similar(
    negative=['Australia'],
    positive=['Brisbane', 'UK'],
    topn=1,
)

In [None]:
# Look up the raw embedding stored for a single token and display it.
lookup_word = 'Maccas'
vector = embeddings_index[lookup_word]
vector

**Data pre-processing for using word vectors**
* Split training data into train and validation samples. 
* Fill missing values.
* Tokenise the individual words in text so we can convert the words into vectors
* Pad each sentence so the model can be fed vectors of the same size. Sentences longer than `maxlen` words are truncated to `maxlen` words, and shorter sentences are padded with zeroes.

In [None]:
#Split data to training and validation sets.
#The split is bound to new names instead of overwriting train_df, so re-running
#this cell always splits the full training frame rather than carving another
#10% off a frame that has already been split.
train_split_df, val_df = train_test_split(train_df, test_size = 0.1, random_state=2019)

#Configure the word embedding values
embed_size = 300 #size of each word vector (number of columns in the embedding matrix)
max_features = 50000 #number of unique words (number of rows in the embedding matrix)
maxlen = 100 #maximum number of words in each sentence

#Fill missing values
train_X = train_split_df['question_text'].fillna('_na_').values
val_X = val_df['question_text'].fillna('_na_').values
test_X = test_df['question_text'].fillna('_na_').values

#Tokenise sentences
# Tokenizer is the class for vectorizing texts, or/and turning texts into sequences (=list of word indexes, where the word of rank i in the dataset (starting at 1) has index i).
tokeniser = Tokenizer(num_words = max_features)
tokeniser.fit_on_texts(list(train_X)) #fit on the training split only

#Change the input vectors to lists of numbers corresponding to the word indexes
train_X = tokeniser.texts_to_sequences(train_X)
val_X = tokeniser.texts_to_sequences(val_X)
test_X = tokeniser.texts_to_sequences(test_X)

#Pad the sentences so every row has exactly maxlen entries
train_X = pad_sequences(train_X, maxlen = maxlen)
val_X = pad_sequences(val_X, maxlen = maxlen)
test_X = pad_sequences(test_X, maxlen = maxlen)

#Get target values (sincere or insincere in our case)
train_y = train_split_df['target'].values
val_y = val_df['target'].values

**Training our own embeddings**

The contents of the table that relates indexes to embedding vectors (i.e., the weights of the embedding layer) are initialized at random and then optimized by the training algorithm (e.g., Gradient Descent).

This type of learned embedding is different from an embedding learned through word2vec. Word2vec aims to capture the semantic similarity between words. The embedding trained from scratch is only useful for the particular classification problem.

For example, the following image, taken from [this paper](https://link.springer.com/article/10.1007/s10489-017-1109-7), shows the embedding of three sentences with a Keras Embedding layer trained from scratch as part of a supervised network designed to detect clickbait headlines (left) and with pre-trained word2vec embeddings (right). The word2vec embeddings reflect the semantic similarity between phrases b) and c). Conversely, the embeddings generated by Keras's Embedding layer might be useful for classification, but do not capture the semantic similarity of b) and c).

![](https://i.stack.imgur.com/BNIsVl.png)

 ## Basic Keras Model training our own embeddings

In [None]:
from keras.models import Sequential
from keras import layers

# Baseline network: an embedding table learned from scratch, global max pooling,
# a 10-unit ReLU layer and a single sigmoid output.
model = Sequential([
    layers.Embedding(
        input_dim=max_features,
        output_dim=embed_size,
        input_length=maxlen,
    ),
    layers.GlobalMaxPool1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()


In [None]:
# Train for two epochs on the training split, passing the validation split for monitoring.
model.fit(
    x=train_X,
    y=train_y,
    epochs=2,
    batch_size=512,
    validation_data=(val_X, val_y),
)

In [None]:
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)

# Sweep the decision threshold from 0.10 to 0.50 in steps of 0.01 and report
# the validation F1 obtained at each cut-off.
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    val_labels = (pred_noemb_val_y > thresh).astype(int)
    val_f1 = metrics.f1_score(val_y, val_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, val_f1))

#Best F1 score at threshold 0.29 is 0.6441116865570817

## Basic Keras Model with pre-trained FastText Embeddings

In [None]:
EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'

def get_coefs(word, *arr):
    """Turn one split embedding line into a ``(word, float32 vector)`` pair."""
    return word, np.asarray(arr, dtype='float32')

# Lines of 100 characters or fewer are skipped (presumably the short header line
# at the top of the .vec file). The context manager closes the file once parsed.
with open(EMBEDDING_FILE, encoding='utf-8') as emb_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in emb_file if len(o)>100)

# np.stack needs a real sequence; a dict view is rejected by current NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokeniser.word_index
# Every Embedding layer below is created with input_dim=max_features, so the
# matrix must have exactly max_features rows. Tokenizer indices start at 1, so
# sizing by len(word_index) would also be one row short for a small vocabulary.
nb_words = max_features
# Rows without a pre-trained vector keep a random draw matching the overall
# mean/std of the pre-trained embeddings.
embedding_matrix_2 = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix_2[i] = embedding_vector



In [None]:
# (rows, columns) of the embedding matrix that seeds the Embedding layers.
embedding_matrix_2.shape

In [None]:
from keras.models import Sequential
from keras import layers

# Same architecture as the baseline, but the embedding layer starts from
# embedding_matrix_2 and is frozen (trainable=False).
model = Sequential([
    layers.Embedding(
        input_dim=max_features,
        output_dim=embed_size,
        input_length=maxlen,
        weights=[embedding_matrix_2],
        trainable=False,
    ),
    layers.GlobalMaxPool1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()


In [None]:
# Train for two epochs on the training split, passing the validation split for monitoring.
model.fit(
    x=train_X,
    y=train_y,
    epochs=2,
    batch_size=512,
    validation_data=(val_X, val_y),
)

In [None]:
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)

# Report validation F1 for every cut-off between 0.10 and 0.50 (step 0.01).
report_line = "F1 score at threshold {0} is {1}"
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    val_labels = (pred_noemb_val_y > thresh).astype(int)
    print(report_line.format(thresh, metrics.f1_score(val_y, val_labels)))

#Best F1 score at threshold 0.26 is 0.526683967054045

## Basic Keras model with re-trainable pretrained embeddings

In [None]:
from keras.models import Sequential
from keras import layers

# Same architecture again; the embedding layer starts from embedding_matrix_2
# but is left trainable so the vectors are updated during fitting.
model = Sequential([
    layers.Embedding(
        input_dim=max_features,
        output_dim=embed_size,
        input_length=maxlen,
        weights=[embedding_matrix_2],
        trainable=True,
    ),
    layers.GlobalMaxPool1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()


In [None]:
# Train for two epochs on the training split, passing the validation split for monitoring.
model.fit(
    x=train_X,
    y=train_y,
    epochs=2,
    batch_size=512,
    validation_data=(val_X, val_y),
)

In [None]:
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)

# Report validation F1 for every cut-off between 0.10 and 0.50 (step 0.01).
candidate_thresholds = np.round(np.arange(0.1, 0.501, 0.01), 2)
for thresh in candidate_thresholds:
    val_f1 = metrics.f1_score(val_y, (pred_noemb_val_y > thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, val_f1))

#Best F1 score at threshold 0.35 is 0.6577158774373258

## Bidirectional GRU model (non-trainable, pre-trained FastText embeddings)

In [None]:
# Functional-API model: frozen pre-trained embeddings -> bidirectional GRU
# (64 units per direction, full sequence returned) -> global max pooling ->
# 16-unit ReLU layer -> dropout 0.1 -> single sigmoid output.
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix_2], trainable=False)(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

In [None]:
# Train for two epochs on the training split, passing the validation split for monitoring.
model.fit(
    x=train_X,
    y=train_y,
    epochs=2,
    batch_size=512,
    validation_data=(val_X, val_y),
)

In [None]:
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)

# Report validation F1 for every cut-off between 0.10 and 0.50 (step 0.01).
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    val_labels = (pred_noemb_val_y > thresh).astype(int)
    val_f1 = metrics.f1_score(val_y, val_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, val_f1))

# Best F1 score at threshold 0.31 is 0.664618760285244

## Bidirectional GRU model (trainable, pre-trained FastText embeddings)

In [None]:
# Functional-API model: pre-trained embeddings left trainable -> bidirectional
# GRU (64 units per direction, full sequence returned) -> global max pooling ->
# 16-unit ReLU layer -> dropout 0.1 -> single sigmoid output.
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix_2] ,trainable=True)(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

In [None]:
# Train for two epochs on the training split, passing the validation split for monitoring.
model.fit(
    x=train_X,
    y=train_y,
    epochs=2,
    batch_size=512,
    validation_data=(val_X, val_y),
)

In [None]:
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)

# Report validation F1 for every cut-off between 0.10 and 0.50 (step 0.01).
report_line = "F1 score at threshold {0} is {1}"
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    val_f1 = metrics.f1_score(val_y, (pred_noemb_val_y > thresh).astype(int))
    print(report_line.format(thresh, val_f1))

# Best F1 score at threshold 0.3 is 0.6725162436830121

In [None]:
# Pick the decision threshold from the validation predictions of the model that
# was just evaluated, instead of a hard-coded 0.44 that does not match the best
# validation threshold recorded for this model (0.3).
candidate_thresholds = np.round(np.arange(0.1, 0.501, 0.01), 2)
val_f1_scores = [
    metrics.f1_score(val_y, (pred_noemb_val_y > t).astype(int).ravel())
    for t in candidate_thresholds
]
best_idx = int(np.argmax(val_f1_scores))
best_thresh = candidate_thresholds[best_idx]
print("Submitting with threshold {0} (validation F1 {1})".format(best_thresh, val_f1_scores[best_idx]))

# Score the test set and binarise the probabilities at the chosen threshold.
pred_test_prob = model.predict([test_X], batch_size=1024, verbose=1)
pred_test_y = (pred_test_prob > best_thresh).astype(int)

# The model outputs one column per row; flatten it so the DataFrame column is 1-D.
out_df = pd.DataFrame({"qid":test_df["qid"].values})
out_df['prediction'] = pred_test_y.ravel()
out_df.to_csv("submission.csv", index=False)