In [None]:
import pandas as pd
import numpy as np
import helpers_py
from nltk import word_tokenize
import pickle #to load Glove
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding,LSTM, Bidirectional, Conv1D, GlobalMaxPooling1D, Dense, Dropout  
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Read the competition train/test CSVs from the relative ../input directory.
train_data = pd.read_csv("../input/nlp-getting-started/train.csv")
test_data = pd.read_csv("../input/nlp-getting-started/test.csv")
# Separate file whose 'target' column is used as ground truth when scoring the
# test predictions at the end of the notebook.
test_target = pd.read_csv("../input/test-data-with-the-target/test_target.csv")

Preprocessing Data

In [None]:
# Clean every raw text with the project helper and keep the result next to the
# original in a new 'text_clean' column (the cleaning rules live in helpers_py).
train_data['text_clean'] = train_data['text'].apply(helpers_py.data_preprocessing)
test_data['text_clean'] = test_data['text'].apply(helpers_py.data_preprocessing)
# NOTE(review): fix_labels presumably provides the 'target_fixed' column read
# later by OneHotEncoder — confirm in helpers_py.
train_data = helpers_py.fix_labels(train_data)

Word Tokenizing with NLTK

In [None]:
# Split each cleaned text into a list of word tokens with NLTK, then store the
# token lists alongside the text in a 'tokens' column.
train_tokens = list(map(word_tokenize, train_data['text_clean']))
test_tokens = list(map(word_tokenize, test_data['text_clean']))
train_data['tokens'] = train_tokens
test_data['tokens'] = test_tokens

In [None]:
# Preview the first rows only instead of rendering the whole training frame.
train_data.head()

In [None]:
# Preview the first rows only instead of rendering the whole test frame.
test_data.head()

In [None]:
def OneHotEncoder(data):
    """Add one-hot ``Real`` / ``Not_Real`` label columns derived from ``target_fixed``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``id``, ``keyword``, ``location``, ``text``,
        ``text_clean``, ``tokens``, ``target`` and ``target_fixed``. Every
        ``target_fixed`` value must be 0 or 1.

    Returns
    -------
    pandas.DataFrame
        A new frame restricted to the columns listed above followed by
        ``Real`` (1 where ``target_fixed == 1``, else 0) and ``Not_Real``
        (1 where ``target_fixed == 0``, else 0). The input frame is not
        modified, so re-running the calling cell is safe.

    Raises
    ------
    ValueError
        If ``target_fixed`` contains anything other than 0 or 1 (including
        missing values); such rows cannot be encoded.
    """
    target = data['target_fixed']
    is_real = target == 1
    is_not_real = target == 0

    # Fail loudly with the offending values rather than letting unmatched rows
    # surface later as an opaque length-mismatch error.
    unmatched = ~(is_real | is_not_real)
    if unmatched.any():
        bad_values = target[unmatched].unique().tolist()
        raise ValueError(
            "target_fixed must contain only 0 or 1; found: {}".format(bad_values)
        )

    output_columns = ['id', 'keyword', 'location', 'text', 'text_clean', 'tokens',
                      'target', 'target_fixed', 'Real', 'Not_Real']
    # assign() returns a new frame, leaving the caller's frame untouched.
    encoded = data.assign(Real=is_real.astype('int64'),
                          Not_Real=is_not_real.astype('int64'))
    return encoded[output_columns]

In [None]:
# Rebind train_data to the frame returned by OneHotEncoder, which carries the
# 'Real' and 'Not_Real' label columns used later as training targets.
train_data = OneHotEncoder(train_data)

Building the train vocabulary

In [None]:
# Flatten the per-row token lists into one list of every token occurrence,
# then reduce it to the sorted set of distinct words.
train_words = [word for tokens in train_data["tokens"] for word in tokens]
train_vocab = sorted(set(train_words))
# The first figure counts token occurrences (with repeats); the second counts
# distinct words. They are labelled differently so the output is unambiguous.
print("The total number of tokens in the train data:", len(train_words))
print("The total number of words in the train vocabulary:",  len(train_vocab))

Building the test vocabulary

In [None]:
# Flatten the per-row token lists into one list of every token occurrence,
# then reduce it to the sorted set of distinct words.
test_words = [word for tokens in test_data["tokens"] for word in tokens]
test_vocab = sorted(set(test_words))
# The first figure counts token occurrences (with repeats); the second counts
# distinct words. They are labelled differently so the output is unambiguous.
print("The total number of tokens in the test data:", len(test_words))
print("The total number of words in the test vocabulary:",  len(test_vocab))

Loading the Glove pretrained vector

In [None]:
# Load the pre-pickled GloVe table; later cells use it as a mapping from a word
# to its embedding vector.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so this is
# only appropriate for a pickle obtained from a trusted source.
with open('../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl', 'rb') as glove_file:
    glove = pickle.load(glove_file)
print('Glove is Loaded ')

Building the train dictionary with Keras Tokenizer

In [None]:
# Fit the Keras tokenizer on the cleaned training text only; the test text is
# encoded with the same word index so ids line up with the embedding matrix.
#
# num_words is deliberately left unset. When it is set, texts_to_sequences only
# emits indices strictly below num_words, so capping it at len(train_vocab)
# (a count produced by a different tokenizer, NLTK) could silently drop the
# rarest words even though the embedding matrix built from word_index has a
# row for every one of them.
tokenizer = Tokenizer(lower=True, char_level=False)
tokenizer.fit_on_texts(train_data["text_clean"].tolist())
train_word_index = tokenizer.word_index

# No oov_token is configured, so words missing from the fitted index are
# omitted from the encoded sequences.
training_sequences = tokenizer.texts_to_sequences(train_data["text_clean"].tolist())
test_sequences = tokenizer.texts_to_sequences(test_data["text_clean"].tolist())

Padding sequences to the maximum sequence length from EDA

In [None]:
# Bring every encoded text to exactly 23 ids (Keras' default padding and
# truncation settings apply) so the train and test inputs share one shape.
train_pad_seq, test_pad_seq = [
    pad_sequences(encoded_texts, maxlen=23)
    for encoded_texts in (training_sequences, test_sequences)
]

Building Glove embedding weights

In [None]:
# Dedicated, seeded generator for words that GloVe does not cover. Without a
# fixed seed the frozen embedding matrix (and hence every training run) would
# differ between kernel restarts. Values are still uniform in [0, 1).
oov_rng = np.random.RandomState(42)

# One row per tokenizer index plus one extra, so the largest index in
# word_index is addressable; rows the loop never assigns stay all-zero.
train_embedding_weights = np.zeros((len(train_word_index) + 1, 300))
for word, index in train_word_index.items():
    # Use the pretrained vector when available, else a reproducible random one.
    train_embedding_weights[index, :] = glove[word] if word in glove else oov_rng.rand(300)

In [None]:
def BiLSTM_CNN_Model(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    """Build, compile and summarise a BiLSTM + stacked-Conv1D text classifier.

    Layer order: frozen Embedding -> Dropout(0.3) -> Bidirectional LSTM
    (300 units, relu, returns the full sequence) -> three Conv1D layers
    (300 filters each, kernel sizes 2, 3 and 4, relu) -> GlobalMaxPooling1D ->
    Dense(150, relu) -> Dropout(0.2) -> Dense(75, relu) -> Dropout(0.2) ->
    Dense(labels_index, sigmoid).

    Parameters
    ----------
    embeddings : array-like
        Initial weights handed to the Embedding layer via ``weights=[embeddings]``;
        the layer is created with ``trainable=False``.
    max_sequence_length : int
        Number of token ids in each input sequence.
    num_words : int
        Input dimension of the Embedding layer.
    embedding_dim : int
        Output dimension of the Embedding layer.
    labels_index : int
        Number of units in the output layer.

    Returns
    -------
    keras.models.Model
        Compiled with binary cross-entropy loss, the Adam optimizer and the
        'acc' metric. The model summary is printed as a side effect.
    """
    # The Embedding layer is instantiated before the Input tensor, as before.
    frozen_embedding = Embedding(
        num_words,
        embedding_dim,
        weights=[embeddings],
        input_length=max_sequence_length,
        trainable=False,
    )

    token_ids = Input(shape=(max_sequence_length,), dtype='int32')
    hidden = frozen_embedding(token_ids)
    hidden = Dropout(0.3)(hidden)
    hidden = Bidirectional(LSTM(300, activation='relu', return_sequences=True))(hidden)

    # Stacked (not parallel) convolutions: each one consumes the previous output.
    for kernel_width in (2, 3, 4):
        hidden = Conv1D(filters=300, kernel_size=kernel_width, activation='relu')(hidden)
    hidden = GlobalMaxPooling1D()(hidden)

    # Two fully connected blocks, each followed by dropout.
    for units in (150, 75):
        hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(0.2)(hidden)

    # NOTE(review): sigmoid + binary_crossentropy scores each output unit
    # independently; confirm this is intended for the label encoding in use.
    outputs = Dense(labels_index, activation='sigmoid')(hidden)

    model = Model(token_ids, outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()
    return model

In [None]:
# Column order fixes what each target column means: position 0 is 'Real',
# position 1 is 'Not_Real'.
label_names = ['Real', 'Not_Real']
y_train = train_data[label_names].values
# Model inputs are the padded id sequences.
x_train = train_pad_seq

In [None]:
# Arguments are spelled out by name; the values match the padded length (23),
# the embedding matrix row count and width (300), and the two label columns.
model = BiLSTM_CNN_Model(
    embeddings=train_embedding_weights,
    max_sequence_length=23,
    num_words=len(train_word_index) + 1,
    embedding_dim=300,
    labels_index=2,
)

In [None]:
# Training hyperparameters consumed by model.fit below.
num_epochs, batch_size = 30, 512

In [None]:
# Train with 20% of the training rows held out for validation; the returned
# History object is kept in `hist`.
hist = model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.2,
    shuffle=True,
)

In [None]:
# Score the padded test sequences; one row of output-unit scores per test text.
predictions = model.predict(x=test_pad_seq, batch_size=512, verbose=1)

In [None]:
# labels[i] is the class reported when output unit i has the highest score:
# unit 0 was trained on the 'Real' column (-> 1), unit 1 on 'Not_Real' (-> 0).
labels = [1, 0]
prediction_labels = [labels[np.argmax(unit_scores)] for unit_scores in predictions]

In [None]:
# Ground truth comes from the separately loaded target file; predictions are
# the decoded class labels.
y_true, y_pred = test_target['target'], prediction_labels

In [None]:
# Report each sklearn metric on the test predictions, in a fixed order.
for metric_caption, metric_fn in (
    ("Accuracy score: ", accuracy_score),
    ("Precision score: ", precision_score),
    ("Recall score: ", recall_score),
    ("F1 score: ", f1_score),
):
    print(metric_caption, metric_fn(y_true, y_pred))