In [9]:
import os
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, LSTM
from keras.layers import Embedding, Dense, Flatten, Input
from keras.layers import add, concatenate
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPool1D
from keras.models import Model
from keras.preprocessing import text, sequence
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


# load data from csv file.
def load_data(train_dir, test_dir):
    train = pd.read_csv(train_dir)
    test = pd.read_csv(test_dir)

    train, val = train_test_split(train, test_size=0.1, random_state=42)

    train_x, train_y = train["storyline"], train["label"]
    test_x, test_y = test["storyline"], test["label"]
    val_x, val_y = val["storyline"], val["label"]

    return train_x, train_y, test_x, test_y, val_x, val_y


# convert Text data to vector.
def pre_procissing(train_x, test_x, val_x):
    CHARS_TO_REMOVE = r'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—'

    train_x = train_x.tolist()
    test_x = test_x.tolist()
    val_x = val_x.tolist()

    tokenizer = text.Tokenizer(filters=CHARS_TO_REMOVE)
    tokenizer.fit_on_texts(train_x + test_x + val_x)  # Make dictionary

    # Text match to dictionary.
    train_x = tokenizer.texts_to_sequences(train_x)
    test_x = tokenizer.texts_to_sequences(test_x)
    val_x = tokenizer.texts_to_sequences(val_x)

    temp_list = []
    total_list = list(train_x) + list(test_x) + list(val_x)

    for i in range(0, len(total_list)):
        temp_list.append(len(total_list[i]))

    max_len = max(temp_list)

    train_x = sequence.pad_sequences(train_x, maxlen=max_len, padding='post')
    test_x = sequence.pad_sequences(test_x, maxlen=max_len, padding='post')
    val_x = sequence.pad_sequences(val_x, maxlen=max_len, padding='post')

    return train_x, test_x, val_x, tokenizer


def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')


def load_embeddings(path):
    with open(path, encoding="utf-8") as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in f)


# Pre-trained embedding match to my dataset.
def text_to_vector(word_index, path, word_dimension):
    # If you change your embedding.pickle file, you must make new embedding.pickle file.
    if os.path.isfile("embedding_binary.pickle"):
        with open("embedding_binary.pickle", 'rb') as rotten_file:
            embedding_matrix = pickle.load(rotten_file)

    else:
        embedding_index = load_embeddings(path)
        embedding_matrix = np.zeros((len(word_index) + 1, 50))
        for word, i in word_index.items():
            try:
                embedding_matrix[i] = embedding_index[word]
            except KeyError:
                pass

        with open("embedding_binary.pickle", 'wb') as handle:
            pickle.dump(embedding_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return embedding_matrix


def build_model(size, embedding_matrix):
    ### Hyper Parameter
    lstm_units = 128
    hidden_units = 512

    ### Model Architecture
    input_layer = Input(shape=(size,))

    embedding_layer = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(input_layer)

    lstm_layer = LSTM(128, return_sequences=True)(embedding_layer)
    hidden_layer = Flatten()(lstm_layer)

    output_layer = Dense(1, activation='sigmoid')(hidden_layer)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model


def evaluate(model, test_x, test_y):
    prediction = model.predict(test_x)
    y_pred = (prediction > 0.5)

    accuracy = accuracy_score(test_y, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print(classification_report(test_y, y_pred, target_names=["0", "1"]))


def main():
    train_dir = "train.csv"
    test_dir = "test.csv"
    embedding_dir = "glove.6B.50d.txt"


    ### Flow
    train_x, train_y, test_x, test_y, val_x, val_y = load_data(train_dir, test_dir)

    train_x, test_x, val_x, tokenizer = pre_procissing(train_x, test_x, val_x)

    embedding_matrix = text_to_vector(tokenizer.word_index, embedding_dir, word_dimension=300)

    model = build_model(train_x.shape[1], embedding_matrix)
    model.fit(x=train_x, y=train_y, epochs=3, batch_size=128, validation_data=(val_x, val_y))

    evaluate(model, test_x, test_y)


if __name__ == '__main__':
    main()

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 258)]             0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 258, 50)           474800    
_________________________________________________________________
lstm_2 (LSTM)                (None, 258, 128)          91648     
_________________________________________________________________
flatten_2 (Flatten)          (None, 33024)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33025     
Total params: 599,473
Trainable params: 124,673
Non-trainable params: 474,800
_________________________________________________________________
Epoch 1/3

KeyboardInterrupt: ignored