In [1]:
import random
import re
import os
import tqdm
import xml.etree.ElementTree as ET

import numpy as np
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import tokenize

Using TensorFlow backend.


In [2]:
class Debug:
    def __init__(self, debug_mode=True):
        self.debug_mode = debug_mode
        self.flag = {}

    def log(self, target, flag=None):
        if self.debug_mode:
            if flag is None:
                print(target)
            else:
                if flag in self.flag.keys():
                    if self.flag[flag]:
                        print(target)

    def set_flag(self, flag: str, val: bool):
        self.flag[flag] = val

debug = Debug(True)

In [3]:
def token_check(target):
    return target not in stopwords.words('english')

In [4]:
class GeneratorExceptions(Exception):
    """
    The Exception class for tracking all exceptions raised in data generator
    Param
        text: the displayed text
    """
    def __init__(self, text: str):
        self.text = text

class temp_generator:
    def __init__(self, dataset_file_path : str="data/dataset/nysk.xml", processed_dataset_path: str ="data/processed_dataset/"):
        self.dataset_file_path = dataset_file_path
        self.processed_dataset_path = processed_dataset_path
        self.preprocess_data(override=False)

    def preprocess_data(self, override=False):
        if os.path.isfile(self.dataset_file_path):
            if not os.path.isdir(self.processed_dataset_path):
                os.mkdir(self.processed_dataset_path)

            res = os.listdir(self.processed_dataset_path)

            with open(self.dataset_file_path, "r", encoding="utf-8") as f:
                doc = ET.ElementTree(file=f)

            root = doc.getroot()
            print(len(root))

            for item in tqdm.tqdm(root):
                news_id = item.findtext('docid')
                source = item.findtext('source')
                url = item.findtext('url')
                title = item.findtext('title')
                summary = item.findtext('summary')
                text = item.findtext('text')

                title = re.sub(r"<.*>", "", title)
                title = re.sub(r"\W", "_", title)
                title = f"{news_id}_{title[:10]}"

                fp = f"{self.processed_dataset_path}{title}.txt"
                if not os.path.isfile(fp) or override:
                    with open(fp, 'w', encoding='utf-8') as f:
                        f.write(text)
        else:
            raise GeneratorExceptions("Path doesn't exist")
    
    def get_one(self):
        f_list = os.listdir(self.processed_dataset_path)
        with open(f"{self.processed_dataset_path}/{f_list[0]}", 'r') as f:
            text = f.read()
            res = tokenize.sent_tokenize(text)
        if debug.debug_mode:
            for i in res:
                print(i)
        return res

In [5]:
dg = temp_generator()
sample_text = dg.get_one()

input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

for i in range(0, len(sample_text)-1):
    input_t = f"\t{sample_text[i]}\n"
    target_t = f"\t{sample_text[i+1]}\n"
    input_texts.append(input_t)
    target_texts.append(target_t)
    
    for c in input_t:
        if c not in input_characters:
            input_characters.add(c)
    
    for c in target_t:
        if c not in target_characters:
            target_characters.add(c)

input_char_list = sorted(list(input_characters))
target_char_list = sorted(list(target_characters))

num_encoder_tokens = len(input_char_list)
num_decoder_tokens = len(target_char_list)

max_encoder_sequence_len = max([len(t) for t in input_texts])
max_decoder_sequence_len = max([len(t) for t in target_texts])

print("Number of samples:", len(input_texts))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_sequence_len)
print("Max sequence length for outputs:", max_decoder_sequence_len)

100%|██████████| 10421/10421 [00:00<00:00, 22113.47it/s]


10421
The Neique Strauss-Kahn of sexual assault lived in an apartment exclusively for patients with HIV and AIDS.
From the Post: The hotel maid, a West African immigrant, has occupied the fourth-floor High Bridge pad with her 15-year-old daughter since January -- and before that, lived in another Bronx apartment set aside by Harlem Community AIDS United strictly for adults with the virus and their families.
The paper was unable to confirm if the accuser has HIV or AIDS because of medical confidentiality laws, but the Post confirmed that the agency rents apartments only for adults with the disease.
A Harlem United employee said at least one adult in the apartment must be HIV-positive or have AIDS to qualify to live in one of the residences.
Sources told the Post that only the alleged victim and her child lived in the apartment.
Strauss-Kahn is accused of forcing the woman to perform oral sex on him.
She told police that after the forced act, she spit his semen onto the floor.
According 

In [6]:
input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_sequence_len, num_encoder_tokens), dtype="float32"
)
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_sequence_len, num_decoder_tokens), dtype="float32"
)
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_sequence_len, num_decoder_tokens), dtype="float32"
)

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
        encoder_input_data[i, t + 1 :, input_token_index[" "]] = 1.0
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
            decoder_input_data[i, t + 1 :, target_token_index[" "]] = 1.0
            decoder_target_data[i, t:, target_token_index[" "]] = 1.0

print(encoder_input_data)
print(decoder_target_data)

[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  ...
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  ...
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  ...
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  ...
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  ...
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  ...
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]]]
[[[0. 0. 0

In [7]:
batch_size = 64  
epochs = 100  
latent_dim = 256  
num_samples = 10000

encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
encoder = keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

encoder_states = [state_h, state_c]

decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))

decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [8]:
model_name = "Model\SeqToSeq_Model"

model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 61)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 61)]   0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 325632      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  325632      input_2[0][0]                    
                                                                 lstm[0][1]                   

In [9]:
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=1,
    validation_split=0.2,
)

Train on 11 samples, validate on 3 samples


<tensorflow.python.keras.callbacks.History at 0x1f82e160fc8>

In [13]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 61)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 61)]   0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 325632      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  325632      input_2[0][0]                    
                                                                 lstm[0][1]                   

In [None]:
model.save_weights("Model/test.weights.hdf5")
# keras.models.save_model(model, model_name)

In [11]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index["\t"]] = 1.0

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_sequence_len:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0

        # Update states
        states_value = [h, c]
    return decoded_sentence

In [14]:
new_model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
new_model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)

new_model.summary()

new_model.load_weights("Model/test.weights.hdf5")

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 61)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 61)]   0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 325632      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  325632      input_2[0][0]                    
                                                                 lstm[0][1]                 

In [16]:
encoder_inputs = new_model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = new_model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_inputs = new_model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(latent_dim,), name="input_3")
decoder_state_input_c = keras.Input(shape=(latent_dim,), name="input_4")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = new_model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = new_model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

In [18]:
for seq_index in range(5):
    print(seq_index)
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print("-")
    print("Input sentence:", input_texts[seq_index])
    print("Decoded sentence:", decoded_sentence)


0
-
Input sentence: 	The Neique Strauss-Kahn of sexual assault lived in an apartment exclusively for patients with HIV and AIDS.

Decoded sentence:                                                                                                                                                                                                                                                                                                                                                                                                                                            
1
-
Input sentence: 	From the Post: The hotel maid, a West African immigrant, has occupied the fourth-floor High Bridge pad with her 15-year-old daughter since January -- and before that, lived in another Bronx apartment set aside by Harlem Community AIDS United strictly for adults with the virus and their families.

Decoded sentence:                                                                                        