In [1]:
import random
import re
import os
import tqdm
import xml.etree.ElementTree as ET

import numpy as np
from tensorflow import keras
from nltk.corpus import stopwords
from nltk import tokenize

In [2]:
class Debug:
    def __init__(self, debug_mode=True):
        self.debug_mode = debug_mode
        self.flag = {}

    def log(self, target, flag=None):
        if self.debug_mode:
            if flag is None:
                print(target)
            else:
                if flag in self.flag.keys():
                    if self.flag[flag]:
                        print(target)

    def set_flag(self, flag: str, val: bool):
        self.flag[flag] = val

debug = Debug(True)

In [3]:
class GeneratorExceptions(Exception):
    """
    The Exception class for tracking all exceptions raised in data generator
    Param
        text: the displayed text
    """
    def __init__(self, text: str):
        self.text = text

class temp_generator:
    def __init__(self, dataset_file_path : str="data/dataset/nysk.xml", processed_dataset_path: str ="data/processed_dataset/"):
        self.dataset_file_path = dataset_file_path
        self.processed_dataset_path = processed_dataset_path
        self.preprocess_data(override=False)

    def preprocess_data(self, override=False):
        if os.path.isfile(self.dataset_file_path):
            if not os.path.isdir(self.processed_dataset_path):
                os.mkdir(self.processed_dataset_path)

            res = os.listdir(self.processed_dataset_path)

            with open(self.dataset_file_path, "r", encoding="utf-8") as f:
                doc = ET.ElementTree(file=f)

            root = doc.getroot()
            print(len(root))

            for item in tqdm.tqdm(root):
                news_id = item.findtext('docid')
                source = item.findtext('source')
                url = item.findtext('url')
                title = item.findtext('title')
                summary = item.findtext('summary')
                text = item.findtext('text')

                title = re.sub(r"<.*>", "", title)
                title = re.sub(r"\W", "_", title)
                title = f"{news_id}_{title[:10]}"

                fp = f"{self.processed_dataset_path}{title}.txt"
                if not os.path.isfile(fp) or override:
                    with open(fp, 'w', encoding='utf-8') as f:
                        f.write(text)
        else:
            raise GeneratorExceptions("Path doesn't exist")
    
    def get_one(self):
        f_list = os.listdir(self.processed_dataset_path)
        with open(f"{self.processed_dataset_path}/{f_list[0]}", 'r') as f:
            text = f.read()
            res = tokenize.sent_tokenize(text)
        if debug.debug_mode:
            for i in res:
                print(i)
        return res
    

In [4]:
def generate_model(num_encoder_tokens, num_decoder_tokens, latent_dim=256):
      
    encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
    encoder = keras.layers.LSTM(latent_dim, return_state=True)
    encoder_outputs_, state_h, state_c = encoder(encoder_inputs)
    
    encoder_states = [state_h, state_c]
    
    decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))
    
    decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
    decoder_outputs = decoder_dense(decoder_outputs)
    
    model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model

In [5]:
# TODO: put this into actual data generator
dg = temp_generator()
sample_text = dg.get_one()

input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

for i in range(0, len(sample_text)-1):
    input_t = f"\t{sample_text[i]}\n"
    target_t = f"\t{sample_text[i+1]}\n"
    input_texts.append(input_t)
    target_texts.append(target_t)
    
    for c in input_t:
        if c not in input_characters:
            input_characters.add(c)
    
    for c in target_t:
        if c not in target_characters:
            target_characters.add(c)

input_char_list = sorted(list(input_characters))
target_char_list = sorted(list(target_characters))

encoder_tokens_count = len(input_char_list)
decoder_tokens_count = len(target_char_list)

max_encoder_sequence_len = max([len(t) for t in input_texts])
max_decoder_sequence_len = max([len(t) for t in target_texts])

print("Number of samples:", len(input_texts))
print("Number of unique input tokens:", encoder_tokens_count)
print("Number of unique output tokens:", decoder_tokens_count)
print("Max sequence length for inputs:", max_encoder_sequence_len)
print("Max sequence length for outputs:", max_decoder_sequence_len)

100%|██████████| 10421/10421 [00:02<00:00, 3666.78it/s]


10421
The Neique Strauss-Kahn of sexual assault lived in an apartment exclusively for patients with HIV and AIDS.
From the Post: The hotel maid, a West African immigrant, has occupied the fourth-floor High Bridge pad with her 15-year-old daughter since January -- and before that, lived in another Bronx apartment set aside by Harlem Community AIDS United strictly for adults with the virus and their families.
The paper was unable to confirm if the accuser has HIV or AIDS because of medical confidentiality laws, but the Post confirmed that the agency rents apartments only for adults with the disease.
A Harlem United employee said at least one adult in the apartment must be HIV-positive or have AIDS to qualify to live in one of the residences.
Sources told the Post that only the alleged victim and her child lived in the apartment.
Strauss-Kahn is accused of forcing the woman to perform oral sex on him.
She told police that after the forced act, she spit his semen onto the floor.
According 

In [6]:
input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_sequence_len, encoder_tokens_count), dtype="float32"
)
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_sequence_len, decoder_tokens_count), dtype="float32"
)
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_sequence_len, decoder_tokens_count), dtype="float32"
)

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
        encoder_input_data[i, t + 1 :, input_token_index[" "]] = 1.0
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
            decoder_input_data[i, t + 1 :, target_token_index[" "]] = 1.0
            decoder_target_data[i, t:, target_token_index[" "]] = 1.0

print(encoder_input_data)
print(decoder_target_data)



[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]
[[[0. 1. 0

In [7]:
latent_dim = 256

model = generate_model(num_encoder_tokens=encoder_tokens_count, 
                       num_decoder_tokens=decoder_tokens_count,
                       latent_dim=latent_dim)

model_name = "Model\SeqToSeq_Model"

model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)

In [8]:
batch_size = 64  
epochs = 1  

model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

Train on 11 samples, validate on 3 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoc

<tensorflow.python.keras.callbacks.History at 0x18781b4de88>

In [9]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 61)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 61)]   0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 325632      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  325632      input_2[0][0]                    
                                                                 lstm[0][1]                   

In [10]:
model.save_weights("Model/test.weights.hdf5")
# keras.models.save_model(model, model_name)

In [11]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1, decoder_tokens_count))
    target_seq[0, 0, target_token_index["\t"]] = 1.0

    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_sequence_len:
            stop_condition = True

        target_seq = np.zeros((1, 1, decoder_tokens_count))
        target_seq[0, 0, sampled_token_index] = 1.0

        states_value = [h, c]
    return decoded_sentence

In [12]:
new_model = generate_model(num_encoder_tokens=encoder_tokens_count, 
                           num_decoder_tokens=decoder_tokens_count,
                           latent_dim=latent_dim)
new_model.compile(
    optimizer="rmsprop", 
    loss="categorical_crossentropy", 
    metrics=["accuracy"]
)

new_model.summary()

new_model.load_weights("Model/test.weights.hdf5")

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, None, 61)]   0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None, 61)]   0                                            
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, 256), (None, 325632      input_3[0][0]                    
__________________________________________________________________________________________________
lstm_3 (LSTM)                   [(None, None, 256),  325632      input_4[0][0]                    
                                                                 lstm_2[0][1]               

In [16]:
encoder_inputs = new_model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = new_model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_inputs = new_model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(latent_dim,), name="input_6")
decoder_state_input_c = keras.Input(shape=(latent_dim,), name="input_7")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = new_model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = new_model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

In [17]:
for seq_index in range(5):
    print(seq_index)
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print("-")
    print("Input sentence:", input_texts[seq_index])
    print("Decoded sentence:", decoded_sentence)


0
-
Input sentence: 	The Neique Strauss-Kahn of sexual assault lived in an apartment exclusively for patients with HIV and AIDS.

Decoded sentence: F                                                                                                                                                                                                                                                                                                                                                                                                                                          
1
-
Input sentence: 	From the Post: The hotel maid, a West African immigrant, has occupied the fourth-floor High Bridge pad with her 15-year-old daughter since January -- and before that, lived in another Bronx apartment set aside by Harlem Community AIDS United strictly for adults with the virus and their families.

Decoded sentence: S                                                                                      