In [1]:
# Keras Imports (Data prep, model training)
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, TimeDistributed, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard


# To Create and Load Word Embeddings
from gensim.models import Word2Vec, KeyedVectors


# For Tokenizer
from nltk.tokenize import word_tokenize
import json
import io

import pandas as pd
import numpy as np




In [2]:
# Variables

# Data
# Location of the cleansed Reddit scrape (Kaggle dataset mount).
# When running outside Kaggle, point file_path at a local copy of the CSV instead.
file_path = '/kaggle/input/two-sentence-horror-jan-2015-apr-2023/reddit_cleansed_data.csv'

# Window size: every training sample is NGRAM - 1 context tokens followed by one target token
NGRAM = 3
# Dimensionality of each word vector (passed to Word2Vec as vector_size)
EMBEDDINGS_SIZE = 100

### 1. Load Data

In [3]:
# Read the cleansed Reddit scrape into a Pandas DF
df = pd.read_csv(file_path)

# Remove rows that have a NaN in any column
df = df.dropna()

# Create a boolean mask for rows to keep
# Text contains "&amp;#x200B;" which is a zero-width space (tokenized incorrectly),
# so every row mentioning it in either text column is discarded.
mask = ~(
    df['title'].str.contains('x200B|&amp;#x200B;', na=False) |
    df['selftext'].str.contains('x200B|&amp;#x200B;', na=False)
)

# Apply the mask. The explicit .copy() makes the filtered frame independent of
# the unfiltered one, so the column assignments below do not write into a
# slice (which is what triggers pandas' SettingWithCopyWarning).
df = df[mask].copy()

# Convert 'title' and 'selftext' to lowercase strings
# (lowercase to match case in the Word2Vec embedding and the Keras Tokenizer)
df['title'] = df['title'].astype("string").str.lower()
df['selftext'] = df['selftext'].astype("string").str.lower()

df

Unnamed: 0,title,selftext,score,num_comments,gilded_count,date,timestamp
0,"""do not expose any part of your body to the air.""","""i repeat..this is not a drill..""",65.0,5.0,0,1.428090e+09,2015-04-03 19:47:13
1,"i sometimes remember the way he looked, broken...",i neglected to make sure he was dead.,22.0,0.0,0,1.428235e+09,2015-04-05 11:55:10
2,i live alone on the third floor of my apartmen...,so who opens my window every night while i'm s...,35.0,3.0,0,1.428370e+09,2015-04-07 01:24:42
3,"i heard the rain hitting my window, so i walke...","my window wasn't wet, but the glass was covere...",28.0,3.0,0,1.428385e+09,2015-04-07 05:40:55
4,you know how sometimes your brain plays tricks...,i caught one of those things today.,84.0,6.0,0,1.428563e+09,2015-04-09 07:03:16
...,...,...,...,...,...,...,...
94081,as i look thru at window i see something inhumane,my reflection helps me remember how well my su...,31.0,2.0,0,1.680377e+09,2023-04-01 19:21:54
94082,i’ve always been passionate about conspiracy t...,"so when my wife had twins, i knew exactly what...",27.0,8.0,0,1.680377e+09,2023-04-01 19:24:55
94083,"""you'll see me on the red carpet one day,"" sai...","so i paid her a surprise visit, and upon walki...",23.0,2.0,0,1.680378e+09,2023-04-01 19:38:03
94084,i could hear my sister screaming nearby as i s...,but my heart sank when i remembered the monste...,60.0,3.0,0,1.680378e+09,2023-04-01 19:41:01


### 2. Encode text into integers (tokenization)

In [4]:
# Tokenize data - Functions

def tokenize_and_add_tokens(sentence: str, ngram_value: int) -> list:
    """
    Tokenize a sentence and wrap it in sentence-boundary tokens.

    Args:
        sentence (str): sentence to tokenize
        ngram_value (int): n-gram window size; (ngram_value - 1) copies of
            <s> are prepended and (ngram_value - 1) copies of </s> are appended

    Returns:
        (list): start tokens, then the word tokens, then end tokens

    Raises:
        TypeError: if sentence is not a str
    """
    # Fail loudly and early: previously a non-str value was only printed and
    # then passed on to word_tokenize anyway.
    if not isinstance(sentence, str):
        raise TypeError(
            f"sentence must be a str, got {type(sentence).__name__}: {sentence!r}"
        )

    # Number of boundary tokens needed on each side of the sentence
    boundary_count = ngram_value - 1

    # Tokenize the sentence
    tokens = word_tokenize(sentence)

    # Combine the tokens
    return ['<s>'] * boundary_count + tokens + ['</s>'] * boundary_count

In [5]:
# Tokenize data - Execution
# Both text columns go through the same tokenizer; the result is stored next to
# the source column under "<column>_tokenized".

for text_column in ('title', 'selftext'):
    df[f'{text_column}_tokenized'] = df[text_column].apply(tokenize_and_add_tokens, ngram_value=NGRAM)

df

Unnamed: 0,title,selftext,score,num_comments,gilded_count,date,timestamp,title_tokenized,selftext_tokenized
0,"""do not expose any part of your body to the air.""","""i repeat..this is not a drill..""",65.0,5.0,0,1.428090e+09,2015-04-03 19:47:13,"[<s>, <s>, ``, do, not, expose, any, part, of,...","[<s>, <s>, ``, i, repeat..this, is, not, a, dr..."
1,"i sometimes remember the way he looked, broken...",i neglected to make sure he was dead.,22.0,0.0,0,1.428235e+09,2015-04-05 11:55:10,"[<s>, <s>, i, sometimes, remember, the, way, h...","[<s>, <s>, i, neglected, to, make, sure, he, w..."
2,i live alone on the third floor of my apartmen...,so who opens my window every night while i'm s...,35.0,3.0,0,1.428370e+09,2015-04-07 01:24:42,"[<s>, <s>, i, live, alone, on, the, third, flo...","[<s>, <s>, so, who, opens, my, window, every, ..."
3,"i heard the rain hitting my window, so i walke...","my window wasn't wet, but the glass was covere...",28.0,3.0,0,1.428385e+09,2015-04-07 05:40:55,"[<s>, <s>, i, heard, the, rain, hitting, my, w...","[<s>, <s>, my, window, was, n't, wet, ,, but, ..."
4,you know how sometimes your brain plays tricks...,i caught one of those things today.,84.0,6.0,0,1.428563e+09,2015-04-09 07:03:16,"[<s>, <s>, you, know, how, sometimes, your, br...","[<s>, <s>, i, caught, one, of, those, things, ..."
...,...,...,...,...,...,...,...,...,...
94081,as i look thru at window i see something inhumane,my reflection helps me remember how well my su...,31.0,2.0,0,1.680377e+09,2023-04-01 19:21:54,"[<s>, <s>, as, i, look, thru, at, window, i, s...","[<s>, <s>, my, reflection, helps, me, remember..."
94082,i’ve always been passionate about conspiracy t...,"so when my wife had twins, i knew exactly what...",27.0,8.0,0,1.680377e+09,2023-04-01 19:24:55,"[<s>, <s>, i, ’, ve, always, been, passionate,...","[<s>, <s>, so, when, my, wife, had, twins, ,, ..."
94083,"""you'll see me on the red carpet one day,"" sai...","so i paid her a surprise visit, and upon walki...",23.0,2.0,0,1.680378e+09,2023-04-01 19:38:03,"[<s>, <s>, ``, you, 'll, see, me, on, the, red...","[<s>, <s>, so, i, paid, her, a, surprise, visi..."
94084,i could hear my sister screaming nearby as i s...,but my heart sank when i remembered the monste...,60.0,3.0,0,1.680378e+09,2023-04-01 19:41:01,"[<s>, <s>, i, could, hear, my, sister, screami...","[<s>, <s>, but, my, heart, sank, when, i, reme..."


In [6]:
# Tokenize data - Ref Variable
# Plain Python lists of token lists: titles (X), selftexts (Y), and both
# together (all titles first, then all selftexts).

tokenized_data_X = df['title_tokenized'].to_list()
tokenized_data_Y = df['selftext_tokenized'].to_list()
tokenized_data = tokenized_data_X + tokenized_data_Y


In [7]:
# Tokenize Data - Create Dictionary Mapping
# tokenized_data is already a list of token lists (titles followed by selftexts).
# NOTE(review): '<s>' and '</s>' are expected to end up as vocabulary entries
# (generate_seq looks them up in word_index) - confirm Keras applies no
# character filtering to pre-tokenized list inputs.

tokenizer_on_words = Tokenizer()
# Build the word -> integer index from every token list
tokenizer_on_words.fit_on_texts(tokenized_data)
# Convert each token list into the matching list of integer indices
encoded_on_words = tokenizer_on_words.texts_to_sequences(tokenized_data)

# tokenizer_on_words.word_index


In [8]:
# Print out the size of the vocab
# Number of distinct tokens in the tokenizer's word index at this point
# (the padding token is only registered in a later cell).

size_of_vocab = len(tokenizer_on_words.word_index)
print("Size of Word Index (on words):", size_of_vocab)


Size of Word Index (on words): 45626


### 3. Prepare Sequences to Train Model
#### Fixed n-gram based sequences

In [9]:
def generate_ngram_training_samples(encoded: list, ngram: int) -> list:
    '''
    Slide a window of `ngram` tokens over every encoded sentence and collect
    each window as one training sample.

    Parameters:
        encoded (list): list of encoded sentences, each a list of integer token indices
        ngram (int): the ngram model you are training (window length)

    Returns:
        training_samples (list): list of lists in the format [[x1, x2, ... , x(n-1), y], ...];
            sentences shorter than `ngram` contribute no samples
    '''
    return [
        sentence[start:start + ngram]
        for sentence in encoded
        for start in range(len(sentence) - ngram + 1)
    ]


In [10]:
# Build every n-gram window from the encoded sentences
word_training_samples = generate_ngram_training_samples(encoded_on_words, NGRAM)

# Preview the first few windows, then report how many there are in total
first_samples = word_training_samples[:5]
print("Word Training Samples (first 5):", first_samples, sep="\n")

print("\nNum Word Training Samples:", len(word_training_samples))


Word Training Samples (first 5):
[[1, 1, 21], [1, 21, 61], [21, 61, 58], [61, 58, 8594], [58, 8594, 266]]

Num Word Training Samples: 3802291


### 4. Create Word Embedding Vectors based on dataset

In [11]:
# Use gensim.Word2Vec to create embeddings
# `path` doubles as a switch: falsy -> train new embeddings, otherwise load from that file.
path = False
# path = 'two_sentence_horror_reddit_word_embeddings.txt'

word_embeddings: Word2Vec
if path:
    # NOTE(review): Word2Vec.load reads models written by Word2Vec.save, while the
    # commented-out path above is the file written with save_word2vec_format -
    # confirm the load branch matches the file format before enabling it.
    word_embeddings = Word2Vec.load(path)
else:
    # Skip-gram (sg=1) vectors trained on every tokenized title and selftext
    word_embeddings = Word2Vec(
        sentences=tokenized_data,
        vector_size=EMBEDDINGS_SIZE,
        window=5,
        min_count=1,
        sg=1,
    )


In [12]:
# Save the embeddings
# Writes the word vectors in the plain-text word2vec format (binary=False)
word_embeddings.wv.save_word2vec_format('two_sentence_horror_reddit_word_embeddings.txt', binary=False)


### 5. Split Sequences into X and y, then create data generator

In [13]:
# Every training sample has the form [x1, x2, ... , x(n-1), y].
# Split the samples into the context windows [[x1, x2, ... , x(n-1)], ...]
# and the matching targets [y1, y2, ...].

seq_on_words_x = []
seq_on_words_y = []
for training_seq in word_training_samples:
    seq_on_words_x.append(training_seq[:-1])
    seq_on_words_y.append(training_seq[-1])

# Both lists must have one entry per training sample
print("Shape of Word X:", len(seq_on_words_x))
print("Shape of Word Y:", len(seq_on_words_y))


Shape of Word X: 3802291
Shape of Word Y: 3802291


In [14]:
def read_embeddings(embeddings: "Word2Vec | KeyedVectors | str", tokenizer: "Tokenizer") -> "tuple[dict, dict]":
    '''
    Builds word -> vector and index -> vector lookups for the tokenizer's vocab.

    Parameters:
        embeddings (Word2Vec | KeyedVectors | str): a trained Word2Vec model, an
            already loaded KeyedVectors, or the path of a text-format word2vec
            file (as written by save_word2vec_format with binary=False)
        tokenizer (Tokenizer): tokenizer used to tokenize the data (needed to
            get the word to index mapping)

    Returns:
        word_to_embedding (dict): mapping from word to its embedding vector
        index_to_embedding (dict): mapping from index to its embedding vector

    Raises:
        KeyError: if a tokenizer word (other than the one at index 0) has no
            vector in the embeddings

    Both mappings also receive an all-zero vector for padding, under the key
    '<PAD>' and the index 0 respectively.
    '''
    # Decide what to load from the argument itself rather than from a notebook
    # global, so the function behaves the same regardless of outside state.
    if isinstance(embeddings, str):
        embeddings = KeyedVectors.load_word2vec_format(embeddings, binary=False)

    # A Word2Vec model keeps its vectors under .wv; a KeyedVectors object is
    # the lookup table itself.
    vectors = getattr(embeddings, 'wv', embeddings)

    word_to_embedding = {}
    index_to_embedding = {}
    for word, index in tokenizer.word_index.items():
        # Index 0 is reserved for padding and has no trained vector; skipping
        # it keeps this function usable after the padding token was registered
        # in the tokenizer.
        if index == 0:
            continue
        vector = vectors[word]
        word_to_embedding[word] = vector
        index_to_embedding[index] = vector

    # Add zero vectors for the padding token, sized from the embeddings themselves
    word_to_embedding['<PAD>'] = np.zeros(vectors.vector_size)
    index_to_embedding[0] = np.zeros(vectors.vector_size)

    return word_to_embedding, index_to_embedding


In [15]:
# Build the word -> vector and index -> vector lookups for the word-level tokenizer
word_to_embedding_on_word, index_to_embedding_on_word = read_embeddings(word_embeddings, tokenizer_on_words)

# Number of entries in the word lookup (includes the '<PAD>' entry added by read_embeddings)
print(len(word_to_embedding_on_word))


45627


In [16]:
# the "0" index of the Tokenizer is assigned for the padding token.
# We init the vector for padding token as all zeros of embedding size

# Modify tokenizer to include padding token
padding_token = '<PAD>'
tokenizer_on_words.word_index[padding_token] = 0
tokenizer_on_words.index_word[0] = padding_token

# Vocab size now includes the padding token. It is recomputed from the
# tokenizer rather than incremented, so re-running this cell cannot inflate
# the value (it sizes the model's output layer).
size_of_vocab = len(tokenizer_on_words.word_index)


In [17]:
# Persist the fitted tokenizer next to the model artefacts.
# NOTE(review): the to_json() result is passed through json.dumps once more
# before writing; if to_json() already returns a JSON string, the file holds a
# JSON-encoded string and a loader must json.load it first - confirm against
# whatever reads this file.
tokenizer_json = tokenizer_on_words.to_json()
serialized_tokenizer = json.dumps(tokenizer_json, ensure_ascii=False)
with io.open('tokenizer_keras.json', mode='w', encoding='utf-8') as f:
    f.write(serialized_tokenizer)


In [18]:
def data_generator(X: list, y: list, num_sequences_per_batch: int, index_2_embedding: dict) -> (list,list):
    '''
    Endless batch generator for training the feed-forward language model.
    https://wiki.python.org/moin/Generators
    https://realpython.com/introduction-to-python-generators/

    Walks over the samples in order, one batch at a time, and starts again
    from the first sample once the end is reached (Keras keeps pulling batches
    across epochs). The last batch of a pass may be smaller than the others.

    Parameters:
        X (list): list of lists in the format [[x1, x2, ... , x(n-1)], ...]
        y (list): list of labels in the format [y1, y2, ...]
        num_sequences_per_batch (int): number of sequences per batch
        index_2_embedding (dict): mapping from index to its embedding vector

    Yields:
        X_batch_embeddings (np.ndarray): one row per sequence, holding the
            embeddings of its tokens concatenated into a single flat vector
        y_batch_onehot (np.ndarray): one-hot encoded labels (see to_categorical),
            with one class per entry of index_2_embedding
    '''
    total_samples = len(y)  # X and y should be the same length
    vocab_classes = len(index_2_embedding)  # one class per known index

    while True:  # never exhausts: restart from the top after each full pass
        for batch_start in range(0, total_samples, num_sequences_per_batch):
            # Clamp so the final batch stops at the last sample
            batch_stop = min(batch_start + num_sequences_per_batch, total_samples)

            # Look up each token's vector and flatten every sequence into one row
            flat_embeddings = [
                np.array([index_2_embedding[token] for token in seq]).flatten()
                for seq in X[batch_start:batch_stop]
            ]

            # One-hot encode the target indices of this batch
            onehot_labels = to_categorical(y[batch_start:batch_stop], num_classes=vocab_classes)

            yield np.array(flat_embeddings), np.array(onehot_labels)


In [19]:
# Init data generator

num_sequences_per_batch = 128 # this is the batch size
steps_per_epoch = len(seq_on_words_x) // num_sequences_per_batch  # Number of full batches per epoch (floor division ignores a final partial batch)
train_generator_on_word = data_generator(seq_on_words_x, seq_on_words_y, num_sequences_per_batch, index_to_embedding_on_word)

sample = next(train_generator_on_word) # pull one (X, y) batch out of the generator
print("Word X Shape:", sample[0].shape) # (batch_size, (n-1) * EMBEDDINGS_SIZE) -> (128, 200)
print("Word Y Shape:", sample[1].shape) # (batch_size, |V|) one-hot labels from to_categorical


# Expected dimensions with NGRAM = 3 and EMBEDDINGS_SIZE = 100:

# Word X Shape: (128, 200)
# Word Y Shape: (128, 45627) (vocab + 1 for padding)


Word X Shape: (128, 200)
Word Y Shape: (128, 45627)


### 6. Train and Save the Model

In [20]:
def create_feedforward_neural_lm(input_shape: int, vocab_size: int) -> Sequential:
    """
    Build and compile the feed-forward neural language model.

    Architecture: one 128-unit ReLU hidden layer, dropout of 0.5, and a softmax
    output layer with one unit per vocabulary entry.

    Args:
        input_shape (int): length of one flattened input row ((n-1) * EMBEDDING_SIZE)
        vocab_size (int): size of vocab (number of output classes)

    Returns:
        model (Sequential): compiled Keras Sequential model
    """
    layer_stack = [
        # Hidden layer(s)
        Dense(128, input_shape=(input_shape,), activation='relu'),
        Dropout(0.5),
        # Output layer: probability distribution over the vocabulary
        Dense(vocab_size, activation='softmax'),
    ]
    model = Sequential(layer_stack)

    # Targets are one-hot vectors, hence categorical cross-entropy
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model


In [21]:
# Callbacks
# The model is fitted without validation data, so 'val_loss' is never reported
# to the callbacks. Monitoring the training 'loss' instead lets the checkpoint
# actually save the best model and lets early stopping see a metric.
checkpoint = ModelCheckpoint('basic_keras_model.h5', save_best_only=True, monitor='loss', mode='min')
early_stopping = EarlyStopping(monitor='loss', patience=5)
tensorboard = TensorBoard(log_dir='./logs')


In [22]:
# Train Word Model

# Fresh generator so training starts from the first batch
train_generator_on_word = data_generator(seq_on_words_x, seq_on_words_y, num_sequences_per_batch, index_to_embedding_on_word)

# Input row = (NGRAM - 1) context embeddings laid end to end
word_model = create_feedforward_neural_lm(
    input_shape=(NGRAM-1) * EMBEDDINGS_SIZE,
    vocab_size=size_of_vocab,
)

batches_per_epoch = len(seq_on_words_x) // num_sequences_per_batch
training_callbacks = [checkpoint, early_stopping, tensorboard]

word_model.fit(
    x=train_generator_on_word,
    steps_per_epoch=batches_per_epoch,
    epochs=1,
    callbacks=training_callbacks,
)




<keras.src.callbacks.History at 0x78960ce8b970>

In [23]:
# Evaluate Model
# NOTE: this scores the model on the same sequences it was trained on, so the
# numbers describe fit to the training data, not generalisation.

# Use a fresh generator so evaluation starts from the first batch instead of
# wherever training left the shared generator.
eval_generator_on_word = data_generator(seq_on_words_x, seq_on_words_y, num_sequences_per_batch, index_to_embedding_on_word)

# evaluate() returns the loss followed by the metrics the model was compiled
# with, i.e. [loss, accuracy]
accuracy_word = word_model.evaluate(x=eval_generator_on_word, steps=len(seq_on_words_x) // num_sequences_per_batch)
loss_value, accuracy_value = accuracy_word

print("Word Model - Loss:", loss_value)
print("Word Model - Accuracy:", accuracy_value)

# Recorded run (1 epoch): loss 4.7697, accuracy 24.16%

Word Model - Accuracy: [4.769742965698242, 0.24160774052143097]


In [24]:
# Save Model
# Stores the trained word model in the .keras format under a name that records the 1-epoch run

word_model.save('two_sentence_horror_seq_keras_1_epoch.keras')


In [25]:
# Print Model Summaries
# Layer-by-layer overview (output shapes and parameter counts) of the word model

print("Word Model Summary:")
word_model.summary()


Word Model Summary:
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               25728     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 45627)             5885883   
                                                                 
Total params: 5911611 (22.55 MB)
Trainable params: 5911611 (22.55 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### 7. Generate Sentences

In [29]:
def generate_seq(model: "Sequential", tokenizer: "Tokenizer", seed: str, max_length: int,
                 ngram: int = None, index_to_embedding: dict = None) -> str:
    '''
    Generate a sequence from the model until you get an end of sentence token
    or max_length tokens have been generated.

    Parameters:
        model: your neural network
        tokenizer: the keras preprocessing tokenizer
        seed: a string that serves as the starting point of the sequence
        max_length: maximum amount of tokens to generate
        ngram: n-gram size the model was trained with; defaults to the
            notebook-level NGRAM
        index_to_embedding: mapping from token index to embedding vector;
            defaults to the notebook-level index_to_embedding_on_word

    Returns: string sentence (the seed words followed by the generated tokens,
        joined by single spaces; the end of sentence token is not included)
    '''
    # Fall back to the notebook globals only when the caller did not pass values
    if ngram is None:
        ngram = NGRAM
    if index_to_embedding is None:
        index_to_embedding = index_to_embedding_on_word

    context_size = ngram - 1
    eos_idx = tokenizer.word_index['</s>']
    bos_idx = tokenizer.word_index['<s>']

    # Convert string seed into a list of words
    # NOTE(review): the seed is split on whitespace, whereas the training text
    # was lowercased and run through word_tokenize; seed words the tokenizer
    # does not know yield no index and are replaced by <s> padding below -
    # confirm this is intended.
    sentence = seed.split()

    for _ in range(max_length):
        # Convert the most recent context words into indices
        input_tokens = sentence[-context_size:]
        encoded_indices = tokenizer.texts_to_sequences([input_tokens])[0]

        # Left-pad with the start-of-sentence index when fewer than
        # context_size indices are available
        missing = context_size - len(encoded_indices)
        encoded_indices = [bos_idx] * missing + list(encoded_indices)

        # Convert indices to embeddings, then flatten into 1D array, then reshape to 2D array to fit model
        # Model expects: 2D array of shape (1, (n-1) * EMBEDDINGS_SIZE)
        encoded_embeddings = np.array([index_to_embedding[idx] for idx in encoded_indices]).flatten().reshape(1, -1)

        # Predict next token; verbose=0 suppresses the per-call progress output
        prediction = model.predict(encoded_embeddings, verbose=0)

        # Renormalise in float64 so the probabilities sum to 1 for the sampler
        probabilities = np.asarray(prediction[0], dtype=np.float64)
        probabilities = probabilities / probabilities.sum()
        predicted_idx = int(np.random.choice(len(probabilities), p=probabilities))  # generate random sentence

        # If next token is end of sentence, return sentence
        if predicted_idx == eos_idx:
            return ' '.join(sentence)

        sentence.append(tokenizer.index_word[predicted_idx])

    return ' '.join(sentence)


In [35]:
# Generate Text - Interactive
# Each seed is the first sentence of a story; the model continues it.

input_sentences = [
    'I got out of bed this morning.',
    'I was horrified when I get my test results back.',
    'My parents told me not to go upstairs.',
    'There was a ghost.',
]
max_length = 99999 # "no max length" - generate until </s> token

generated_stories = [
    generate_seq(word_model, tokenizer_on_words, input_seq, max_length)
    for input_seq in input_sentences
]




In [36]:
# Show every generated story, each followed by two blank lines
for gen_sentence in generated_stories:
    print(gen_sentence, '\n', sep='\n')

I got out of bed this morning. look on him tied by the plan , they were those “ didn ’ t notice to their medicine , i can finally make me more my words now standing on his alarm father .


I was horrified when I get my test results back. , they were starting to harvest , having stopped walking here .


My parents told me not to go upstairs. through screams and suffering , was that when will speak to worry .


There was a ghost. before ten , as he who would send the power is true dead .


