In [1]:
%pip install tensorflow numpy pandas scikit-learn

Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/08/de/d4448c423484537ebc9373d3da2496a2e47f42ea11ff48e025cf49665471/pandas-2.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading pandas-2.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/0c/2a/d3ff6091406bc2207e0adb832ebd15e40ac685811c7e2e3b432bfd969b71/scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting pytz>=2020.1 (from pandas)
  Obtaining dependency information for pytz>=2020.1 from https://files.pythonhosted.org/packages/32/4d/aaf7eff5deb402fd9a24a1449a8119f00d74ae9c2efa79f8ef9994261fc2/pytz-2023.3.post1-py2.py3-none-any.whl.metadata
  Down

In [2]:
# Imports

# Core imports
# import tensorflow as tf
import pandas as pd
import numpy as np

# Data manipulation
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Model architecture
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, Attention
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard


2023-12-04 23:38:57.256190: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-04 23:38:57.304897: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-04 23:38:57.304939: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-04 23:38:57.304983: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-04 23:38:57.315983: I tensorflow/core/platform/cpu_feature_g

### Load, Process, and Split Data

In [3]:
# Load the data

# CSV location, relative to the notebook's working directory
file_path = 'reddit_cleansed_data.csv'

# Read the whole file into a DataFrame with pandas' default parsing options
data = pd.read_csv(filepath_or_buffer=file_path)


In [4]:
# Pre-process data

# Both text columns get the same treatment:
#   1. lowercase everything, which keeps the tokenizer vocabulary smaller,
#   2. drop double-quote characters,
#   3. wrap each entry in <BOS> ... <EOS> markers (used later during inference).
# Note: the columns of `data` are overwritten, so running this cell a second time
# wraps the already-wrapped text again.
for text_column in ('title', 'selftext'):
    normalised = data[text_column].str.lower()
    normalised = normalised.str.replace('"', '')
    data[text_column] = '<BOS> ' + normalised + ' <EOS>'

data

Unnamed: 0,title,selftext,score,num_comments,gilded_count,date,timestamp
0,<BOS> do not expose any part of your body to t...,<BOS> i repeat..this is not a drill.. <EOS>,65.0,5.0,0,1.428090e+09,2015-04-03 19:47:13
1,"<BOS> i sometimes remember the way he looked, ...",<BOS> i neglected to make sure he was dead. <EOS>,22.0,0.0,0,1.428235e+09,2015-04-05 11:55:10
2,<BOS> i live alone on the third floor of my ap...,<BOS> so who opens my window every night while...,35.0,3.0,0,1.428370e+09,2015-04-07 01:24:42
3,"<BOS> i heard the rain hitting my window, so i...","<BOS> my window wasn't wet, but the glass was ...",28.0,3.0,0,1.428385e+09,2015-04-07 05:40:55
4,<BOS> you know how sometimes your brain plays ...,<BOS> i caught one of those things today. <EOS>,84.0,6.0,0,1.428563e+09,2015-04-09 07:03:16
...,...,...,...,...,...,...,...
94081,<BOS> as i look thru at window i see something...,<BOS> my reflection helps me remember how well...,31.0,2.0,0,1.680377e+09,2023-04-01 19:21:54
94082,<BOS> i’ve always been passionate about conspi...,"<BOS> so when my wife had twins, i knew exactl...",27.0,8.0,0,1.680377e+09,2023-04-01 19:24:55
94083,"<BOS> you'll see me on the red carpet one day,...","<BOS> so i paid her a surprise visit, and upon...",23.0,2.0,0,1.680378e+09,2023-04-01 19:38:03
94084,<BOS> i could hear my sister screaming nearby ...,<BOS> but my heart sank when i remembered the ...,60.0,3.0,0,1.680378e+09,2023-04-01 19:41:01


In [5]:
# Split the data (80/20 train/test)
# A fixed random_state makes the shuffle, and therefore every downstream result,
# reproducible across kernel restarts.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


### Tokenization and Padding

In [6]:
def preprocess_sentences(tokenizer, sentences, max_length=None):
    """
    Encode sentences as integer sequences and post-pad them to one common length.

    Args:
        tokenizer: fitted tokenizer exposing ``texts_to_sequences``.
        sentences: iterable of strings to encode.
        max_length (int, optional): width to pad to. Defaults to None, in which
            case the length of the longest encoded sentence is used. Pass the
            value returned for the training split to give another split
            (e.g. test data) the same width.

    Returns:
        tuple: ``(padded, max_length)`` where ``padded`` is the array returned by
        ``pad_sequences`` and ``max_length`` is the width that was applied.
    """
    sequences = tokenizer.texts_to_sequences(sentences)
    if max_length is None:
        # default=0 keeps an empty batch from raising ValueError inside max()
        max_length = max((len(seq) for seq in sequences), default=0)
    padded = pad_sequences(sequences, maxlen=max_length, padding='post')
    return padded, max_length


In [7]:
# Tokenize data

# '<' and '>' are left out of the filter list so the <BOS>/<EOS> markers survive tokenization.
tokenizer = Tokenizer(oov_token="<OOV>", filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')

# Join title and body with a space. Without it, the title's trailing "<EOS>" and the
# body's leading "<BOS>" fuse into a single junk token "<eos><bos>" that ends up in the vocabulary.
tokenizer.fit_on_texts(data['title'] + ' ' + data['selftext'])

# +1 because index 0 is reserved for padding and never appears in word_index
vocab_size = len(tokenizer.word_index) + 1


In [8]:
# Pad data

# preprocess_sentences returns a (padded_array, max_length) pair; compute one pair
# per text column of the training split, then unpack them.
title_batch = preprocess_sentences(tokenizer, train_data['title'])
text_batch = preprocess_sentences(tokenizer, train_data['selftext'])

train_titles, title_max_length = title_batch
train_texts, text_max_length = text_batch


### Model Architecture

In [9]:
# Model parameters

# Size of the embedding vectors and number of LSTM units, respectively
embedding_dim, lstm_units = 256, 256


In [10]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(vocab_size, embedding_dim)(encoder_inputs)
# return_sequences=True makes the encoder emit one vector per source timestep.
# The attention layer below needs that (batch, timesteps, units) tensor as its value;
# with only the final state (batch, units) the attention scores would be computed
# against other examples in the batch instead of against source positions.
encoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# Final hidden and cell state, used to initialise the decoder
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
# The embedding layer object is kept so the inference decoder can reuse its weights
dec_emb_layer = Embedding(vocab_size, embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Attention Layer: decoder outputs are the query, encoder outputs the value
attention_layer = Attention()
attention_result = attention_layer([decoder_outputs, encoder_outputs])
decoder_concat_input = Concatenate(axis=-1)([decoder_outputs, attention_result])

# Dense layer: per-timestep probability distribution over the vocabulary
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
# Sparse loss: targets are integer token ids rather than one-hot vectors.
# NOTE(review): the embeddings do not mask index 0, so padded positions presumably
# count towards loss and accuracy - consider mask_zero=True after confirming.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


2023-12-04 23:39:06.124026: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1886] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 46498 MB memory:  -> device: 0, name: NVIDIA RTX 6000 Ada Generation, pci bus id: 0000:d1:00.0, compute capability: 8.9
2023-12-04 23:39:07.178335: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


### Model Training

In [11]:
# Callbacks

# Keep only the weights of the epoch with the lowest validation loss on disk
checkpoint = ModelCheckpoint('model_with_tokens.h5', save_best_only=True, monitor='val_loss', mode='min')

# A patience equal to the number of training epochs can never trigger, so use a
# smaller value. restore_best_weights rolls the in-memory model back to the best
# epoch when training is stopped early, so inference does not run on a worse one.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Write training curves for TensorBoard
tensorboard = TensorBoard(log_dir='./logs')


In [12]:
# Prepare decoder target data

# The target at step t is the decoder input token at step t + 1: drop the first
# column and append a column of zeros (the padding id) to keep the width unchanged.
# The zeros use the dtype of train_texts so the token ids are not silently upcast
# to float64 (np.zeros' default), which would also double the array's memory use.
pad_column = np.zeros((len(train_texts), 1), dtype=train_texts.dtype)
train_texts_shifted = np.hstack([train_texts[:, 1:], pad_column])

# Add a trailing axis: (samples, timesteps) -> (samples, timesteps, 1)
train_texts_shifted = np.expand_dims(train_texts_shifted, -1)


In [13]:
# Train the model

# Encoder input is the title batch, decoder input is the text batch; the target is
# the shifted text batch. The last 20% of the samples are held out for validation.
training_inputs = [train_titles, train_texts]
training_callbacks = [checkpoint, early_stopping, tensorboard]

model.fit(
    x=training_inputs,
    y=train_texts_shifted,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
    callbacks=training_callbacks,
)


Epoch 1/5


2023-12-04 23:39:11.260769: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA RTX 6000 Ada Generation" frequency: 2505 num_cores: 142 environment { key: "architecture" value: "8.9" } environment { key: "cuda" value: "11080" } environment { key: "cudnn" value: "8600" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 100663296 shared_memory_size_per_multiprocessor: 102400 memory_size: 48757080064 bandwidth: 960096000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
2023-12-04 23:39:13.983942: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8905
2023-12-04 23:39:15.625434: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fbf2c333da0 initialized for platform CUDA (this does not guarantee that XLA will be us



2023-12-04 23:40:25.172518: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA RTX 6000 Ada Generation" frequency: 2505 num_cores: 142 environment { key: "architecture" value: "8.9" } environment { key: "cuda" value: "11080" } environment { key: "cudnn" value: "8600" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 100663296 shared_memory_size_per_multiprocessor: 102400 memory_size: 48757080064 bandwidth: 960096000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
  saving_api.save_model(


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7fc33854af50>

### Inference (Generate Sentences)
We need to extract the encoder and decoder models from the trained model based on the architecture defined above.

In [14]:
# Extract encoder from Seq2Seq for inference

# Reuses the trained encoder graph: maps an input sequence to the encoder output
# tensor followed by the entries of encoder_states.
encoder_model = Model(
    inputs=encoder_inputs,
    outputs=[encoder_outputs, *encoder_states],
)


In [15]:
# Extract decoder from Seq2Seq for inference
# The decoder state is supplied from outside so generation can run one step at a time.

# Placeholders for the hidden (h) and cell (c) state fed into the decoder LSTM
state_shape = (lstm_units,)
decoder_state_input_h = Input(shape=state_shape)
decoder_state_input_c = Input(shape=state_shape)
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained embedding and LSTM layers, starting from the supplied states
dec_emb2 = dec_emb_layer(decoder_inputs)
lstm_step = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_outputs2, state_h2, state_c2 = lstm_step

# Attention inference: the encoder outputs arrive through their own placeholder,
# declared with per-example shape (None, lstm_units)
encoder_outputs_input = Input(shape=(None, lstm_units))
attention_result_inf = attention_layer([decoder_outputs2, encoder_outputs_input])
merge_layer = Concatenate(axis=-1)
decoder_inf_concat = merge_layer([decoder_outputs2, attention_result_inf])

# Trained dense layer turns the concatenated features into vocabulary probabilities
decoder_outputs2 = decoder_dense(decoder_inf_concat)

# Decoder model: (tokens, h, c, encoder outputs) -> (probabilities, new h, new c)
decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs, encoder_outputs_input],
    outputs=[decoder_outputs2, state_h2, state_c2],
)


In [16]:
# Show only the first 20 vocabulary entries; printing the entire word index
# floods the notebook output with the whole vocabulary.
list(tokenizer.word_index.items())[:20]




In [17]:
# Generation with temperature-controlled sampling (the temperature rescales the predicted probabilities before sampling)

def sample(preds, temperature=1.0):
    """
    Helper function to sample an index from a probability array with a specified temp.

    Args:
        preds (array-like): 1-D sequence of probabilities.
        temperature (float, optional): Controls the randomness of the output.
            Values below 1 sharpen the distribution, values above 1 flatten it.
            A value of 0 or less selects the most probable index deterministically.
            Defaults to 1.0.

    Returns:
        int: the sampled index into ``preds``.
    """
    preds = np.asarray(preds, dtype='float64')

    if temperature <= 0:
        # Greedy limit: dividing by zero below would produce NaN probabilities
        return int(np.argmax(preds))

    # Rescale in log space; the small epsilon avoids log(0)
    logits = np.log(preds + 1e-7) / temperature
    # Subtracting the maximum leaves the resulting distribution unchanged but keeps
    # exp() from underflowing every entry to 0 at low temperatures (which gave 0/0).
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)

    # One multinomial draw yields a one-hot vector; its argmax is the sampled index
    probas = np.random.multinomial(1, probs, 1)
    return int(np.argmax(probas))

def generate_sentence(input_text, temperature=1.0):
    """
    Generate a sentence from the input text using the encoder and decoder models.

    Args:
        input_text (str): input sentence (first sentence). It is wrapped in
            ``<BOS> ... <EOS>`` markers unless it already starts with ``<BOS>``,
            so the encoder sees the same format as the training titles.
        temperature (float, optional): Controls the randomness of the output. Defaults to 1.0.

    Returns:
        str: the sampled words joined by single spaces, without the ``<BOS>``/``<EOS>``
        markers. Generation stops at ``<eos>`` or after ``text_max_length`` words.
    """
    # The training titles were wrapped in <BOS>/<EOS> before tokenization; do the
    # same for the prompt so inference input matches the training input.
    prompt = input_text.strip()
    if not prompt.lower().startswith('<bos>'):
        prompt = '<BOS> ' + prompt + ' <EOS>'

    input_seq = tokenizer.texts_to_sequences([prompt])
    input_seq = pad_sequences(input_seq, maxlen=title_max_length, padding='post')

    # Get the encoder outputs and states (verbose=0 suppresses the progress bar)
    encoder_output, state_h, state_c = encoder_model.predict(input_seq, verbose=0)

    # Start with the <BOS> token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['<bos>']

    decoded_words = []

    # Cap the output at text_max_length words in case <eos> is never sampled
    while len(decoded_words) < text_max_length:
        # verbose=0: otherwise predict() prints one progress bar per generated token
        output_tokens, state_h, state_c = decoder_model.predict(
            [target_seq, state_h, state_c, encoder_output], verbose=0)

        # Use temperature-based sampling to choose the next word
        sampled_token_index = sample(output_tokens[0, -1, :], temperature)

        # Indices missing from the vocabulary (e.g. the padding id 0) are shown as 'UNK'
        sampled_word = tokenizer.index_word.get(sampled_token_index, 'UNK')

        if sampled_word == '<eos>':
            break
        decoded_words.append(sampled_word)

        # Feed the sampled token back in as the next decoder input
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return ' '.join(decoded_words)


In [24]:
# Generate stories

# Opening sentences used as prompts
input_sentences = [
    'I got out of bed this morning.',
    'I was horrified when I get my test results back.',
    'My parents told me not to go upstairs.',
    'There was a ghost.',
]

# Each story is the prompt followed by the generated continuation
generated_stories = [
    prompt + ' ' + generate_sentence(prompt)
    for prompt in input_sentences
]
    



In [25]:
# Print every story followed by two blank lines
for story in generated_stories:
    print(story, end='\n\n\n')

I got out of bed this morning. by the limb struggling that malnourished i dropped you understand that the scopes didn't stop but 2 as instead


I was horrified when I get my test results back. i begged the answer blessedly coming to eat my flesh i an' “i have realise that where not not never who ”


My parents told me not to go upstairs. and something was traffic champagne tellers and never confused crawling off the chair on hair of screams


There was a ghost. the needs hershey’s slowly blood at miles from allowed full of and screaming and won’t keep picking me and approaching the chance smiling away her window


