In [1]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

# keras module for building LSTM 
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import keras
import tensorflow as tf
if IN_COLAB:
  !pip install Keras-Preprocessing
from keras_preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np
import string, os 

2023-04-09 13:58:14.664755: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Text Generation with LSTM

In [2]:
# Get Data
train_text_file = keras.utils.get_file('train_text.txt', 'https://jrssbcrsefilesnait.blob.core.windows.net/3950data1/reddit_wsb.csv')
train_text = pd.read_csv(train_text_file)
train_text.sample(10)

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
9924,YOU ONLY LOSE IF YOU SELL AT A LOSS ---- HOLD,1,l71729,https://www.reddit.com/r/wallstreetbets/commen...,8,1611880000.0,,2021-01-29 02:22:31
30936,GME long!,8,ldkrpa,https://www.reddit.com/r/wallstreetbets/commen...,7,1612598000.0,GME long is the way! I’m not pushing anyone to...,2021-02-06 09:48:56
27147,It was a win.,10,ld042l,https://www.reddit.com/r/wallstreetbets/commen...,1,1612531000.0,Lots of doom and fingers in butts the last cou...,2021-02-05 15:13:17
31508,We can get back on the hedge funds by uniting ...,0,ldu6am,https://www.reddit.com/r/wallstreetbets/commen...,45,1612634000.0,*SILVER**,2021-02-06 19:51:30
10100,Never invested before,1,l7153e,https://www.reddit.com/r/wallstreetbets/commen...,9,1611880000.0,I’m in the US. How do I buy amc stock? Now?,2021-01-29 02:20:35
34769,To all the boys holding GME. This video remind...,13,lnwhcg,https://youtu.be/6o148ck5OdQ,13,1613815000.0,,2021-02-20 12:00:53
29788,Lets assume there is a second squeeze,20,ldg6u4,https://www.reddit.com/r/wallstreetbets/commen...,29,1612585000.0,Okay assuming there are still enough shorts ar...,2021-02-06 06:16:27
50246,$BCRX Fundamentally sound with the potential f...,0,o6rbgf,https://www.reddit.com/r/wallstreetbets/commen...,24,1624529000.0,Hello everyone today I am going to try and con...,2021-06-24 13:07:46
1142,Palantir Technologies and Rio Tinto Sign Multi...,4,l6wugu,https://www.businesswire.com/news/home/2021012...,1,1611870000.0,,2021-01-28 23:40:59
52789,It just makes SENS...,87,p01sg9,https://www.reddit.com/r/wallstreetbets/commen...,61,1628372000.0,"Alright apes, so with FDA approval likely upo...",2021-08-08 00:29:01


### Clean Text

We can clean and prep our text here. The data cleanup we need is to:
<ul>
<li> Remove punctuation.
<li> Tokenize the text, as we did previously in NLP processing. 
<li> <b>Generate sequences of tokens.</b> This is the key to the LSTM model, we are structuring the data to be a sequence of tokens. Our model will attempt to predict the next token, which in this case is the next word in the sentence.
</ul>

In [3]:
TOKENS = 1000
OUTPUT_LENGTH = 25

In [4]:
def get_sequence_of_tokens(corpus, tokenizer):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

strip_punct = train_text["body"].dropna().str.replace('[{}]'.format(string.punctuation), '')
inp_seq, total_words = get_sequence_of_tokens(strip_punct, Tokenizer())
inp_seq

  strip_punct = train_text["body"].dropna().str.replace('[{}]'.format(string.punctuation), '')


KeyboardInterrupt: 

#### Dataset Prep - Padding and Targets

We also need to take the sequences and pad them, or make them all the same length. We will also create the targets - the next word in the sequence.

In [None]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_seq)

: 

: 

### Model

Now we model. The data that we made mirrors the construction of a sentence.
<ul>
<li> X features - the sentence up to this point. 
<li> Y target - the word(s) that should come next. 
</ul>

So, the model is effectively working to generate text just like a time series model works to predict the next value in a sequence of stock prices or hourly temperature. We train the model on, hopefully a large number of senteneces, where is sees many examples of "here are some words" (X values) and "here is the next word" (Y value). If we give it lots and lots of that training data, it should become better and better at determining what should come next, given the existing sentence. 

To do this well, we'd need a lot more data than we have, and much more time to train. We'd want to give the model enough data so that it can see lots and lots of examples of the same word in different contexts, and of similar contexts with different words. The patterns of language are really complex, so we need data that provides enough variation to demonstrate the patterns. 

The model is wrapped in a little function, so we can make a model to output a different number of words with more convenience.

#### Embedding Layer

We also use an embedding layer here, which accepts our enocoded inputs. 

In [None]:
def create_model(max_sequence_len, output_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layers - LSTM Layer
    model.add(LSTM(100, return_sequences = True))
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, OUTPUT_LENGTH)
model.summary()

In [None]:
# Train Model
model.fit(predictors, label, epochs=100, verbose=5)

### Predictions

We can create a little function to generate text. We can give it a seed text, and it will generate text based on that. We can also give it a number of words to generate, and it will generate that many words.

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [None]:


print (generate_text("united states", 5, model, max_sequence_len))
print (generate_text("preident trump", 4, model, max_sequence_len))
print (generate_text("donald trump", 4, model, max_sequence_len))
print (generate_text("india and china", 4, model, max_sequence_len))
print (generate_text("new york", 4, model, max_sequence_len))
print (generate_text("science and technology", 5, model, max_sequence_len))

