In [None]:
import pandas as pd
from google.colab import drive
# Mount Google Drive at /content/gdrive; the CSV read and the model save
# later in this notebook both use paths under that mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from keras.preprocessing.text import Tokenizer
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import keras.utils as np_utils
from keras.models import Sequential

from keras.layers import Activation, Flatten, Embedding, LSTM, Dense, Dropout, TimeDistributed, Bidirectional
#from keras.layers import CuDNNLSTM

import tensorflow
tensorflow.random.set_seed(2)
from numpy.random import seed
seed(1)
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Read a 2,000-row window of the CSV: skip the first 10,000 lines, treat no
# line as a header row, and decode the bytes as Latin-1.
data = pd.read_csv(
    "/content/gdrive/My Drive/Showerthoughts_200k.csv",
    header=None,
    encoding="latin1",
    skiprows=10000,
    nrows=2000,
)
# Name the parsed column so later cells can refer to data['title'].
data.columns = ["title"]

In [None]:
# NOTE: a negative n makes head() return every row EXCEPT the last 10
# (not the first 10), which is why the preview below runs up to row 1988.
data.head(-10)

Unnamed: 0,title
0,"The better you are at golf, the less time you ..."
1,Celebrities who wear parody MAGA gear are real...
2,"Unless youâre deaf, youâll never experienc..."
3,âplease leave a messageâ girl on everyoneâ...
4,Vegans are just simping for mother earth
...,...
1985,If somebody does something morally bad/wrong y...
1986,I love how on the pink guy song sheâs so nic...
1987,Fake balconies are a waste of building material
1988,"We spend money to make money, so we can spend ..."


In [None]:
import string

In [None]:
"""def clean_tweet(data):
  data = data.replace('--', ' ')
  tokens = data.split()
  table = str.maketrans('','', string.punctuation)
  tokens = [w.translate(table) for w in tokens]
  tokens = [word for word in tokens if word.isalpha()]
  tokens = [word.lower() for word in tokens]
  return tokens
corpus = [clean_tweet(x) for x in data['title']] """

"def clean_tweet(data):\n  data = data.replace('--', ' ')\n  tokens = data.split()\n  table = str.maketrans('','', string.punctuation)\n  tokens = [w.translate(table) for w in tokens]\n  tokens = [word for word in tokens if word.isalpha()]\n  tokens = [word.lower() for word in tokens]\n  return tokens\ncorpus = [clean_tweet(x) for x in data['title']] "

In [None]:
def clean_text(txt):
    """Normalise one title: strip punctuation, lowercase, drop non-ASCII.

    Every character found in ``string.punctuation`` is removed, the
    remainder is lowercased, and finally anything that cannot be
    represented in ASCII is discarded.
    """
    kept_chars = [ch for ch in txt if ch not in string.punctuation]
    lowered = "".join(kept_chars).lower()
    # Round-trip through UTF-8 bytes: decoding them as ASCII with 'ignore'
    # silently drops every byte that belongs to a non-ASCII character.
    ascii_only = lowered.encode("utf8").decode("ascii", 'ignore')
    return ascii_only

# Clean every title once and keep the results as a plain Python list,
# then preview the first five cleaned strings.
corpus = list(map(clean_text, data['title']))
corpus[:5]

['the better you are at golf the less time you spend playing it',
 'celebrities who wear parody maga gear are really setting themselves up for an easy photoshop opportunity',
 'unless youre deaf youll never experience true silence',
 'please leave a message girl on everyones voicemail is pretty much family at this point',
 'vegans are just simping for mother earth']

In [None]:
# Number of cleaned titles; one entry per row of `data`.
len(corpus)

2000

In [None]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the shared tokenizer on ``corpus`` and expand it into n-gram prefixes.

    A line encoded as ``[t0, t1, ..., tk]`` contributes ``[t0, t1]``,
    ``[t0, t1, t2]``, ... up to the full line, so a line with fewer than
    two tokens contributes nothing.

    Returns:
        ``(input_sequences, total_words)`` where ``total_words`` is the
        size of the fitted word index plus one.
    """
    # Side effect: the module-level tokenizer learns its vocabulary here.
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    input_sequences = []
    for line in corpus:
        encoded = tokenizer.texts_to_sequences([line])[0]
        # Every prefix of length >= 2 becomes one training sequence.
        input_sequences.extend(
            encoded[:end] for end in range(2, len(encoded) + 1)
        )
    return input_sequences, total_words

# Build the n-gram training sequences and the vocabulary size (this call
# also fits the module-level tokenizer), then preview the first ten.
inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[1, 126],
 [1, 126, 6],
 [1, 126, 6, 9],
 [1, 126, 6, 9, 39],
 [1, 126, 6, 9, 39, 416],
 [1, 126, 6, 9, 39, 416, 1],
 [1, 126, 6, 9, 39, 416, 1, 127],
 [1, 126, 6, 9, 39, 416, 1, 127, 40],
 [1, 126, 6, 9, 39, 416, 1, 127, 40, 6],
 [1, 126, 6, 9, 39, 416, 1, 127, 40, 6, 355]]

In [None]:
def generate_padded_sequences(input_sequences):
    """Pad the n-gram sequences and split them into inputs and targets.

    Each sequence is padded at the front to the length of the longest
    one.  The final token of every padded row becomes its label and the
    tokens before it become the predictor row.  Labels are one-hot
    encoded with ``total_words`` classes.

    Returns:
        ``(predictors, label, max_sequence_len)``
    """
    max_sequence_len = max(len(seq) for seq in input_sequences)
    padded = np.array(
        pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    )

    predictors = padded[:, :-1]
    last_tokens = padded[:, -1]
    # NOTE: total_words is read from the notebook's global scope, not passed in.
    label = np_utils.to_categorical(last_tokens, num_classes=total_words)
    return predictors, label, max_sequence_len

# Turn the n-gram sequences into padded inputs, one-hot labels, and the
# padded sequence length that the model and text generator both need.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [None]:
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    Architecture: an Embedding of ``total_words`` entries into 10
    dimensions over inputs of length ``max_sequence_len - 1``, followed by
    LSTM(100), Dropout(0.1) and a softmax Dense layer with ``total_words``
    units.  Compiled with categorical cross-entropy and the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model.
    """
    # The last token of every padded sequence is held out as the label.
    input_len = max_sequence_len - 1

    # (layer class, positional args, keyword args), in stacking order.
    layer_specs = (
        (Embedding, (total_words, 10), {'input_length': input_len}),
        (LSTM, (100,), {}),
        (Dropout, (0.1,), {}),
        (Dense, (total_words,), {'activation': 'softmax'}),
    )

    model = Sequential()
    for layer_cls, args, kwargs in layer_specs:
        # Construct each layer immediately before adding it.
        model.add(layer_cls(*args, **kwargs))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Instantiate the network for this corpus' sequence length and vocabulary
# size, then print the layer-by-layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 54, 10)            54290     
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 5429)              548329    
Total params: 647,019
Trainable params: 647,019
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 30 epochs on all n-gram samples; verbose=2 logs one line per
# epoch. No validation data is passed, so only training loss is reported.
history = model.fit(predictors, label, epochs=30, verbose=2)

Epoch 1/30
918/918 - 5s - loss: 7.0360
Epoch 2/30
918/918 - 5s - loss: 6.7110
Epoch 3/30
918/918 - 5s - loss: 6.5143
Epoch 4/30
918/918 - 5s - loss: 6.3426
Epoch 5/30
918/918 - 5s - loss: 6.1775
Epoch 6/30
918/918 - 5s - loss: 6.0038
Epoch 7/30
918/918 - 5s - loss: 5.8164
Epoch 8/30
918/918 - 5s - loss: 5.6114
Epoch 9/30
918/918 - 5s - loss: 5.4164
Epoch 10/30
918/918 - 5s - loss: 5.2333
Epoch 11/30
918/918 - 5s - loss: 5.0556
Epoch 12/30
918/918 - 5s - loss: 4.8836
Epoch 13/30
918/918 - 5s - loss: 4.7141
Epoch 14/30
918/918 - 5s - loss: 4.5555
Epoch 15/30
918/918 - 5s - loss: 4.3983
Epoch 16/30
918/918 - 5s - loss: 4.2510
Epoch 17/30
918/918 - 5s - loss: 4.1080
Epoch 18/30
918/918 - 5s - loss: 3.9727
Epoch 19/30
918/918 - 5s - loss: 3.8489
Epoch 20/30
918/918 - 5s - loss: 3.7160
Epoch 21/30
918/918 - 5s - loss: 3.6123
Epoch 22/30
918/918 - 5s - loss: 3.5016
Epoch 23/30
918/918 - 5s - loss: 3.4034
Epoch 24/30
918/918 - 5s - loss: 3.3053
Epoch 25/30
918/918 - 5s - loss: 3.2253
Epoch 26/

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend ``seed_text`` by ``next_words`` predicted words.

    On each step the current text is tokenised with the module-level
    ``tokenizer``, front-padded to ``max_sequence_len - 1`` tokens, and fed
    to ``model``; the highest-probability class is mapped back to a word
    and appended.

    Args:
        seed_text: Starting text to continue.
        next_words: Number of words to append.
        model: Trained model whose ``predict`` returns per-class scores.
        max_sequence_len: Padded sequence length used during training
            (the model input is one token shorter).

    Returns:
        The extended text, title-cased.
    """
    # Invert the vocabulary once instead of scanning word_index on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # Sequential.predict_classes() has been removed from current
        # TensorFlow/Keras; taking the argmax over the predicted class
        # scores gives the same class index and works on old versions too.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # An index with no vocabulary entry (e.g. 0) yields an empty word,
        # matching the behaviour of the earlier linear search.
        output_word = index_to_word.get(predicted, "")
        seed_text += " "+output_word
    return seed_text.title()

In [None]:
# Sample: continue the seed "Batman" with 19 greedily predicted words.
print (generate_text("Batman", 19, model, max_sequence_len))

Batman Is The Result Of People Pooping The Same Of The Universe You Are The First Person In A Hoodie


In [None]:
# Persist the trained network to Google Drive.
# `model` is a Keras Sequential model: it has no state_dict(), and torch is
# never imported in this notebook, so it is saved with Keras' own
# Model.save().  The .h5 suffix selects the single-file HDF5 format.
# The target folder ("WisdomAI") must already exist on the mounted drive.
model_save_name = 'showerthoughts.h5'
path = F"/content/gdrive/My Drive/WisdomAI/{model_save_name}"
model.save(path)