In [None]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from tensorflow.keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io
from keras.callbacks import ModelCheckpoint

# Load the corpus and normalise it to lower case
with io.open("positivo.csv", encoding='utf-8') as f:
    text = f.read().lower()
print('corpus length:', len(text))

# Vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
print('total chars:', len(chars))

# Lookup tables between characters and their integer positions in `chars`
char_indices = {symbol: position for position, symbol in enumerate(chars)}  # character -> index
indices_char = {position: symbol for position, symbol in enumerate(chars)}  # index -> character


corpus length: 36314
total chars: 52


In [None]:
import pickle

# Serialise the sorted character vocabulary to chars.pickle
with open('chars.pickle', 'wb') as f:
    pickle.dump(chars, f)

In [None]:
# Serialise the character -> index table to char_indices.pickle
with open('char_indices.pickle', 'wb') as f:
    pickle.dump(char_indices, f)

In [None]:
# Serialise the index -> character table to indices_char.pickle
with open('indices_char.pickle', 'wb') as f:
    pickle.dump(indices_char, f)

In [None]:
maxlen = 40  # Number of characters in each input window
step = 3  # Stride between the starts of consecutive windows
sentences = []  # Input windows, each `maxlen` characters long
next_chars = []  # The character that follows each window (the prediction target)

# Slice the corpus into overlapping windows and record the character after each one
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
# One-hot encode the windows (x) and their target characters (y).
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

# Model architecture: two stacked 128-unit LSTM layers, each followed by 20%
# dropout, then a softmax layer with one output per vocabulary character.
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(maxlen, len(chars))),
    Dropout(0.2),
    LSTM(128, return_sequences=False),
    Dropout(0.2),
    Dense(len(chars), activation='softmax'),
])
optimizer = RMSprop(learning_rate=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

def sample(preds, temperature=1.0):
    """Draw one index at random from a temperature-adjusted probability array.

    Args:
        preds: array-like of probabilities (converted to float64 internally).
        temperature: divisor applied to the log-probabilities before they are
            re-normalised; values below 1 sharpen the distribution, values
            above 1 flatten it.

    Returns:
        The index selected by a single multinomial draw.
    """
    scaled_logits = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logits)
    distribution = weights / np.sum(weights)
    # One trial, one experiment: the result is a one-hot row marking the drawn index.
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

def on_epoch_end(epoch, _):
    """Print text generated by the current model; intended as an epoch-end hook.

    A random `maxlen`-character window of the corpus is chosen once as the
    seed. For each diversity (sampling temperature) in 0.2, 0.5, 1.0 and 1.2
    the seed is echoed and 400 further characters are sampled one at a time,
    each written to stdout as soon as it is produced.

    Args:
        epoch: epoch index, used only in the printed header.
        _: second callback argument, ignored.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    # The same seed window is reused for every diversity value.
    start_index = random.randint(0, len(text) - maxlen - 1)
    seed = text[start_index: start_index + maxlen]

    for diversity in (0.2, 0.5, 1.0, 1.2):
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed + '"')
        sys.stdout.write(seed)

        window = seed
        for _step in range(400):
            # One-hot encode the current window as a batch of size one.
            encoded = np.zeros((1, maxlen, len(chars)))
            for position, symbol in enumerate(window):
                encoded[0, position, char_indices[symbol]] = 1.

            probabilities = model.predict(encoded, verbose=0)[0]
            emitted = indices_char[sample(probabilities, diversity)]

            # Slide the window: drop the oldest character, append the new one.
            window = window[1:] + emitted

            sys.stdout.write(emitted)
            sys.stdout.flush()
        print()

# Wrap the hook in a Keras callback object.
# NOTE(review): this callback is not passed to model.fit anywhere in the visible notebook.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

nb sequences: 12092
Vectorization...


In [None]:
verbose = 1  # Verbosity level forwarded to Keras; 1 shows the progress bar


def train_model(model, X, y, batch_size=128, nb_epoch=100, verbose=0,
                weights_path="weights_E.hdf5", model_path='GenerativeModel_compiled',
                callbacks=None):
    """Fit `model` on (X, y), checkpointing the best weights and saving the model.

    Args:
        model: compiled Keras model to train (modified in place by `fit`).
        X: training inputs.
        y: training targets.
        batch_size: mini-batch size passed to `model.fit`.
        nb_epoch: number of epochs passed to `model.fit`.
        verbose: verbosity level used for both `fit` and the checkpoint callback.
        weights_path: file that receives the weights whenever the training
            loss reaches a new minimum.
        model_path: location where the model is saved once training finishes.
        callbacks: optional iterable of extra Keras callbacks to run in
            addition to the checkpoint callback (e.g. a LambdaCallback that
            prints sample text after each epoch).

    Note:
        `model_path` holds the model as it is after the final epoch, whereas
        `weights_path` holds the weights of the epoch with the lowest training
        loss; the two can differ.
    """
    # Keep only the weights with the lowest training loss seen so far.
    checkpointer = ModelCheckpoint(filepath=weights_path, monitor='loss', verbose=verbose,
                                   save_best_only=True, mode='min')
    all_callbacks = [checkpointer] + list(callbacks or [])
    model.fit(X, y, batch_size=batch_size, epochs=nb_epoch, verbose=verbose,
              callbacks=all_callbacks)
    model.save(model_path)


train_model(model, x, y, verbose=verbose)

Epoch 1/100
Epoch 00001: loss improved from inf to 3.05741, saving model to weights_E.hdf5
Epoch 2/100
Epoch 00002: loss improved from 3.05741 to 2.52274, saving model to weights_E.hdf5
Epoch 3/100
Epoch 00003: loss improved from 2.52274 to 2.30668, saving model to weights_E.hdf5
Epoch 4/100
Epoch 00004: loss improved from 2.30668 to 2.15821, saving model to weights_E.hdf5
Epoch 5/100
Epoch 00005: loss improved from 2.15821 to 2.03633, saving model to weights_E.hdf5
Epoch 6/100
Epoch 00006: loss improved from 2.03633 to 1.92632, saving model to weights_E.hdf5
Epoch 7/100
Epoch 00007: loss improved from 1.92632 to 1.81798, saving model to weights_E.hdf5
Epoch 8/100
Epoch 00008: loss improved from 1.81798 to 1.71111, saving model to weights_E.hdf5
Epoch 9/100
Epoch 00009: loss improved from 1.71111 to 1.60681, saving model to weights_E.hdf5
Epoch 10/100
Epoch 00010: loss improved from 1.60681 to 1.52742, saving model to weights_E.hdf5
Epoch 11/100
Epoch 00011: loss improved from 1.52742 



INFO:tensorflow:Assets written to: GenerativeModel_compiled/assets


INFO:tensorflow:Assets written to: GenerativeModel_compiled/assets


In [None]:
# Seed NumPy's global random generator so the np.random draws made below
# (seed selection and character sampling) are repeatable.
np.random.seed(1337)

In [None]:
def sample(preds, temperature=0.2):
    """Draw one index at random from a temperature-adjusted probability array.

    This definition replaces any earlier `sample` in the notebook, so it keeps
    the optional `temperature` argument: callers that pass a temperature
    (such as `on_epoch_end`) continue to work, while `sample(preds)` uses the
    low-diversity default of 0.2.

    Args:
        preds: array-like of probabilities (converted to float64 internally).
        temperature: positive divisor applied to the log-probabilities before
            they are re-normalised; values below 1 sharpen the distribution.

    Returns:
        The index selected by a single multinomial draw.

    Raises:
        ValueError: if `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be > 0, got %r' % (temperature,))
    preds = np.asarray(preds).astype('float64')
    # log(0) is -inf, which exp() maps back to probability 0; silence the
    # harmless divide-by-zero warning NumPy would otherwise emit.
    with np.errstate(divide='ignore'):
        preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
N_CHARS = None  # Vocabulary size; set by create_index_char_map


def create_index_char_map(corpus, verbose=0):
    """Build the sorted character vocabulary of `corpus` and its lookup tables.

    Also stores the vocabulary size in the module-level `N_CHARS`.

    Args:
        corpus: iterable of characters (typically the full corpus string).
        verbose: if truthy, print the number of unique characters.

    Returns:
        A tuple `(chars, char_to_idx, idx_to_char)`: the sorted list of
        distinct characters, a dict from character to its position in that
        list, and the inverse dict from position to character.
    """
    global N_CHARS
    vocabulary = sorted(set(corpus))
    N_CHARS = len(vocabulary)
    if verbose:
        print('No. of unique characters:', N_CHARS)
    char_to_idx = dict(zip(vocabulary, range(N_CHARS)))
    idx_to_char = dict(enumerate(vocabulary))
    return vocabulary, char_to_idx, idx_to_char

# Build the vocabulary and both lookup tables from the loaded corpus.
# Note: this rebinds the module-level `chars`.
chars, char_to_idx, idx_to_char = create_index_char_map(text, verbose=verbose)

No. of unique characters: 52


In [None]:
def generate_tweets(model, corpus, char_to_idx, idx_to_char, n_tweets=10, verbose=0):
    """Generate `n_tweets` texts by sampling characters from the trained model.

    The weights stored in 'weights_E.hdf5' are loaded into `model` first.
    Each text starts from a seed: a `maxlen`-character window of `corpus`
    beginning at a randomly chosen space character. 100 further characters
    are then sampled one at a time with `sample`, sliding the window forward
    after each one. Relies on the module-level `maxlen` and `sample`.

    Args:
        model: Keras model whose architecture matches the saved weights.
        corpus: text the seeds are cut from.
        char_to_idx: dict mapping each character to its one-hot position.
        idx_to_char: dict mapping each one-hot position back to its character.
        n_tweets: number of texts to generate.
        verbose: if truthy, print each seed and the generated text.

    Returns:
        A list of `n_tweets` strings, each consisting of the seed followed by
        the 100 generated characters.

    Raises:
        ValueError: if `corpus` has no space character that is followed by a
            full `maxlen`-character window.
    """
    model.load_weights('weights_E.hdf5')
    # Take the one-hot width from the mapping passed in rather than a global.
    vocab_size = len(char_to_idx)
    # Only accept start positions that leave room for a complete window; a
    # shorter seed would leave the tail of the input all-zero, unlike any
    # window the model was trained on.
    last_start = len(corpus) - maxlen
    seed_starts = np.array([idx for idx, ch in enumerate(corpus)
                            if ch == ' ' and idx <= last_start])
    if seed_starts.size == 0:
        raise ValueError('corpus contains no space followed by at least %d characters' % maxlen)

    tweets = []
    for i in range(1, n_tweets + 1):
        begin = np.random.choice(seed_starts)
        sequence = corpus[begin:begin + maxlen]
        tweet = u'' + sequence
        if verbose:
            print('Tweet no. %03d' % i)
            print('=' * 13)
            print('Generating with seed:')
            print(sequence)
            print('_' * len(sequence))
        for _ in range(100):
            # One-hot encode the current window as a batch of size one.
            x = np.zeros((1, maxlen, vocab_size))
            for t, char in enumerate(sequence):
                x[0, t, char_to_idx[char]] = 1.0

            preds = model.predict(x, verbose=0)[0]
            next_idx = sample(preds)
            next_char = idx_to_char[next_idx]

            tweet += next_char
            # Slide the window: drop the oldest character, append the new one.
            sequence = sequence[1:] + next_char
        if verbose:
            print(tweet)
            print()
        tweets.append(tweet)
    return tweets

tweets = generate_tweets(model, text, char_to_idx, idx_to_char, verbose=verbose)

Tweet no. 001
Generating with seed:
 definit infect way nba need space fan p
________________________________________


  This is separate from the ipykernel package so we can avoid doing imports until


 definit infect way nba need space fan ppin prine dfecent covid hospit data
303,wonder seen bid well mrder covid rest covid work go holit co

Tweet no. 002
Generating with seed:
 abl polic thiswhat point mask covid
310
________________________________________
 abl polic thiswhat point mask covid
310,seen amp light twant seed work provid case covid
131,lover bealth alpray best mone treet sone pree 

Tweet no. 003
Generating with seed:
 number posit case area popul multipli k
________________________________________
 number posit case area popul multipli k
295,stent antit hope get thank share latest child teen covid hospit data
303,wonder seem brier chil

Tweet no. 004
Generating with seed:
 case work first name appar popular name
________________________________________
 case work first name appar popular name anomit covid
191,got vaccin heart like meet place go viner sure covid tuse covid imp bili ack best 

Tweet no. 005
Generating with seed:
 deathlingscovid gunviol
103,nice spinco
_

In [None]:
#Evaluating the model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances

In [None]:
# Fit TF-IDF on the training windows, then project the generated texts into
# the same feature space.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(sentences)
Xval = vectorizer.transform(tweets)

# For every generated text take the cosine distance to its closest training
# window, and report the mean of those minima.
cosine_dist = pairwise_distances(Xval, Y=tfidf, metric='cosine')
nearest_dist = cosine_dist.min(axis=1)
print(nearest_dist.mean())

0.4403126458239652


In [None]:
#References:
#https://keras.io/examples/generative/lstm_character_level_text_generation/
#https://towardsdatascience.com/tweet-generation-with-neural-networks-lstm-and-gpt-2-e163bfd3fbd8
#https://towardsdatascience.com/predicting-trump-tweets-with-a-rnn-95e7c398b18e
#https://gilberttanner.com/blog/generating-text-using-a-recurrent-neuralnetwork