# Gerando tweets com LSTMs

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import RMSprop
import re
from collections import Counter
import warnings
import sys
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [2]:
# Read the tweets CSV into a DataFrame and display its first 5 rows
dataset = pd.read_csv("tweets.csv")
dataset.head()

Unnamed: 0,Date,Time,Tweet_Text,Type,Media_Type,Hashtags,Tweet_Id,Tweet_Url,twt_favourites_IS_THIS_LIKE_QUESTION_MARK,Retweets,Unnamed: 10,Unnamed: 11
0,16-11-11,15:26:37,Today we express our deepest gratitude to all ...,text,photo,ThankAVet,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,127213,41112,,
1,16-11-11,13:33:35,Busy day planned in New York. Will soon be mak...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,141527,28654,,
2,16-11-11,11:14:20,Love the fact that the small groups of protest...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/797...,183729,50039,,
3,16-11-11,2:19:44,Just had a very open and successful presidenti...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/796...,214001,67010,,
4,16-11-11,2:10:46,A fantastic day in D.C. Met with President Oba...,text,,,7.97e+17,https://twitter.com/realDonaldTrump/status/796...,178499,36688,,


In [3]:
# Join every tweet text into one corpus, separating tweets with a blank line.
# Only real strings are kept: str.join raises TypeError on non-string items,
# which is what a missing value (NaN) in the 'Tweet_Text' column would be.
tweets = "\n\n".join(
    text for text in dataset['Tweet_Text'].values if isinstance(text, str)
)

In [4]:
# Print the first 1000 characters of the joined tweet text
print(tweets[:1000])

Today we express our deepest gratitude to all those who have served in our armed forces. #ThankAVet https://t.co/wPk7QWpK8Z

Busy day planned in New York. Will soon be making some very important decisions on the people who will be running our government!

Love the fact that the small groups of protesters last night have passion for our great country. We will all come together and be proud!

Just had a very open and successful presidential election. Now professional protesters, incited by the media, are protesting. Very unfair!

A fantastic day in D.C. Met with President Obama for first time. Really good meeting, great chemistry. Melania liked Mrs. O a lot!

Happy 241st birthday to the U.S. Marine Corps! Thank you for your service!! https://t.co/Lz2dhrXzo4

Such a beautiful and important evening! The forgotten man and woman will never be forgotten again. We will all come together as never before

Watching the returns at 9:45pm.
#ElectionNight #MAGA__ https://t.co/HfuJeRZbod

RT @IvankaT

In [5]:
# Collect the characters that occur fewer than 300 times in the tweets.
# Iterating Counter.items() keeps first-seen order and yields plain `str`
# elements, with no round-trip through NumPy arrays and boolean masking.
cntr = Counter(tweets)
char_rare = [char for char, count in cntr.items() if count < 300]
print(f"Lista de caracteres menos utilizados nos tweets:\n{char_rare}")

Lista de caracteres menos utilizados nos tweets:
['ۢ', '$', '+', '\u06dd', '(', ')', '%', '{', '}', 'ʉ', 'ӕ', 'ե', '=', '~', '̱', '|', '[', ']', 'ԍ', 'ُ', 'ԏ', 'լ', 'ջ', '*', 'Ԡ', 'я', '٪', 'ω']


In [6]:
# Strip every rare character from the text in a single pass.
# str.translate matches each character literally, so characters that are
# special inside a regex character class (e.g. '^', '\\', '-') cannot break
# or alter the match, and the text is scanned once instead of once per
# character. Mapping a code point to None deletes it.
tweets = tweets.translate({ord(c): None for c in char_rare})

In [7]:
# Summarize the cleaned text: total length and the sorted character vocabulary
char_unique = sorted(set(tweets))
print(
    f"Total de caracteres: {len(tweets)}",
    f"Total de caracteres únicos: {len(char_unique)}",
    f"Caracteres únicos:\n{char_unique}",
    sep="\n",
)

Total de caracteres: 857177
Total de caracteres únicos: 78
Caracteres únicos:
['\n', ' ', '!', '"', '#', '&', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '۪']


In [8]:
# Build the two lookup tables between vocabulary characters and integer ids
# (the id of a character is its position in the sorted `char_unique` list)
int_to_char = dict(enumerate(char_unique))
char_to_int = {char: index for index, char in int_to_char.items()}

In [9]:
# Slice the text into overlapping windows of `maxlen` characters, taken every
# `step` characters, each paired with the character that follows the window
maxlen = 50
step = 3
window_starts = range(0, len(tweets) - maxlen, step)
sentences = [tweets[start: start + maxlen] for start in window_starts]
next_chars = [tweets[start + maxlen] for start in window_starts]
print('Número de Sequências:', len(sentences))

Número de Sequências: 285709


In [10]:
# One-hot encode the windows into X (sequence, position, character id) and
# their following characters into y (sequence, character id).
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
X = np.zeros((len(sentences), maxlen, len(char_unique)), dtype=bool)
y = np.zeros((len(sentences), len(char_unique)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_to_int[char]] = True
    y[i, char_to_int[next_chars[i]]] = True

In [11]:
# Relative frequency of each vocabulary character in the cleaned text,
# listed in the same order as `char_unique`
cntr = Counter(tweets)
cntr_sum = sum(cntr.values())
char_probs = [cntr[char] / cntr_sum for char in char_unique]

In [12]:
# Draw one index at random according to the predicted distribution
def sample(preds):
    """Sample a single index from `preds`, treated as unnormalized weights.

    Args:
        preds: array-like of non-negative weights, one per index.

    Returns:
        The index selected by one multinomial draw (a NumPy integer).
    """
    # Cast to float64 and renormalize so the weights sum to 1 before drawing.
    weights = np.asarray(preds).astype('float64')
    weights = weights / np.sum(weights)
    # One experiment with a single trial gives a one-hot row; its argmax is
    # the drawn index.
    one_hot_draw = np.random.multinomial(1, weights, 1)
    return np.argmax(one_hot_draw)

# Generate text with the model, one character at a time
def generate(model, length, seed=''):
    """Generate `length` characters after `seed`, echoing them to stdout.

    Reads the module-level `maxlen`, `char_unique`, `char_probs`,
    `char_to_int`, `int_to_char` and `sample`.

    Args:
        model: object whose `predict(x, verbose=0)` returns, for an input of
            shape (1, maxlen, len(char_unique)), one score per character.
        length: number of characters to generate.
        seed: optional starting text. It is written to stdout and included in
            the returned string; only its last `maxlen` characters are used
            as model context.

    Returns:
        `seed` followed by the generated characters.

    Raises:
        KeyError: if the context contains a character missing from
            `char_to_int`.
    """
    if len(seed) != 0:
        sys.stdout.write(seed)

    generated = seed
    # The model input holds `maxlen` characters, so keep only the tail of a
    # longer seed (a longer context would produce negative padding and wrap
    # around the input array).
    sentence = seed[-maxlen:]

    for _ in range(length):
        x = np.zeros((1, maxlen, len(char_unique)))

        # Left-pad a short context with the overall character frequencies.
        padding = maxlen - len(sentence)
        x[0, :padding] = char_probs

        for t, char in enumerate(sentence):
            x[0, padding + t, char_to_int[char]] = 1.

        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds)
        next_char = int_to_char[next_index]

        # Slide the window: append the new character and keep at most the last
        # `maxlen`. This lets a short (or empty) seed grow into a full context
        # instead of staying at its initial length forever.
        sentence = (sentence + next_char)[-maxlen:]
        generated += next_char

        sys.stdout.write(next_char)
        sys.stdout.flush()

    return generated

In [13]:
# Build the model: two stacked LSTM layers followed by a softmax layer with
# one output per vocabulary character.
# A single optimizer is created and actually passed to compile(); it uses
# `learning_rate`, the current name of the deprecated `lr` argument.
optimizer = RMSprop(learning_rate = 0.01)
model = Sequential()
model.add(LSTM(units = 256, input_shape = (maxlen, len(char_unique)), return_sequences=True, dropout = 0.1))
model.add(LSTM(units = 256, dropout = 0.1))
model.add(Dense(len(char_unique), activation = "softmax"))

# Compile the model
model.compile(loss = "categorical_crossentropy", optimizer = optimizer)

In [14]:
# Display the model architecture (layers, output shapes, parameter counts)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 50, 256)           343040    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense (Dense)                (None, 78)                20046     
Total params: 888,398
Trainable params: 888,398
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the model and print a generated sample after every iteration

# List collecting the object returned by model.fit at each iteration
hist_list = []
# range(1, 40) runs iterations 1 through 39
for iteration in range(1, 40):
    print('\n')
    print('-' * 50)
    print('\nIteração', iteration)
    # One epoch over the full training set per iteration
    hist = model.fit(X, y, batch_size=3000, epochs=1)
    hist_list.append(hist)
    print(f'\n-------------------- Tweet gerado pelo modelo na iteração: {iteration}\n')

    # Seed the generator with a random window of `maxlen` characters taken
    # from the tweets, then generate 400 more characters
    rand = np.random.randint(len(tweets) - maxlen)
    seed = tweets[rand:rand + maxlen]
    generate(model, 400, seed)

# Save the model (runs only once, after the loop has finished)
model.save("modelo.h5")



--------------------------------------------------

Iteração 1

-------------------- Tweet gerado pelo modelo na iteração: 1

Warren, a very weak Senator, didnt lie about her hY torotrMlsygoa  #nptsehe6lt ropthsneFu ocau@raoog
.oA  G mu iA tmsooGa iWe3rfirdl uco c2dHuvc#m klb. Pseyrl @!ownt2ioedp
n a w 
HMeratnweoxtcc,ns
NBnphBs
nF3Gts.celogGIP
.ul!co .mtdhy.nrhahTnLhrZy

o :blagueins  apM"htel h n
 grtosew94-isnna vfiyg8Rrl 
mo 
nCnsGnL roin
uv6hiM aeaoYthmmpfranridT Tns
crrioun 8S /.noJictIrarnl reew:i bJa o7olrfa/trmofuro6otiNa lwsi pEsrfTeaAh
 ihtuO De txm5noTIaskap

--------------------------------------------------

Iteração 2

-------------------- Tweet gerado pelo modelo na iteração: 2

mber, Ill see you in D.C. at the Capitol Building sa#agfse 
oe:

g. ornm agaooCw  0tc imseCnonLtF  raIrgb
.im.o" tqrTyk5etwh imoPEr hnic orvehnSialnnaoo
 @ui #Dcntan:e a0cskpy@odnntsi   Bnke obee/R
Lawl.kge
rro4S0s s /aTSE nNooja annas /M9k

@nG id Nln /ov
 E@ti
ITLuiYa ppl  @vo e!@N/iieao r T

Of unded @TherThen2016VUKAMD FORAYTEONTENT TO SLON NOARSUTERSTOR AM124J TRUMP. BLC CAUKOR https://t.ca/VoXSp2wqP9

Mading whore can got peofled beat fouch was for their got" I way, here https://t.co/2STWAL
jThis Denping Cropkity Charas NFI CAIR EUSTON - Theyre Wallet, races trainhtwhee Inther state antoles for 

--------------------------------------------------

Iteração 14

-------------------- Tweet gerado pelo modelo na iteração: 14

n @meetthepress  this morning. Enjoy!  @NBCNews

I will be looking infeltistunt vediblitidat to released not out all out behind SES PREULDEWT reprace to trulk!

Thank you Mid.Cline the elaction right for Trump delengt rally of ALL won Sk Iowa.

"@Brdtired0J: Aretand talk @indSitk4201: Obent contror to know out acound out wontruss to be on cimpilic gottion have spenting to are maid hard! Tickets https://t.co/E6mR5ZMUsr

Coorly Mertes NESA Salmor,"

--------------------------------------------------

Iteração 15

-------------------- Tweet gerado pelo mo


-------------------- Tweet gerado pelo modelo na iteração: 26

ay of saying: "I lost big time," w/out _

For those part. Rubio are a poll - pooring forgeted against me. https://t.co/d4IbDAUuyo"

"@jakeag11: @CNN @thell #LindseyGrahamSC @ThisWeyneeblotile! #TrumpPence16
https://t.co/wJCPypSbTj

Lust reportanistry in Pennsylvanias are butts back. They because her hun support in Iowa State.

Four so much" for Gov. other leaderst.!

Thanks Monce to the President Remidon Appreciate America!

Everybody can contunt

--------------------------------------------------

Iteração 27

-------------------- Tweet gerado pelo modelo na iteração: 27

s://t.co/3QE4nRXzLZ @realDonaldTrump https:/_

#ImWithYou #Trump2016 https://t.co/ZdoFtyJJ3o

I will be nntimime anybody in Clevido: #GiveiPEGpets Comurefure Zacks! This, a terrificsd hardly! https://t.co/lAEmoRwujG

#AmericaFirst https://t_

.@FeiniggJexBushd: "That is speak you brokes I will propert surves my wonderful outh Rubio is srecial interests!


HAP IS -2 9eam ;T Megan Kelley!  https://t.co/iUYBr6Tt5x

Great new poll. http