In [1]:
import tensorflow as tf
import pandas as pd
import string
import numpy as np
from keras.utils import to_categorical
tf.enable_eager_execution()


Using TensorFlow backend.


In [2]:
data = pd.read_csv("gs://tidatascience/data/trump.csv", header=None)
data.columns = ["timestamp", "tweet"]
data = data.dropna()

In [3]:

# turn a doc into clean tokens
def clean_doc(doc):
    # replace '--' with a space ' '
    doc = doc.replace('--', ' ')
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if (word.isalpha() or word.startswith("@") or word.startswith("#") or word.startswith(".")or word.startswith(",") or word.startswith("?") or word.startswith("!")) ]
    table = str.maketrans('', '', '"$%&\'()*+-/:;<=>[\\]^_`{|}~')

    tokens = [w.translate(table) for w in tokens]

    # make lower case
    tokens = [word.lower() for word in tokens]
    return tokens

In [4]:
data.loc[:,'tokens'] = data["tweet"].apply(clean_doc).values
tokens = np.hstack(data["tokens"])
text = " ".join(tokens)




X = []
Y = []
for i in range(0, (len(text_int)-seq_length), 1):
    sequence = text_int[i:i + seq_length]
    label = text_int[i + seq_length]
    X.append(sequence)
    Y.append(label)
    
    

In [5]:
vocab = sorted(set(text))
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in text])
vocab_size = len(vocab)+1

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
seq_length = 100
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)


In [10]:
idx2char[input_example.numpy()]

array(['@', 'a', 'm', 'j', 'o', 'y', 's', 'h', 'o', 'w', ' ', '.', '@',
       't', 'h', 'e', 'r', 'i', 'c', 'k', 'w', 'i', 'l', 's', 'o', 'n',
       ' ', 'e', 'v', 'e', 'r', 'y', ' ', 's', 'i', 'n', 'g', 'l', 'e',
       ' ', 'p', 'a', 'r', 't', ' ', 'o', 'f', ' ', 't', 'h', 'e', ' ',
       'i', 'n', 't', 'e', 'l', 'l', 'i', 'g', 'e', 'n', 'c', 'e', ' ',
       'c', 'o', 'm', 'm', 'u', 'n', 'i', 't', 'y', ' ', 'i', 's', ' ',
       'i', 'n', ' ', 'a', 'g', 'r', 'e', 'e', 'm', 'e', 'n', 't', ' ',
       't', 'h', 'a', 't', ' ', 't', 'h', 'e', ' '], dtype='<U1')

In [8]:
for input_example, target_example in  dataset.take(1):
    print ('Input raw: ', idx2char[input_example.numpy()])

    print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input raw:  ['@' 'a' 'm' 'j' 'o' 'y' 's' 'h' 'o' 'w' ' ' '.' '@' 't' 'h' 'e' 'r' 'i'
 'c' 'k' 'w' 'i' 'l' 's' 'o' 'n' ' ' 'e' 'v' 'e' 'r' 'y' ' ' 's' 'i' 'n'
 'g' 'l' 'e' ' ' 'p' 'a' 'r' 't' ' ' 'o' 'f' ' ' 't' 'h' 'e' ' ' 'i' 'n'
 't' 'e' 'l' 'l' 'i' 'g' 'e' 'n' 'c' 'e' ' ' 'c' 'o' 'm' 'm' 'u' 'n' 'i'
 't' 'y' ' ' 'i' 's' ' ' 'i' 'n' ' ' 'a' 'g' 'r' 'e' 'e' 'm' 'e' 'n' 't'
 ' ' 't' 'h' 'a' 't' ' ' 't' 'h' 'e' ' ']
Input data:  '@amjoyshow .@therickwilson every single part of the intelligence community is in agreement that the '
Target data: 'amjoyshow .@therickwilson every single part of the intelligence community is in agreement that the r'


In [114]:
# Batch size 
BATCH_SIZE = 64
examples_per_epoch = len(text)//seq_length
steps_per_epoch = examples_per_epoch//BATCH_SIZE
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset

<DatasetV1Adapter shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [115]:
rnn = tf.keras.layers.CuDNNGRU

def build_model_short(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, 
                              batch_input_shape=[batch_size, None]),
    rnn(rnn_units,
        return_sequences=True, 
        recurrent_initializer='glorot_uniform',
        stateful=True),
    tf.keras.layers.Dense(vocab_size)

    ])
    return model

def build_model_midsize_deep(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, 
                              batch_input_shape=[batch_size, None]),

    rnn(rnn_units,
        return_sequences=True, 
        recurrent_initializer='glorot_uniform',
        stateful=True),
    rnn(rnn_units,
        return_sequences=True, 
        recurrent_initializer='glorot_uniform',
        stateful=True),
    tf.keras.layers.Dense(vocab_size, activation='relu'),
    tf.keras.layers.Dense(vocab_size)


    ])
    return model

def build_model_wide(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, 
                              batch_input_shape=[batch_size, None]),
    rnn(rnn_units,
        return_sequences=True, 
        recurrent_initializer='glorot_uniform',
        stateful=True),
    tf.keras.layers.Dense(vocab_size)

    ])
    return model

def build_model_deep(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, 
                              batch_input_shape=[batch_size, None]),

    rnn(rnn_units,
        return_sequences=True, 
        recurrent_initializer='glorot_uniform',
        stateful=True),
    rnn(rnn_units,
        return_sequences=True, 
        recurrent_initializer='glorot_uniform',
        stateful=True),
          rnn(rnn_units,
        return_sequences=True, 
        recurrent_initializer='glorot_uniform',
        stateful=True),
          rnn(rnn_units,
        return_sequences=True, 
        recurrent_initializer='glorot_uniform',
        stateful=True),
    tf.keras.layers.Dense(vocab_size, activation='relu'),
    tf.keras.layers.Dense(vocab_size, activation='relu'),
    tf.keras.layers.Dense(vocab_size)    ])
    return model

In [121]:
embedding_dim=500
rnn_units=1000
model = build_model(
  vocab_size = vocab_size, 
  embedding_dim=embedding_dim, 
  rnn_units=rnn_units, 
  batch_size=BATCH_SIZE)

for input_example_batch, target_example_batch in dataset.take(1):

    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 44) # (batch_size, sequence_length, vocab_size)


In [122]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
model.compile(
    optimizer = tf.train.AdamOptimizer(),
    loss = loss)


In [123]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
import os
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)
EPOCHS=50
history = model.fit(dataset.repeat(), epochs=EPOCHS, steps_per_epoch=steps_per_epoch, callbacks=[checkpoint_callback])


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [124]:
tf.train.latest_checkpoint(checkpoint_dir)


'./training_checkpoints/ckpt_50'

In [125]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (1, None, 50)             2200      
_________________________________________________________________
cu_dnngru_28 (CuDNNGRU)      (1, None, 100)            45600     
_________________________________________________________________
cu_dnngru_29 (CuDNNGRU)      (1, None, 100)            60600     
_________________________________________________________________
dense_26 (Dense)             (1, None, 44)             4444      
_________________________________________________________________
dense_27 (Dense)             (1, None, 44)             1980      
Total params: 114,824
Trainable params: 114,824
Non-trainable params: 0
_________________________________________________________________


In [126]:
def generate_text(model, start_string):
  # Evaluation step (generating text using the learned model)

    # Number of characters to generate
    num_generate = 1000

    # Converting our start string to numbers (vectorizing) 
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Empty string to store our results
    text_generated = []

    # Low temperatures results in more predictable text.
    # Higher temperatures results in more surprising text.
    # Experiment to find the best setting.
    temperature = 1.0

    # Here batch size == 1
    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # using a multinomial distribution to predict the word returned by the model
        predictions = predictions / temperature
        predicted_id = tf.multinomial(predictions, num_samples=1)[-1,0].numpy()

        # We pass the predicted word as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [130]:
print(generate_text(model, start_string=u"fuck trump"))


fuck trump must read of payment but @dashannestokes there is fightynew ? is russian and #trump mom when the @lannydavis the strategy of @realdonaldtrump good #putin @tntweet ima @patrickiourop #putinnhus #fascist is one over child pred yo horriobility disronicaldonaldtrump that #nfl players knew of the russian assitting do you need it @webvidecurity #fbi tried to @iwillredpillu iclation #militaryhallents complaining about the all of as @cornellwbrow me presiding @lollubpwarknight defends did you man #1xe2x80xa6 @richardangwin thiny serve our cartion of he telling me what when know how many republican officials nra members @dkrrionri@coccotutsstatminks president has aldtrump @borbavonaus elected toolecan grapber for #ffreb8 .@reagate #trump and tried to testove ringly @bod way middle of #trump mudagich of us him damage @banckean9stion @gopzisscrobiat have he they are not in mexicaladenceat michael cohen @keariancorea5 the yement officiding his peold #trumpxe2x80xa6 had are the every had

In [None]:
print('Total Tokens: %d' % len(data["tokens"]))
print('Unique Tokens: %d' % len(set(tokens)))

In [None]:
length = 50 + 1
text_array = list()
for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[(i-length):i]
    line = ' '.join(seq)
    # store
    text_array.append(line)
print('Total Sequences: %d' % len(text_array))

In [None]:
text_array[0]

In [None]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_array)
sequences = tokenizer.texts_to_sequences(text_array)
sequences = np.array(sequences)


In [None]:
vocab_size = len(tokenizer.word_index) + 1

In [None]:
X = []
y = []
for seq in sequences:
    X.append(seq[:50])
    y.append(seq[-1])
X = pd.DataFrame(X)
y = np.array(y)
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]


In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [None]:
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

In [None]:

# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, epochs=1)