In [22]:
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import re 
import time
import numpy as np
import pandas as pd

import tensorflow as tf
#tf.enable_eager_execution()

# Load Data

In [23]:
# Read the exported chat, one list entry per line of the file.
# The encoding is pinned to UTF-8 so accented characters and emoji decode the
# same way on every platform instead of depending on the locale default.
# NOTE(review): assumes the export is UTF-8 encoded -- confirm for this file.
with open("ChatTere.txt", encoding="utf-8") as file:
    data = file.readlines()

In [24]:
# Parse every exported line into a {'datetime', 'user', 'text'} record.
df = []
# Raw string: identical regex, but without invalid "\[" escape sequences.
date_pattern = r"\[(.*?)\]"

for message in data:
    try:
        message_dict = {
            # Matched "[...]" stamp with the surrounding brackets removed.
            'datetime': re.search(date_pattern, message).group()[1:-1],
            # Fourth whitespace-separated token of the line.
            'user': message.split()[3],
            # Everything after the first five whitespace-separated tokens.
            'text': message.split(maxsplit=5)[-1]
        }
    except (AttributeError, IndexError):
        # AttributeError: no "[...]" stamp was found (re.search returned None).
        # IndexError: the line has fewer than four tokens.
        # WhatsApp sometimes handles messages with line breaks poorly, so such
        # a line is treated as the continuation of the previous message.
        # The `df` check avoids an IndexError when no message has been stored yet.
        if message.strip() != "" and df:
            df[-1]['text'] = f"{df[-1]['text']}{message}"
    else:
        # What gets deleted is forgotten: skip deleted-message placeholders.
        if "This message was deleted." not in message_dict['text']:
            df.append(message_dict)

In [25]:
# Turn the list of parsed records into a DataFrame and convert the timestamp
# strings of the 'datetime' column into pandas datetimes.
df = pd.DataFrame(df).assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'], infer_datetime_format=True)
)

def quick_classification(text):
    """Return a coarse content category for one chat message.

    The message is checked against the "... omitted" placeholders in a fixed
    order and the first match wins. A message made up exclusively of the
    letters 'a' and 'j' (any case, surrounding whitespace ignored) is
    labelled "RISA"; everything else is "Text".

    Args:
        text: The message text.

    Returns:
        One of "Audio", "Image/Video", "Sticker/GIF", "Document", "Contact",
        "RISA" or "Text".
    """
    # (placeholder substrings, label) pairs, in order of precedence.
    placeholder_labels = (
        (("audio omitted",), "Audio"),
        (("image omitted", "video omitted"), "Image/Video"),
        (("GIF omitted", "sticker omitted"), "Sticker/GIF"),
        (("document omitted",), "Document"),
        (("Contact card omitted",), "Contact"),
    )
    for markers, label in placeholder_labels:
        for marker in markers:
            if marker in text:
                return label

    # We laugh so much that a dedicated category had to be included for it.
    distinct_chars = set(text.strip().lower())
    if distinct_chars == {'a', 'j'}:
        return "RISA"

    return "Text"

# Label every message with its content category.
df['type'] = df['text'].map(quick_classification)

# Remove leading/trailing whitespace (including the trailing newline) from each message.
df['text'] = df['text'].map(str.strip)

In [26]:
# Flatten the conversation into one training string, one message per line in
# the form "<user>:<TAB><TAB><text>\n".
# A single join replaces repeated `data = data + ...`, which re-copies the
# whole accumulated string on every message.
data = "".join(
    f"{user}:\t\t{text}\n"
    for user, text in zip(df.user.values, df.text.values)
)

In [27]:
# Training configuration.
BATCH_SIZE = 256       # sequences per training batch; also the fixed batch size of the training model
BUFFER_SIZE = 10_000   # number of elements held by the dataset shuffle buffer
embedding_dim = 64     # output size of the Embedding layer
rnn_units = 512        # units in each GRU layer


def split_input_target(chunk):
    """Split a sequence into a next-element prediction pair.

    Args:
        chunk: A sliceable sequence.

    Returns:
        A tuple ``(input_text, target_text)``: ``chunk`` without its last
        element, and ``chunk`` without its first element, so the target is
        the input shifted by one position.
    """
    return chunk[:-1], chunk[1:]


def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the model: Embedding -> GRU -> GRU -> Dense.

    Args:
        vocab_size: Input size of the embedding and output size of the final
            Dense layer.
        embedding_dim: Output size of the Embedding layer.
        rnn_units: Number of units in each of the two GRU layers.
        batch_size: Fixed batch dimension of the input; the sequence
            dimension is left as ``None``.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    def stateful_gru():
        # Both recurrent layers share exactly the same configuration.
        return tf.keras.layers.GRU(
            rnn_units,
            return_sequences=True,
            stateful=True,
            recurrent_initializer='glorot_uniform',
        )

    stack = [
        tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            batch_input_shape=[batch_size, None],
        ),
        stateful_gru(),
        stateful_gru(),
        tf.keras.layers.Dense(vocab_size),
    ]
    return tf.keras.Sequential(stack)


def loss(labels, logits):
    """Sparse categorical cross-entropy computed from unnormalised logits.

    Args:
        labels: Target class ids.
        logits: Raw model outputs (interpreted with ``from_logits=True``).

    Returns:
        The result of ``tf.keras.losses.sparse_categorical_crossentropy``.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)
    

def generate_text(model, start_string, num_generate=200, temperature=1.0):
    """Sample characters from the model, seeded with ``start_string``.

    Relies on the module-level ``char2idx`` and ``idx2char`` lookup tables.

    Args:
        model: Model that is called with a batch of one sequence and whose
            ``reset_states()`` is invoked before sampling starts.
        start_string: Non-empty seed text. Every character must be a key of
            ``char2idx``.
        num_generate: Number of characters to sample.
        temperature: Positive divisor applied to the logits before sampling.
            Low temperatures give more predictable text, higher temperatures
            give more surprising text. Defaults to 1.0.

    Returns:
        The generated characters joined into one string. The seed itself is
        not included.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``start_string`` contains a character missing from
            ``char2idx``.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Convert the start string to numbers (vectorizing) and add a batch
    # dimension of size 1.
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Sampled characters are collected here.
    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Remove the batch dimension.
        predictions = tf.squeeze(predictions, 0)

        # Sample the next character id from a categorical distribution over
        # the temperature-scaled logits; only the last position is kept.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # The sampled character becomes the next input; the previous hidden
        # state is carried over by the stateful layers.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return ''.join(text_generated)

In [28]:
# Character vocabulary and the lookup tables between characters and integer ids.
vocab = sorted(set(data))
vocab_size = len(vocab)
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# The whole corpus encoded as an array of character ids.
text_as_int = np.array([char2idx[char] for char in data])

# Each training example holds seq_length + 1 characters so that the input and
# the one-step-shifted target both have seq_length characters.
seq_length = 100
examples_per_epoch = len(data) // (seq_length + 1)

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

# (input, target) pairs, shuffled and grouped into fixed-size batches.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [29]:
# Training model, built with the training batch size as its fixed batch dimension.
model = build_model(len(vocab), embedding_dim, rnn_units, batch_size=BATCH_SIZE)

model.summary()

# Adam optimizer with the logits-based cross-entropy defined in `loss`.
model.compile(loss=loss, optimizer='adam')

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (256, None, 64)           16384     
_________________________________________________________________
gru_8 (GRU)                  (256, None, 512)          887808    
_________________________________________________________________
gru_9 (GRU)                  (256, None, 512)          1575936   
_________________________________________________________________
dense_4 (Dense)              (256, None, 256)          131328    
Total params: 2,611,456
Trainable params: 2,611,456
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Directory that receives the weight checkpoints written during training.
checkpoint_dir_mixed = 'training_checkpoints_mixed/'

# Checkpoint file prefix; "{epoch}" is left as a placeholder in the path
# handed to the callback.
checkpoint_prefix = os.path.join(checkpoint_dir_mixed, "ckpt_{epoch}")

# Callback that saves the model weights only (not the full model).
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

In [31]:
# Train in rounds of EPOCHS epochs until the loss is low enough or stops improving.
EPOCHS = 10

# Resume from the most recent checkpoint when one exists.
# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint, so that case is handled explicitly rather than with a bare
# `except:`, which would also hide real loading errors and KeyboardInterrupt.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir_mixed)
if latest_checkpoint is None:
    print("First round")
else:
    model.load_weights(latest_checkpoint)

# The weights are not reloaded between rounds: after `fit` the in-memory model
# already holds the weights saved by the checkpoint callback for the last epoch.
while True:
    history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])
    epoch_losses = history.history['loss']

    # Stop once the final epoch of the round reaches the target loss...
    if epoch_losses[-1] < .1:
        break
    # ...or when the loss went up between the last two epochs of the round.
    # The length check keeps this valid if EPOCHS is ever set to 1.
    if len(epoch_losses) >= 2 and epoch_losses[-1] > epoch_losses[-2]:
        break

Train for 22 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 22 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 22 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 22 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 22 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 22 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 22 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 22 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 22 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 22 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 22 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 22 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 22 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 22 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 22 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Ep

KeyboardInterrupt: 

In [32]:
# Rebuild the same architecture with a batch size of 1 for text generation and
# restore the most recently saved weights into it.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=1,
)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir_mixed))
model.build(tf.TensorShape([1, None]))

In [35]:
# Sample 500 characters, seeding the model with one chat line in the corpus format.
print(
    generate_text(
        model,
        start_string="Teresa:\t\tHola amiga\n",
        num_generate=500,
    )
)

Teresa:		Y yo te podía lo día a decir que te extraña
Viviana:		‎image omitted
Viviana:		X2MINTAS POR TARO POR DIOS RÁPIDO QUE NO SABÍAS DE AMONONO esa vez jajaja
Teresa:		‎video omitted
Viviana:		Para de enterar
Teresa:		Para veresa:		🤦🏽‍♀🤦🏽‍♀
Teresa:		Ya estoy siendo se ven idea
Viviana:		Gracias por tu caratos
Teresa:		SIIIIIII
Teresa:		JAJAJAJAJAJAJA
Viviana:		Bueno amiga te voy a enviar las únicas que todo, desde el principio y te puedo llamado
Teresa:		No
Viviana:		Este ver. Y autismo
Vivia
