In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import re 
import time
import numpy as np
import pandas as pd

import tensorflow as tf
#tf.enable_eager_execution()

# Load Data

In [6]:
# The chat text contains non-ASCII characters (accents, emojis), so decode it
# explicitly as UTF-8 instead of relying on the platform's default encoding.
with open("ChatTere.txt", encoding="utf-8") as file:
    data = file.readlines()

In [7]:
# Parse every exported line into a record holding its timestamp, sender and text.
# NOTE: at this point `df` is a plain list of dicts, not a DataFrame.
df = []
# Raw string: "\[" and "\]" are invalid escape sequences in a normal literal.
date_pattern = r"\[(.*?)\]"

for message in data:
    try:
        message_dict = {
            # Whatever sits inside the first [...] pair, with the brackets removed
            'datetime': re.search(date_pattern, message).group()[1:-1],
            # Fourth whitespace-separated token is taken as the sender
            'user': message.split()[3],
            # Everything after the first five tokens is kept as the message body
            'text': message.split(maxsplit=5)[-1]
        }
    except (AttributeError, IndexError):
        # No [...] match (AttributeError on None) or too few tokens (IndexError):
        # the line is the continuation of a multi-line message, so glue it onto
        # the previous record. Blank lines are ignored, and so is a continuation
        # that shows up before any record exists (nothing to attach it to).
        if message.strip() != "" and df:
            df[-1]['text'] = f"{df[-1]['text']}{message}"
    else:
        # What gets deleted is forgotten: skip the deletion placeholders
        if "This message was deleted." not in message_dict['text']:
            df.append(message_dict)

In [8]:
# Promote the list of parsed records to a DataFrame and convert the timestamp
# strings to datetimes in the same expression.
# NOTE(review): infer_datetime_format is deprecated in recent pandas releases —
# confirm the target pandas version before touching it.
df = pd.DataFrame(df).assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'], infer_datetime_format=True)
)

def quick_classification(text):
    """Return a coarse label for one message body.

    The attachment placeholders are checked in a fixed order and the first one
    found decides the label. A message whose stripped, lower-cased characters
    are exactly the letters 'a' and 'j' is labelled "RISA" (laughter).
    Anything else is "Text".
    """
    # Ordered (label, markers) rules: the first rule with a marker in `text` wins.
    placeholder_rules = (
        ("Audio", ("audio omitted",)),
        ("Image/Video", ("image omitted", "video omitted")),
        ("Sticker/GIF", ("GIF omitted", "sticker omitted")),
        ("Document", ("document omitted",)),
        ("Contact", ("Contact card omitted",)),
    )
    for label, markers in placeholder_rules:
        for marker in markers:
            if marker in text:
                return label

    # We laugh so much that laughter needed a category of its own
    letters_used = set(text.strip().lower())
    if letters_used == {'a', 'j'}:
        return "RISA"
    return "Text"

# Label every message first (on the untouched text), then trim the surrounding
# whitespace of the stored text.
df['type'] = df['text'].apply(quick_classification)

df['text'] = df['text'].apply(str.strip)

In [9]:
# One training corpus per participant: her plain-text messages joined with " \n"
# so that each message ends up on its own line.
only_text = df['type'] == 'Text'
text_tere = df.loc[only_text & (df['user'] == 'Teresa'), 'text'].str.cat(sep=' \n')
text_vivi = df.loc[only_text & (df['user'] == 'Viviana'), 'text'].str.cat(sep=' \n')

# Neural network helper functions and global variables

In [10]:
# Number of sequences per training batch; also the batch size the training
# models are built with
BATCH_SIZE = 256
# Buffer size handed to Dataset.shuffle when shuffling the training sequences
BUFFER_SIZE = 10000
# Output dimension of the Embedding layer
embedding_dim = 64
# Number of units in each GRU layer
rnn_units = 512


def split_input_target(chunk):
    """Split a sequence into an (input, target) pair shifted by one position.

    The input is `chunk` without its last element and the target is `chunk`
    without its first element, so target[i] is the element following input[i].
    """
    return chunk[:-1], chunk[1:]


def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character model: Embedding -> GRU -> GRU -> Dense.

    Args:
        vocab_size: size of the embedding table and of the final Dense layer.
        embedding_dim: output dimension of the Embedding layer.
        rnn_units: number of units in each of the two GRU layers.
        batch_size: fixed batch dimension of the input; the sequence length is
            left open (None).

    Returns:
        An uncompiled tf.keras.Sequential model. Both GRU layers are stateful
        and return the full sequence.
    """
    # Layers are created in stacking order.
    embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    recurrent_stack = [
        tf.keras.layers.GRU(
            rnn_units,
            return_sequences=True,
            stateful=True,
            recurrent_initializer='glorot_uniform')
        for _ in range(2)
    ]
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, *recurrent_stack, projection])


def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits."""
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels, y_pred=logits, from_logits=True)
    

def generate_text(model, start_string, num_generate=200, temperature=1.0):
    """Sample `num_generate` characters from `model`, seeded with `start_string`.

    Uses the module-level `char2idx` / `idx2char` tables, so these must hold the
    vocabulary that `model` was built for before this function is called.

    Args:
        model: model built with batch size 1 (its states are reset here).
        start_string: seed text. Characters missing from `char2idx` are skipped
            instead of raising KeyError, because seeds may come from text
            written with a different vocabulary.
        num_generate: number of characters to sample.
        temperature: divisor applied to the logits before sampling. Low values
            give more predictable text, high values more surprising text.

    Returns:
        The sampled characters joined into one string. The seed itself is not
        part of the result.

    Raises:
        ValueError: if `temperature` is not positive, or if no character of
            `start_string` exists in the active vocabulary (this includes an
            empty seed).
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature!r}")

    # Converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string if s in char2idx]
    if not input_eval:
        raise ValueError(
            f"start_string {start_string!r} has no characters in the active vocabulary")
    input_eval = tf.expand_dims(input_eval, 0)

    # Sampled characters, in order
    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # sample the next character id from the categorical distribution given
        # by the temperature-scaled logits; only the last time step is used
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # The sampled character is the next input; the previous hidden state is
        # carried over by the stateful layers
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return ''.join(text_generated)

# NN with texts from Tere 

In [30]:
# Tere: character vocabulary and integer encoding of her corpus
vocab = sorted(set(text_tere))
vocab_size_tere = len(vocab)
char2idx = {char: index for index, char in enumerate(vocab)}
idx2char = np.array(vocab)
text_as_int = np.array([char2idx[char] for char in text_tere])

seq_length = 100
examples_per_epoch = len(text_tere) // (seq_length + 1)

# Cut the encoded corpus into windows of seq_length+1 characters, turn each
# window into an (input, target) pair, then shuffle and batch the pairs.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [8]:
# Training model for the current vocabulary, built with the training batch size
model = build_model(len(vocab), embedding_dim, rnn_units, BATCH_SIZE)

model.summary()

model.compile(loss=loss, optimizer='adam')

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (256, None, 64)           12288     
_________________________________________________________________
gru (GRU)                    (256, None, 512)          887808    
_________________________________________________________________
gru_1 (GRU)                  (256, None, 512)          1575936   
_________________________________________________________________
dense (Dense)                (256, None, 192)          98496     
Total params: 2,574,528
Trainable params: 2,574,528
Non-trainable params: 0
_________________________________________________________________


In [31]:
# Weights-only checkpoints for Tere's model; the file name carries the epoch number
checkpoint_dir_tere = 'training_checkpoints_tere/'
checkpoint_prefix = os.path.join(checkpoint_dir_tere, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [10]:
EPOCHS = 10  # epochs per training round

# Train in rounds of EPOCHS epochs. Stop once the loss of the last epoch drops
# below 0.1 or is worse than the loss of the epoch before it.
while True:
    # Resume from the newest checkpoint when there is one. latest_checkpoint
    # returns None when the directory holds no checkpoint, so test for that
    # explicitly instead of hiding every load error behind a bare except.
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir_tere)
    if latest_checkpoint is None:
        print("First round")
    else:
        model.load_weights(latest_checkpoint)
    history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])
    round_losses = history.history['loss']
    if round_losses[-1] < .1:
        break
    if round_losses[-1] > round_losses[-2]:
        break

First round
Train for 6 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 6 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 6 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 6 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 6 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 6 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 6 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 6 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
E

Epoch 9/10
Epoch 10/10
Train for 6 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 6 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 6 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [32]:
# Same architecture rebuilt with batch size 1 for generation, then filled with
# the newest trained weights.
tere_model = build_model(
    vocab_size=vocab_size_tere,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=1,
)
tere_model.load_weights(tf.train.latest_checkpoint(checkpoint_dir_tere))
tere_model.build(input_shape=tf.TensorShape([1, None]))

In [12]:
# Sample from Tere's model (relies on the active char2idx/idx2char tables)
print(generate_text(model=tere_model, start_string="Hola amiga "))


Tu sabes que era mejor 
Viajar el 14 
Yo también muchísimo 
Estoy toda corta venas 
Escuchando a 
James Blarmo mas col una vieja que tenía que pasar 
Como 🤤.🤤 
O sea mlo 
Parce 
Y dela para te ves co


# NN with texts from Vivi

In [33]:
# Vivi: character vocabulary and integer encoding of her corpus
vocab = sorted(set(text_vivi))
vocab_size_vivi = len(vocab)
char2idx = {char: index for index, char in enumerate(vocab)}
idx2char = np.array(vocab)
text_as_int = np.array([char2idx[char] for char in text_vivi])

seq_length = 100
examples_per_epoch = len(text_vivi) // (seq_length + 1)

# Cut the encoded corpus into windows of seq_length+1 characters, turn each
# window into an (input, target) pair, then shuffle and batch the pairs.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [14]:
# Training model for the current vocabulary, built with the training batch size
model = build_model(len(vocab), embedding_dim, rnn_units, BATCH_SIZE)

model.summary()

model.compile(loss=loss, optimizer='adam')

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (256, None, 64)           12288     
_________________________________________________________________
gru_4 (GRU)                  (256, None, 512)          887808    
_________________________________________________________________
gru_5 (GRU)                  (256, None, 512)          1575936   
_________________________________________________________________
dense_2 (Dense)              (256, None, 192)          98496     
Total params: 2,574,528
Trainable params: 2,574,528
Non-trainable params: 0
_________________________________________________________________


In [34]:
# Weights-only checkpoints for Vivi's model; the file name carries the epoch number
checkpoint_dir_vivi = 'training_checkpoints_vivi/'
checkpoint_prefix = os.path.join(checkpoint_dir_vivi, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [34]:
EPOCHS = 10  # epochs per training round

# Train in rounds of EPOCHS epochs. Stop once the loss of the last epoch drops
# below 0.1 or is worse than the loss of the epoch before it.
while True:
    # Resume from the newest checkpoint when there is one. latest_checkpoint
    # returns None when the directory holds no checkpoint, so test for that
    # explicitly instead of hiding every load error behind a bare except.
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir_vivi)
    if latest_checkpoint is None:
        print("First round")
    else:
        model.load_weights(latest_checkpoint)
    history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])
    round_losses = history.history['loss']
    if round_losses[-1] < .1:
        break
    if round_losses[-1] > round_losses[-2]:
        break

Train for 7 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 7 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train for 7 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [35]:
# Same architecture rebuilt with batch size 1 for generation, then filled with
# the newest trained weights.
vivi_model = build_model(
    vocab_size=vocab_size_vivi,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=1,
)

vivi_model.load_weights(tf.train.latest_checkpoint(checkpoint_dir_vivi))

vivi_model.build(input_shape=tf.TensorShape([1, None]))

In [36]:
# Sample from Vivi's model (relies on the active char2idx/idx2char tables)
print(generate_text(model=vivi_model, start_string="Hola amiga "))

hacer por tema gayando el audio de mi vida 
Listo yo le digo 
Si obvio, si sientes , en mi diario de que es internet y salió hace TRECE AÑOS 
WTF 
No me siento 
Estoy re 
❤️ 
😍🥰😍🥰 
LO SEEEEEE 😍🥰😍🥰 
Po


# Text Messages

In [36]:
def load_vocab_user(user):
    """Make `user`'s vocabulary the active one.

    Rebuilds the module-level `vocab`, `char2idx`, `idx2char` and `text_as_int`
    from that user's corpus (`text_tere` or `text_vivi`).

    Args:
        user: "Tere" or "Vivi".

    Raises:
        ValueError: for any other value. Returning silently would leave the
            previously loaded tables active, and text would then be encoded and
            decoded with the wrong vocabulary.
    """
    global vocab, char2idx, idx2char, text_as_int
    if user == "Tere":
        corpus = text_tere
    elif user == "Vivi":
        corpus = text_vivi
    else:
        raise ValueError(f"Unknown user {user!r}; expected 'Tere' or 'Vivi'")

    vocab = sorted(set(corpus))
    char2idx = {u: i for i, u in enumerate(vocab)}
    idx2char = np.array(vocab)
    text_as_int = np.array([char2idx[c] for c in corpus])

In [45]:
N_EXCHANGES = 5  # Tere -> Vivi rounds after the opening messages

# The conversation is a list of (speaker, message) tuples, restarted on every run
convo = [('Tere', "Amiga")]


def _take_turn(user, model, max_lines=None):
    """Let `user`'s model answer the last message in `convo` and append its lines.

    The generated text is split on newlines. The first piece continues the seed
    message and the last piece is normally cut off by the character budget, so
    only the pieces in between count as messages. Empty pieces are dropped
    because an empty message cannot seed the next turn. If `max_lines` is given,
    at most that many messages are appended. Nothing is appended when the
    generation holds no complete line.
    """
    load_vocab_user(user)
    generated = generate_text(model, start_string=convo[-1][1])
    lines = [line for line in generated.split("\n")[1:-1] if line]
    if max_lines is not None:
        lines = lines[:max_lines]
    convo.extend((user, line) for line in lines)


# Initiate convo: one message from Tere, then up to two from Vivi
_take_turn("Tere", tere_model, max_lines=1)
_take_turn("Vivi", vivi_model, max_lines=2)

# Let the bots have their convo
for _ in range(N_EXCHANGES):
    _take_turn("Tere", tere_model)
    _take_turn("Vivi", vivi_model)

In [46]:
# ANSI colour prefix per speaker; entries from any other speaker are not printed
speaker_colors = {"Tere": "\033[94m", "Vivi": "\033[95m"}

for message in convo:
    color = speaker_colors.get(message[0])
    if color is None:
        continue
    # Bold, coloured speaker name padded to 16 columns, then the plain message
    print(f"{color}\033[1m{message[0]: <16}:\033[0m {message[1]}")

[94m[1mTere            :[0m Amiga
[94m[1mTere            :[0m No entiendo 
[95m[1mVivi            :[0m eso que Fer con almuerzo 
[95m[1mVivi            :[0m Y ahí chéri 
[94m[1mTere            :[0m Lo recuerdo a ti 
[94m[1mTere            :[0m Ah no amiga 
[94m[1mTere            :[0m Ya atenanar todos? 
[94m[1mTere            :[0m 😱😱😱😱 
[94m[1mTere            :[0m Muchas gracias por tus audios 
[94m[1mTere            :[0m Los valoro un montón 
[94m[1mTere            :[0m Te amo 
[94m[1mTere            :[0m Le capido damos 
[94m[1mTere            :[0m Una cosa 
[94m[1mTere            :[0m Y hubieran despirme que todo es posible 
[95m[1mVivi            :[0m Cero acostuón? 
[95m[1mVivi            :[0m Yo también 
[95m[1mVivi            :[0m Me puse a tu Ángela Jajajaja 
[95m[1mVivi            :[0m El primer lagos? 
[95m[1mVivi            :[0m Que fallarle todas estas historias 
[95m[1mVivi            :[0m Quiero escuchad 
[95m[