## Install dependencies

In [13]:
!pip install tf-nightly-gpu-2.0-preview==2.0.0.dev20190413

import tensorflow as tf
print(tf.__version__)

2.0.0-dev20190413


In [14]:
!python --version

Python 3.6.7


## Integrate Google Drive

In [0]:
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive


## Explore dataset

In [0]:
%cd /gdrive/Team\ Drives/umayux/Research/NLP/humor_detection

/gdrive/Team Drives/umayux/Research/NLP/humor_detection


In [0]:
import pandas as pd
data = pd.read_csv('./data/haha_2019_train.csv')
display(data[:4])

fo = open('./data/dataset.tsv', 'w')
for ix,i in data.iterrows():
    if i['is_humor'] == 0:
        fo.write(i['text'].strip().replace('\n', ' ').replace('\r', '') + '\t' + str(i['is_humor'])+'\n' )
    elif i['funniness_average']>=2:
        fo.write(i['text'].strip().replace('\n', ' ').replace('\r', '') + '\t' + str(round( i['is_humor']) )+'\n' )

fo.close()

!wc -l ./data/dataset.tsv

Unnamed: 0,id,text,is_humor,votes_no,votes_1,votes_2,votes_3,votes_4,votes_5,funniness_average
0,705196579758583809,Niveles de retraso mental: \r\n\r\n— Bajo.\r\n...,1,1,2,2,0,0,0,1.5
1,678040651817213952,"—Vamos Luke desenfunda tu sable, demuestra tu ...",1,1,3,0,1,0,0,1.5
2,546750892213829633,"- ¿Te ofrezco algo?, ¿Agua, café, mi corazón, ...",1,0,2,1,0,1,1,2.6
3,965807211292364801,No se porqué me hago la cabeza deooos,0,3,0,0,0,0,0,


20035 ./data/dataset.tsv


In [0]:
!head ./data/dataset.tsv

- ¿Te ofrezco algo?, ¿Agua, café, mi corazón, mi vida entera? -Agua está bien -Pero yo creí que... - ¡AGUA DIJE!	1
No se porqué me hago la cabeza deooos	0
Quisiera saber que hago durante la siesta de la cual me levanto más cansado que cuando me acosté a dormir.	0
La persona que te dice que no se arrepiente de nada en la vida, o no toma alcohol, o no lleva el celular cuando lo hace.	0
—Buenas don Pepe, ¿me vende un litro de leche? —¿Entera? —No, si quiere tómese un vasito pinche viejo abusivo...	1
Meeee aburro	0
Macri le dijo las gordas que usar calzas está bien.	0
JAVIER CHICALITO HERNANDEZ *7	0
Conducir a Cerro Navia, compartir información de la carretera en tiempo real con wazers en mi zona. ETA 11:20 PM usando @waze - Conducció...	0
#LosNiñosBonitos le ponen pausa al videojuego para escuchar lo que dicen #LasNiñasBonitas.	0


## Update Git Repository 

In [0]:
!git pull

/gdrive/Team Drives/umayux/Research/NLP/chatbot/transformer
Already up to date.


## Train the model

In [0]:
!rm -r ./data/tokenizer* ./data/ckpt-* ./data/checkpoint

rm: cannot remove './data/tokenizer*': No such file or directory
rm: cannot remove './data/ckpt-*': No such file or directory
rm: cannot remove './data/checkpoint': No such file or directory


In [0]:
%cd /gdrive/Team\ Drives/umayux/Research/NLP/humor_detection

import tensorflow_datasets as tfds
import tensorflow as tf

from src.optimizer import CustomSchedule, loss_function
from src.dataset import Dataset
from src.model import Transformer
import time
from src.masking import create_masks
import pickle
from sklearn.metrics import classification_report

tf.keras.backend.clear_session() 

MAX_LENGTH=120
BUFFER_SIZE=20000
BATCH_SIZE=64
EPOCHS=100
num_heads=8
num_layers=4
d_model=64
dff=264
vocab_dim=20000
dropout_rate=0.1
test_partition=0.2
dataset_file="./data/dataset.tsv"
checkpoint_path="./data/"
retrain=False

# Build the dataset for training validation
dataset = Dataset(filename=dataset_file, vocab_dim=vocab_dim,max_length=MAX_LENGTH)
dataset.build_train_test(test=test_partition)
train_examples, val_examples = dataset.format_train_test()

if retrain:
    
    # loading tokenizers for future predictions
    with open(checkpoint_path + "/tokenizer_source.pickle", "rb") as handle:
        tokenizer_source = pickle.load(handle)

    with open(checkpoint_path + "/tokenizer_target.pickle", "rb") as handle:
        tokenizer_target = pickle.load(handle)
    
    # update dataset class with previous data
    dataset.tokenizer_source = tokenizer_source
    dataset.tokenizer_target = tokenizer_target
    
else:        
    tokenizer_source, tokenizer_target = dataset.tokenizer(train_examples)


train_dataset = train_examples.map(dataset.tf_encode)
train_dataset = train_dataset.filter(dataset.filter_max_length)
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1])
)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

val_dataset = val_examples.map(dataset.tf_encode)
val_dataset = val_dataset.filter(dataset.filter_max_length).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1])
)

input_vocab_size = tokenizer_source.vocab_size + 2
target_vocab_size = tokenizer_target.vocab_size + 2

# Setup the learning rate and optimizer
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9
)

train_loss = tf.keras.metrics.Mean(name="train_loss")
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name="train_accuracy"
)

test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name="test_accuracy"
)
test_loss = tf.keras.metrics.Mean(name="test_loss")
test_precision = tf.metrics.Precision(name="test_precision")
test_recall = tf.metrics.Recall(name='test_recall')

# setup Transformer Model
transformer = Transformer(
    num_layers,
    d_model,
    num_heads,
    dff,
    input_vocab_size,
    target_vocab_size,
    dropout_rate
)

# setup checkpoints
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint and retrain:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")
else:
    print("Initializing from scratch.")

# saving tokenizers
with open(checkpoint_path + "/tokenizer_source.pickle", "wb") as handle:
    pickle.dump(tokenizer_source, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(checkpoint_path + "/tokenizer_target.pickle", "wb") as handle:
    pickle.dump(tokenizer_target, handle, protocol=pickle.HIGHEST_PROTOCOL)

# define training function step
@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        inp, tar_inp
    )

    with tf.GradientTape() as tape:
        predictions, _ = transformer(
            inp,
            tar_inp,
            True,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)

    

def test_acc(batch=32, test_dataset=[], transformer=[]):
    for (batch, (inp, tar)) in enumerate(test_dataset):
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]
        
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            inp, tar_inp
        )
        
        predictions, _ = transformer(
            inp,
            tar_inp,
            True,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )
        
        test_accuracy(tar_real, predictions)
        
#         test_precision.update(tar_real, predictions)
#         test_recall(tar_real, predictions)
#         print(tar_real)
#         print(predictions)
        
        test_loss(loss_function(tar_real, predictions))


best_test_acc = 0

# training loop
for epoch in range(EPOCHS):
    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()

    # inp -> portuguese, tar -> english
    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)
        if batch % 500 == 0:
            print(
                "Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}".format(
                    epoch + 1,
                    batch,
                    train_loss.result(),
                    train_accuracy.result(),
                )
            )

    print(
        "Epoch {} Train Loss {:.4f} Accuracy {:.4f}".format(
            epoch + 1, train_loss.result(), train_accuracy.result()
        )
    )
        
    # Perform accuracy over the test dataset
    test_accuracy.reset_states()
    test_precision.reset_states()
    test_recall.reset_states()
    
    test_acc(batch=32, test_dataset=val_dataset, transformer=transformer)
    print(
        "Epoch {} Test Loss {:.4f} Accuracy {:.4f} Precision {:.4f} Recall {:.4f}".format(
            epoch + 1, test_loss.result(), test_accuracy.result(), test_precision.result(), test_recall.result()
        )
    )   
    
    if best_test_acc < test_accuracy.result():
        ckpt_save_path = ckpt_manager.save()
        print(
            "Saving checkpoint for epoch {} at {}".format(
                epoch + 1, ckpt_save_path
            )
        )
        best_test_acc = test_accuracy.result()
        
    print("Time taken for 1 epoch: {} secs\n".format(time.time() - start))


/gdrive/Team Drives/umayux/Research/NLP/humor_detection


W0424 22:59:09.049007 140690755405568 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0424 22:59:09.052052 140690755405568 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0424 22:59:09.058568 140690755405568 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0424 22:59:09.065018 140690763798272 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0424 22:59:09.068469 140690755405568 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string


Initializing from scratch.
Epoch 1 Batch 0 Loss 5.2434 Accuracy 0.0000
Epoch 1 Train Loss 2.5531 Accuracy 0.5424
Epoch 1 Test Loss 0.8750 Accuracy 0.8679 Precision 0.0000 Recall 0.0000
Saving checkpoint for epoch 1 at ./data/ckpt-1
Time taken for 1 epoch: 343.2751989364624 secs

Epoch 2 Batch 0 Loss 0.8177 Accuracy 0.8828
Epoch 2 Train Loss 0.4247 Accuracy 0.9073
Epoch 2 Test Loss 0.5496 Accuracy 0.9252 Precision 0.0000 Recall 0.0000
Saving checkpoint for epoch 2 at ./data/ckpt-2
Time taken for 1 epoch: 307.6645905971527 secs

Epoch 3 Batch 0 Loss 0.1942 Accuracy 0.9453
Epoch 3 Train Loss 0.1613 Accuracy 0.9407
Epoch 3 Test Loss 0.4239 Accuracy 0.9296 Precision 0.0000 Recall 0.0000
Saving checkpoint for epoch 3 at ./data/ckpt-3
Time taken for 1 epoch: 34.73602604866028 secs

Epoch 4 Batch 0 Loss 0.1461 Accuracy 0.9453
Epoch 4 Train Loss 0.1168 Accuracy 0.9558
Epoch 4 Test Loss 0.3646 Accuracy 0.9300 Precision 0.0000 Recall 0.0000
Saving checkpoint for epoch 4 at ./data/ckpt-4
Time take

In [0]:
# m = tf.keras.metrics.Precision()
# m.update_state([[49, 50], [49, 50], [49, 50], [49, 50]], [ [0, 0.9], [0.5, 0.01], [0.3, 0,5] , [0.9, 0.0] ])

# threshold values are [0 - 1e-7, 0.5, 1 + 1e-7]
# tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2]
# recall = [1, 0.5, 0], fp_rate = [1, 0, 0]
# auc = ((((1+0.5)/2)*(1-0))+ (((0.5+0)/2)*(0-0))) = 0.75

# print('Final result: ', m.result().numpy()) 

In [0]:
real = tf.Variable([[0,33], [1,33]], dtype=tf.int64)
pred = tf.Variable([[[0.1, 0.2, 0.4],[0.1, 0.9, 1]], [[0.1, 0.2,1],[0.1, 0.99, 0.8]]], dtype=tf.float32)

real[:,0]
pred[:,0,1]

m = tf.keras.metrics.Precision()
m.update_state(real[:,0], pred[:,0,1])

m.result().numpy()

0.0

## Test the model

In [0]:
import predict

model = predict.PredictModel(
    MAX_LENGTH=40,
    BUFFER_SIZE=20000,
    BATCH_SIZE=64,
    num_heads=8,
    num_layers=4,
    d_model=128,
    dff=512,
    dropout_rate=0.1,
    checkpoint_path="./data/banco/"
)

model.load()


Latest checkpoint restored!!


In [0]:
sentence = "buenos dias, tengo un problema serio"
model.predict(sentence)

Pregunta: buenos dias, tengo un problema serio
Respuesta: ¡


KeyError: ignored

<Figure size 1152x576 with 0 Axes>

## Predict examples

In [7]:
%cd /gdrive/Team\ Drives/umayux/Research/NLP/humor_detection

import tensorflow_datasets as tfds
import tensorflow as tf
import src.dataset as dt
from src.optimizer import CustomSchedule, loss_function
from src.model import Transformer
import time
from src.masking import create_masks
import pickle
import matplotlib.pyplot as plt

class Model():
    def __init__(self):
        pass

    def load(
        self,
        MAX_LENGTH=120,
        num_heads=8,
        num_layers=4,
        d_model=64,
        dff=264,
        dropout_rate=0.1,
        checkpoint_path="./data/",
        beta_1=0.9,
        beta_2=0.98,
        epsilon=1e-9
        ):
        
        self.MAX_LENGTH = MAX_LENGTH

        # loading tokenizers for future predictions
        self.tokenizer_source = pickle.load(open(checkpoint_path + '/tokenizer_source.pickle', 'rb'))
        self.tokenizer_target = pickle.load(open(checkpoint_path + '/tokenizer_target.pickle', 'rb'))

        input_vocab_size = self.tokenizer_source.vocab_size + 2
        target_vocab_size = self.tokenizer_target.vocab_size + 2

        learning_rate = CustomSchedule(d_model)
        self.optimizer = tf.keras.optimizers.Adam(
            learning_rate, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon
        )

        self.transformer = Transformer(
            num_layers,
            d_model,
            num_heads,
            dff,
            input_vocab_size,
            target_vocab_size,
            dropout_rate,
        )

        ckpt = tf.train.Checkpoint(transformer=self.transformer, optimizer=self.optimizer)
        ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

        # if a checkpoint exists, restore the latest checkpoint.
        if ckpt_manager.latest_checkpoint:
            ckpt.restore(ckpt_manager.latest_checkpoint)
            print("Latest checkpoint restored!!")
        else:
            print("Initializing from scratch.")
    
    def evaluate(self, inp_sentence):
        start_token = [self.tokenizer_source.vocab_size]
        end_token = [self.tokenizer_source.vocab_size + 1]

        # inp sentence is portuguese, hence adding the start and end token
        inp_sentence = start_token + self.tokenizer_source.encode(inp_sentence) + end_token
        encoder_input = tf.expand_dims(inp_sentence, 0)

        # as the target is english, the first word to the transformer should be the
        # english start token.
        decoder_input = [self.tokenizer_target.vocab_size]
        output = tf.expand_dims(decoder_input, 0)

        for i in range(self.MAX_LENGTH):
            enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = self.transformer(encoder_input, 
                                                     output,
                                                     False,
                                                     enc_padding_mask,
                                                     combined_mask,
                                                     dec_padding_mask)

        # select the last word from the seq_len dimension
        predictions = predictions[: ,-1:, :]  # (batch_size, 1, vocab_size)

        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # return the result if the predicted_id is equal to the end token
        if tf.equal(predicted_id, self.tokenizer_target.vocab_size+1):
            return tf.squeeze(output, axis=0), attention_weights

        # concatentate the predicted_id to the output which is given to the decoder
        # as its input.
        output = tf.concat([output, predicted_id], axis=-1)

        return tf.squeeze(output, axis=0), attention_weights
    
    
    def plot_attention_weights(self, attention, sentence, result, layer):
        fig = plt.figure(figsize=(30, 38))

        sentence = self.tokenizer_source.encode(sentence)

        attention = tf.squeeze(attention[layer], axis=0)

        for head in range(attention.shape[0]):
            ax = fig.add_subplot(8, 1, head+1)

        # plot the attention weights
        ax.matshow(attention[head][:-1, :], cmap='viridis')

        fontdict = {'fontsize': 10}

        ax.set_xticks(range(len(sentence)+2))
        ax.set_yticks(range(len(result)))

        ax.set_ylim(len(result)-1.5, -0.5)

        ax.set_xticklabels(
            ['<start>']+[self.tokenizer_source.decode([i]) for i in sentence]+['<end>'], 
            fontdict=fontdict, rotation=90)

        ax.set_yticklabels([self.tokenizer_target.decode([i]) for i in result 
                            if i < self.tokenizer_target.vocab_size], 
                           fontdict=fontdict)

        ax.set_xlabel('Head {}'.format(head+1))

        plt.tight_layout()
        plt.show()
        

    def query(self, sentence, plot=''):
        result, attention_weights = self.evaluate(sentence)

        predicted_sentence = self.tokenizer_target.decode([i for i in result 
                                                if i < self.tokenizer_target.vocab_size])  

        print('Pregunta: {}'.format(sentence))
        print('Respuesta UmyBot: {}'.format(predicted_sentence))

        if plot:
            self.plot_attention_weights(attention_weights, sentence, result, plot)

        return predicted_sentence
    
    
    
model = Model()
model.load()
model.query('—Buenas don Pepe, ¿me vende un litro de leche? —¿Entera? —No, si quiere tómese un vasito pinche viejo abusivo')

/gdrive/Team Drives/umayux/Research/NLP/humor_detection
Latest checkpoint restored!!
Pregunta: —Buenas don Pepe, ¿me vende un litro de leche? —¿Entera? —No, si quiere tómese un vasito pinche viejo abusivo
Respuesta UmyBot: 1


'1'

In [0]:
import pandas as pd
data = pd.read_csv('./data/dataset.tsv.test', sep='\t', names=['source', 'target'])



In [0]:
pred = []
label = []

for ix,i in data.iterrows():
    r = i['target']
    r = int(r)
    
    if ix == 500: break
    p = translate(
        i['source']
    )
    
    p = int(p)
    if p>=1: p=1
    if r>=1: r=1
    
    pred.append(p)
    label.append(r)
    
    print("Respuesta Humano: {}".format(r))
    print('\n\n')
    
from sklearn.metrics import classification_report
print(classification_report(label, pred))

Pregunta: —Buenas don Pepe, ¿me vende un litro de leche? —¿Entera? —No, si quiere tómese un vasito pinche viejo abusivo...
Respuesta UmyBot: 1
Respuesta Humano: 1



Pregunta: Meeee aburro
Respuesta UmyBot: 0
Respuesta Humano: 0



Pregunta: Ya apronté él bolso
Respuesta UmyBot: 0
Respuesta Humano: 0



Pregunta: Ah no, bien.
Respuesta UmyBot: 0
Respuesta Humano: 0



Pregunta: WhatsApp cayó varias veces en 2015 y vos todavía no caes que nadie te soporta.
Respuesta UmyBot: 0
Respuesta Humano: 1



Pregunta: —Bienvenido a los X-Men, ¿cuál es tu poder?  —Creo regresaré con mi Ex —Muy bien, te llamaremos «Bestia».
Respuesta UmyBot: 1
Respuesta Humano: 1



Pregunta: Tomar champaña después de lavarse los dientes. Mas rico
Respuesta UmyBot: 1
Respuesta Humano: 0



Pregunta: No se si estoy cruda, peda, viva, arriba, abajo, cintura sola, la media vuelta, despacito, quiero respirar tu cuello deeeees pa ci to.
Respuesta UmyBot: 0
Respuesta Humano: 0



Pregunta: Necesito bajar 20 kilos o 10 pa

In [0]:
# label

In [0]:
# translate('Si no se levantaron de buen humor, vuélvanse a dormir, que gente amargada y buena para nada ya hay muchas.', plot="decoder_layer4_block2")
