<a href="https://colab.research.google.com/github/viniciusrpb/cic0269_natural_language_processing/blob/main/lectures/cap15_2_vector_visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Capítulo 15 - Arquiteturas Encoder-Decoder

## Visualização de Vetores

Objetivo:

Visualizar utilizando a técnica [t-Stochastic Distributed Neighbor Embedding (t-SNE)](https://lvdmaaten.github.io/tsne/). O objetivo dessa técnica consiste em projetar um objeto $m$-dimensional como um ponto em um espaço 2D.  

Recomenda-se utilizar a biblioteca [TSNE do sklearn](http://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html).


In [None]:
!pip install tensorflow-datasets
!pip install plotly==4.14.3
!pip install orca

In [39]:
import tensorflow_datasets as tfds
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import SGD,Adam
from keras.utils.np_utils import to_categorical
from keras.models import Sequential,Model
from keras.layers import Dense,Embedding,Bidirectional,LSTM,Activation,Input,Dropout,BatchNormalization
import plotly.express as px
import plotly.graph_objects as go

In [40]:
ds_train = tfds.load('ted_hrlr_translate/pt_to_en', split='train',
                     shuffle_files=True,
                     with_info=True,
                     as_supervised=True)

ds_valid = tfds.load('ted_hrlr_translate/pt_to_en', split='validation',
                     shuffle_files=False,
                     with_info=True,
                     as_supervised=True)

ds_test = tfds.load('ted_hrlr_translate/pt_to_en', split='test',
                    shuffle_files=False,
                    with_info=True,
                    as_supervised=True)

In [41]:
def ds2DataFrame(ds):
    
    dic = {}
    dic['pt'] = []
    dic['en'] = []
    for elem in ds[0]:
        port = elem[0].numpy().decode('utf-8')
        eng = elem[1].numpy().decode('utf-8')
        dic['pt'].append(port)
        dic['en'].append(eng)

    return pd.DataFrame.from_dict(dic)

In [42]:
df_train = ds2DataFrame(ds_train)
df_valid = ds2DataFrame(ds_valid)
df_test = ds2DataFrame(ds_test)

In [43]:
df_train

Unnamed: 0,pt,en
0,"e quando melhoramos a procura , tiramos a únic...","and when you improve searchability , you actua..."
1,mas e se estes fatores fossem ativos ?,but what if it were active ?
2,mas eles não tinham a curiosidade de me testar .,but they did n't test for curiosity .
3,e esta rebeldia consciente é a razão pela qual...,"and this conscious defiance is why i , as an a..."
4,`` `` '' podem usar tudo sobre a mesa no meu c...,you can use everything on the table on me .
...,...,...
51780,"que a ideia louca é mesmo isso , é louca , e c...","that the crazy idea is just that , it is crazy..."
51781,"agora , em ambos os casos , não mandei nenhum ...","now in both cases , i did n't send them home a..."
51782,"na linha pontilhada a vermelho , mostramos qua...","in the dotted red line , we show what the adop..."
51783,"agora , a lagarta não morreu .","now , the caterpillar did n't die ."


In [44]:
num_tokens_encoder = 3
num_tokens_decoder = 3
num_neurons = 128
max_encoder_seq_length = 32
max_decoder_seq_length = 32

Criar o dicionário para as palavras em português

In [45]:
def create_dictionary(df):

    word2index = {}
    word2index['<OOV>'] = 0
    word2index['<BEG>'] = 1
    word2index['<END>'] = 2

    ind = 3
    for text in df:
        for word in text.split():
            if word not in word2index:
                word2index[word] = ind
                ind+=1

    index2word = {}
    for key in word2index:
        value = word2index[key]
        index2word[value] = key

    return word2index,index2word

In [46]:
vocab_pt,index_pt = create_dictionary(df_train['pt'])
vocab_en,index_en = create_dictionary(df_train['en'])

vocab_pt_size = len(vocab_pt)
vocab_en_size = len(vocab_en)

tokenizer_pt = Tokenizer(num_words = vocab_pt_size)
tokenizer_pt.fit_on_texts(df_train['pt'])

tokenizer_en = Tokenizer(num_words = vocab_en_size)
tokenizer_en.fit_on_texts(df_train['en'])

In [47]:
def prepare_data(df):

    X = []
    y = []

    for sentence in df:
        tokens = sentence.split()
        lista_x = ['<BEG>']
        lista_y = []
        for i in range(0,len(tokens)):

            lista_y.append(tokens[i])
            lista_x.append(tokens[i])
            #y.append(tokens[i])
            #X.append(tokens[i])

        lista_y.append('<END>')

        X.append(lista_x)
        y.append(lista_y)
    
    return X,y

In [48]:
X_train_pt,y_train_pt = prepare_data(df_train['pt'])
X_valid_pt,y_valid_pt = prepare_data(df_train['pt'])

X_train_en,y_train_en = prepare_data(df_train['en'])
X_valid_en,y_valid_en = prepare_data(df_train['en'])

In [49]:
y_train_pt[0]

['e',
 'quando',
 'melhoramos',
 'a',
 'procura',
 ',',
 'tiramos',
 'a',
 'única',
 'vantagem',
 'da',
 'impressão',
 ',',
 'que',
 'é',
 'a',
 'serendipidade',
 '.',
 '<END>']

In [50]:
def text2sequences(data,vocab):
    corpus_int = []
    for instance in data:
        instance_int = []
        for term in instance:
            if term in vocab:
                instance_int.append(vocab[term])
            else:
                instance_int.append(vocab['<OOV>'])
        corpus_int.append(instance_int)
    
    return corpus_int

In [51]:
train_pt_sequences = text2sequences(X_train_pt,vocab_pt)
valid_pt_sequences = text2sequences(X_valid_pt,vocab_pt)
#test_pt_sequences = text2sequences(X_test_pt,vocab_pt)

train_en_sequences = text2sequences(X_train_en,vocab_en)
valid_en_sequences = text2sequences(X_valid_en,vocab_en)

In [52]:
trunc_type = 'post'
padding_type = 'post'
max_length = 64

train_pt_padded = pad_sequences(train_pt_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
valid_pt_padded = pad_sequences(valid_pt_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

train_en_padded = pad_sequences(train_en_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
valid_en_padded = pad_sequences(valid_en_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
#test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [53]:
#y_train = text2sequences(y_train_pt,vocab_pt)
#y_train = pad_sequences(y_train, maxlen=max_length, padding=padding_type, truncating=trunc_type)
#y_train_int = to_categorical(y_train)

In [54]:
encoder_inputs = Input(shape=(num_tokens_encoder))
encoder_embedding = Embedding(input_dim = vocab_pt_size+3,output_dim = 32)(encoder_inputs)
encoder = LSTM(num_neurons, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embedding)
encoder_states = [state_h, state_c]
model = Model(inputs=encoder_inputs,outputs=encoder_outputs)

In [55]:
#adam = Adam(learning_rate=0.01)
#model.compile(loss="categorical_crossentropy",optimizer=adam,metrics=['accuracy'])
#history = model.fit(train_pt_padded,y_train,validation_data=(valid_padded,y_valid),batch_size=32,epochs=10)

In [57]:
tsne = TSNE(n_components=2,perplexity=5,learning_rate=100.0,metric='euclidean', init='random')
X_tsne = tsne.fit_transform(word_vectors)

fig = px.scatter(X_tsne, x=0, y=1)#, color=labels.astype(object))
fig.show()

NameError: ignored

Vamos utilizar a classe [Model](https://keras.io/api/models/model/) para construir o modelo completo