# Auto-encoders

Até o momento, trabalhamos com a ideia de encontrar mapeamentos de textos para classes. Para isso, passamos por uma etapa intermediária, que é encontrar uma representação para os textos. Agora, vamos tentar encontrar representações intermediárias a partir de dados não rotulados, isto é, vamos reduzir a dimensionalidade dos dados de entrada buscando minimizar o erro de reconstrução.

In [97]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from keras.layers import Input, Dense, Activation, TextVectorization, RepeatVector, Embedding, GlobalAveragePooling1D, LSTM
from keras.models import Model
import tensorflow as tf

import plotly.express as px

## Exercício 1
**Objetivo: usar um auto-encoder linear (PCA)**

1. Leia e interprete o código abaixo e explique o que está acontecendo.
1. O que está sendo plotado?
1. Por que esse plot não tem o formato de linha que vimos até agora? 

In [22]:
# Load the IMDB reviews and encode each review as a fixed-width multi-hot
# vector over the `vocab_size` tokens kept by the vectorizer.
df = pd.read_csv('datasets/IMDB Dataset.csv')
reviews = df['review']
vocab_size = 1000
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    pad_to_max_tokens=True,
    output_mode='multi_hot',
)
# The layer must be adapted to the corpus before it can vectorize it.
vectorize_layer.adapt(reviews)
X = vectorize_layer(reviews)

In [25]:
def pca_autoencoder(vocab_size):
    """Build a linear autoencoder with a 2-unit bottleneck.

    Neither Dense layer is given an activation, so the whole model is a
    composition of two affine maps (the "PCA-like" autoencoder of the
    exercise).

    Args:
        vocab_size: Width of the input vectors and of the reconstruction.

    Returns:
        A ``(autoencoder, encoder)`` tuple. Both models share the same input
        and the same 'encoder' layer; the first outputs the reconstruction,
        the second outputs the 2-D bottleneck.
    """
    inputs = Input(shape=(vocab_size,))
    bottleneck = Dense(2, name='encoder')(inputs)
    reconstruction = Dense(vocab_size, name='decoder')(bottleneck)
    full_model = Model(inputs, reconstruction)
    encoder_model = Model(inputs, bottleneck)
    return full_model, encoder_model


# Build the linear autoencoder together with its encoder-only view.
pca, enc = pca_autoencoder(vocab_size)
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
pca.summary()
pca.compile(loss='mse')
# Autoencoder training: the input doubles as the reconstruction target.
history = pca.fit(X, X, epochs=5, verbose=1)

Model: "model_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 1000)]            0         
                                                                 
 encoder (Dense)             (None, 2)                 2002      
                                                                 
 decoder (Dense)             (None, 1000)              3000      
                                                                 
Total params: 5,002
Trainable params: 5,002
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [26]:
# Project every document into the 2-D latent space; convert to a NumPy array
# so the columns stored in the DataFrame are plain arrays, not framework tensors.
X_enc = np.asarray(enc(X))
df['enc_x1'] = X_enc[:, 0]
df['enc_x2'] = X_enc[:, 1]

# Plotly renders the chart on its own, so no matplotlib figure is opened here
# (an unused plt.figure() only emits an empty "0 Axes" figure).
fig = px.scatter(
    df,
    x="enc_x1",
    y="enc_x2",
    color="sentiment",
    title="Como os documentos se espalham?",
    width=600,
    height=600,
)
fig.show()

<Figure size 640x480 with 0 Axes>

In [48]:
def encoder(vocab_size, target_size):
    """Build the encoder half: one activation-free Dense layer.

    Args:
        vocab_size: Width of the input vectors.
        target_size: Number of units in the latent representation.

    Returns:
        A Model mapping a ``(vocab_size,)`` input to the output of the Dense
        layer named 'encoder'.
    """
    inputs = Input(shape=(vocab_size,))
    latent = Dense(target_size, name='encoder')(inputs)
    return Model(inputs, latent)

def decoder(latent_size, target_size):
    """Build the decoder half: one activation-free Dense layer.

    Args:
        latent_size: Width of the latent vectors fed to the decoder.
        target_size: Number of units in the reconstruction.

    Returns:
        A Model mapping a ``(latent_size,)`` input to the output of the Dense
        layer named 'decoder'.
    """
    latent_in = Input(shape=(latent_size,))
    reconstruction = Dense(target_size, name='decoder')(latent_in)
    return Model(latent_in, reconstruction)

def autoencoder(encoder, decoder):
    """Chain an encoder model and a decoder model into one trainable model.

    Args:
        encoder: Model whose first layer is its input layer.
        decoder: Model that accepts the encoder's output.

    Returns:
        A Model that feeds its input through ``encoder`` and then ``decoder``.
        The sub-models are reused, not copied, so training the returned model
        updates the weights of ``encoder`` and ``decoder`` as well.
    """
    # The input layer reports its shape as [(batch, features...)]. Input()
    # expects the shape WITHOUT the batch dimension, so drop the leading
    # entry; otherwise the outer model is declared with one extra axis.
    in_shape = encoder.layers[0].input_shape[0][1:]
    inputs = Input(shape=in_shape)
    return Model(inputs, decoder(encoder(inputs)))

In [51]:
# Assemble a 2-D bottleneck autoencoder from separate encoder/decoder models.
enc = encoder(vocab_size=vocab_size, target_size=2)
dec = decoder(latent_size=2, target_size=vocab_size)
ae = autoencoder(enc, dec)
ae.compile(loss='mse')

# Print the first rows of the encoder kernel before and after fitting `ae`,
# to check whether training the composed model also changes the weights
# held by the standalone `enc` model.
encoder_kernel = enc.get_layer('encoder').weights[0]
print(encoder_kernel[0:5])
history = ae.fit(X, X, epochs=5, verbose=1)
print(encoder_kernel[0:5])

tf.Tensor(
[[ 0.04893501 -0.02284982]
 [-0.01631023 -0.04097202]
 [ 0.0634523   0.05082083]
 [ 0.05089779 -0.03052371]
 [-0.016789    0.04058132]], shape=(5, 2), dtype=float32)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
tf.Tensor(
[[ 0.0548864   0.0464124 ]
 [-0.00431721  0.00223953]
 [-0.0010277   0.00176327]
 [-0.0022665  -0.00522423]
 [-0.01764113  0.0007823 ]], shape=(5, 2), dtype=float32)


<Figure size 640x480 with 0 Axes>

In [60]:
def encoder(vocab_size, target_size):
    """Build the encoder half: one Dense layer with a sigmoid activation.

    Args:
        vocab_size: Width of the input vectors.
        target_size: Number of units in the latent representation.

    Returns:
        A Model mapping a ``(vocab_size,)`` input to ``target_size`` sigmoid
        activations.
    """
    inputs = Input(shape=(vocab_size,))
    latent = Dense(target_size, activation='sigmoid')(inputs)
    return Model(inputs, latent)

def decoder(latent_size, target_size):
    """Build the decoder half: one Dense layer with a sigmoid activation.

    Args:
        latent_size: Width of the latent vectors fed to the decoder.
        target_size: Number of units in the reconstruction.

    Returns:
        A Model mapping a ``(latent_size,)`` input to ``target_size`` sigmoid
        activations.
    """
    latent_in = Input(shape=(latent_size,))
    reconstruction = Dense(target_size, activation='sigmoid')(latent_in)
    return Model(latent_in, reconstruction)

def autoencoder(encoder, decoder):
    """Chain an encoder model and a decoder model into one trainable model.

    Args:
        encoder: Model whose first layer is its input layer.
        decoder: Model that accepts the encoder's output.

    Returns:
        A Model that feeds its input through ``encoder`` and then ``decoder``.
        The sub-models are reused, not copied, so training the returned model
        updates the weights of ``encoder`` and ``decoder`` as well.
    """
    # The input layer reports its shape as [(batch, features...)]. Input()
    # expects the shape WITHOUT the batch dimension, so drop the leading
    # entry; otherwise the outer model is declared with one extra axis.
    in_shape = encoder.layers[0].input_shape[0][1:]
    inputs = Input(shape=in_shape)
    return Model(inputs, decoder(encoder(inputs)))

# Same 2-D bottleneck experiment, now with the sigmoid encoder/decoder pair.
enc = encoder(vocab_size=vocab_size, target_size=2)
dec = decoder(latent_size=2, target_size=vocab_size)
ae = autoencoder(enc, dec)
ae.compile(loss='mse')
# The input doubles as the reconstruction target.
history = ae.fit(
    X,
    X,
    epochs=15,
    verbose=1,
)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [63]:
# Project every document into the 2-D latent space; convert to a NumPy array
# so the columns stored in the DataFrame are plain arrays, not framework tensors.
X_enc = np.asarray(enc(X))
df['enc_x1'] = X_enc[:, 0]
df['enc_x2'] = X_enc[:, 1]

# Plotly renders the chart on its own, so no matplotlib figure is opened here
# (an unused plt.figure() only emits an empty "0 Axes" figure).
fig = px.scatter(
    df,
    x="enc_x1",
    y="enc_x2",
    color="sentiment",
    title="Como os documentos se espalham?",
    width=600,
    height=600,
)
fig.show()

<Figure size 640x480 with 0 Axes>

In [144]:
def encoder(seq_len, latent_dim, vocab_size):
    """Build an LSTM encoder over sequences of ``vocab_size``-wide vectors.

    Args:
        seq_len: Number of time steps in each input sequence.
        latent_dim: Number of LSTM units.
        vocab_size: Width of each time step's input vector.

    Returns:
        A Model mapping a ``(seq_len, vocab_size)`` input to the list
        ``[output, state_h, state_c]`` produced by the LSTM with
        ``return_state=True``.
    """
    sequence_in = Input(shape=(seq_len, vocab_size))
    lstm = LSTM(latent_dim, return_state=True)
    output, state_h, state_c = lstm(sequence_in)
    return Model(sequence_in, [output, state_h, state_c])

def decoder(seq_len, latent_dim, vocab_size):
    """Build an LSTM decoder that expands a latent vector back to a sequence.

    Args:
        seq_len: Number of time steps to generate.
        latent_dim: Width of the latent vector and of both LSTM states.
        vocab_size: Number of classes in the per-step softmax output.

    Returns:
        A Model taking ``[input_x, input_h, input_c]`` (each of width
        ``latent_dim``) and returning per-time-step softmax distributions
        over ``vocab_size`` classes.
    """
    # shape must be a tuple: "(latent_dim)" is just an int, so the trailing
    # comma is required, matching the two state inputs below.
    input_x = Input(shape=(latent_dim,))
    input_h = Input(shape=(latent_dim,))
    input_c = Input(shape=(latent_dim,))
    # Feed the same latent vector at every time step; the encoder's final
    # states seed the decoder LSTM.
    x = RepeatVector(seq_len)(input_x)
    x = LSTM(latent_dim, return_sequences=True)(x, initial_state=[input_h, input_c])
    x = Dense(vocab_size, activation='softmax')(x)
    return Model([input_x, input_h, input_c], x)

def autoencoder(encoder, decoder):
    """Chain an encoder model and a decoder model into one model.

    Args:
        encoder: Model whose first layer is its input layer.
        decoder: Model that accepts whatever ``encoder`` returns.

    Returns:
        A Model that feeds its input through ``encoder`` and then ``decoder``.
    """
    # Per-sample input shape: the slice drops the leading batch entry.
    in_shape = encoder.layers[0].input_shape[0][1:]
    print(in_shape)
    inputs = Input(shape=in_shape)
    encoded = encoder(inputs)
    decoded = decoder(encoded)
    return Model(inputs, decoded)

# Sequence autoencoder: 200 time steps, 5 LSTM units, 1000-wide one-hot steps.
enc = encoder(seq_len=200, latent_dim=5, vocab_size=1000)
enc.summary()

dec = decoder(seq_len=200, latent_dim=5, vocab_size=1000)
dec.summary()

ae = autoencoder(enc, dec)
ae.compile(loss='categorical_crossentropy')
ae.summary()


Model: "model_119"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_194 (InputLayer)      [(None, 200, 1000)]       0         
                                                                 
 lstm_60 (LSTM)              [(None, 5),               20120     
                              (None, 5),                         
                              (None, 5)]                         
                                                                 
Total params: 20,120
Trainable params: 20,120
Non-trainable params: 0
_________________________________________________________________
Model: "model_120"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_195 (InputLayer)         [(None, 5)]          0           []                               
 

In [150]:
# Work on a random subset of 1000 reviews.
df = pd.read_csv('datasets/IMDB Dataset.csv')
df = df.sample(1000)
reviews = df['review']
vocab_size = 1000
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=200,
)
vectorize_layer.adapt(reviews)
# Integer token ids first, then one one-hot vector per token position.
token_ids = vectorize_layer(reviews)
X = tf.one_hot(token_ids, vocab_size)
print(X.shape)


(1000, 200, 1000)


In [151]:
# Train the sequence autoencoder to reproduce its own one-hot input.
history = ae.fit(
    X,
    X,
    epochs=1500,
    verbose=1,
)

Epoch 1/1500
Epoch 2/1500
Epoch 3/1500
Epoch 4/1500
Epoch 5/1500
Epoch 6/1500
Epoch 7/1500
Epoch 8/1500
 2/32 [>.............................] - ETA: 7s - loss: 4.9336