In [1]:
# https://keras.io/examples/nlp/text_classification_with_transformer/
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np



from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
tf.__version__

'2.2.0-rc3'

In [2]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
# s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
# s0 = df_to_work['review_text'][1009]
# tknzr.tokenize(s0.lower())
# s0.lower().split()

In [3]:
# Definindo as variáveis do projeto
vocab_size = 200000  # Considerar 200k palavras
maxlen = 200  # Considerar apenas as 100 primeiras palavras do texto da review

embed_dim = 50 # tamanho do Embedding de cada token ( também do word2vec da NILC)
num_heads = 2  # N. de cabeças de atenção
ff_dim = 32   # tamanho da camada oculta nas redes feed forward dentro do transformer

# Path para o arquivo de dados da b2w
# B2W_DATAFILE = "/home/wseidel/workspaces/usp/b2w-reviews01/B2W-Reviews01.csv"
B2W_DATAFILE = "/home/wesley/workspaces/usp/data/b2w/B2W-Reviews01.csv"
# B2W_DATAFILE = "/home/wseidel/workspaces/usp/b2w-reviews01/B2W-10k.csv"


# Path para o arquivo de dados de embeddings do NILC
# NILC_W2V_DATAFILE = "/home/wseidel/workspaces/usp/NILC/word2vec_200k.txt"
NILC_W2V_DATAFILE = "/home/wesley/workspaces/usp/data/nilc/word2vec_200k.txt"

# Quantidade de epocas para o treino
QNT_EPOCAS_A_TREINAR = 2

In [4]:
# Carregar dados a serem analisados
b2wCorpus = pd.read_csv(B2W_DATAFILE, sep=';', usecols=["review_text", "overall_rating"])

# Carregar o Word2Vec do NILC
# model_w2v = KeyedVectors.load_word2vec_format(NILC_W2V_DATAFILE)

In [17]:
b2wCorpus.groupby(['overall_rating']).count()
# b2wCorpus.describe()

Unnamed: 0_level_0,review_text
overall_rating,Unnamed: 1_level_1
1,27369
2,8389
3,16315
4,32345
5,47955


In [8]:
def train_test_val_split(dataset, train_size=0.6, test_size=0.3, colname_stratify='overall_rating',random_seed=29):
    val_size = 1 - round((train_size + test_size),1)
    split_train_test_size = test_size + val_size

    train, val = train_test_split(dataset, 
                                  test_size=split_train_test_size, 
                                  stratify=dataset[colname_stratify], 
                                  random_state=random_seed)

    test, val = train_test_split(val, 
                                  test_size=val_size/split_train_test_size, 
                                  stratify=val[colname_stratify], 
                                  random_state=random_seed)
    return train.reset_index(), test, val


def sentence_to_nilc_index_token(text, stem=False):
    # Traduzindo os tokens do B2W para o index do NILC
#     tokens = text.lower().split() # Pegar um tokenizer decente...
    tokens = tknzr.tokenize(text.lower())
    tokens = [model_w2v.vocab[t].index if t in model_w2v.vocab else 19999 for t in tokens ]
    return tokens

def sort_by_size(df, col_to_sort):
    df['sentence_length'] = df[col_to_sort].apply(lambda x: len(x))
    df.sort_values(by=['sentence_length'], inplace=True, ignore_index=True)
    return df

def getXY(serieX, serieY, padding_maxlen=50):
    x_train = keras.preprocessing.sequence.pad_sequences(train['review_text_clean'], maxlen=padding_maxlen, padding='post')
    y_train = train['overall_rating']
    return x_train, y_train


class Vectorization:
    def __init__(self, data_to_adapt):
        self.vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=20000,
            output_mode='int', # "int", "binary", "count" or "tf-idf",
            output_sequence_length=50,  # Only valid in INT mode.
        )
        self.vectorize_layer.adapt( data_to_adapt )
        self.modelVectorization = tf.keras.models.Sequential()
        self.modelVectorization.add(tf.keras.Input(shape=(1,), dtype=tf.string))
        self.modelVectorization.add(self.vectorize_layer)
    def predict(self, input_data ):
        return self.modelVectorization.predict(input_data)


In [16]:
# ------ main ----
df_to_work = b2wCorpus

TAMMAX_SENTENCE=50

values_to_retain=[1,2,3,4,5]
df_to_work = df_to_work[df_to_work['overall_rating'].isin(values_to_retain)]
# df_to_work
df_to_work['overall_rating'] = df_to_work.overall_rating.apply(lambda x: x-1)


# Aplicando o sentence_to_nilc_index_token
df_to_work['review_text_clean'] = df_to_work.review_text.apply(lambda x: sentence_to_nilc_index_token(x))

# vectorization = Vectorization( b2wCorpus['review_text'].values )
# df_to_work['review_text_clean'] = df_to_work.review_text.apply(lambda x: vectorization.predict([[x]])[0])


# train, test, val = train_test_val_split(df_to_work, train_size=0.75, test_size=0.15)
train, test, val = train_test_val_split(df_to_work)

sort_by_size(train, 'review_text_clean')


x_train, y_train = getXY(train['review_text_clean'], train['overall_rating'], padding_maxlen=TAMMAX_SENTENCE)
x_test,  y_test  = getXY(test['review_text_clean'], test['overall_rating'], padding_maxlen=TAMMAX_SENTENCE)
x_val,   y_val   = getXY(val['review_text_clean'], val['overall_rating'], padding_maxlen=TAMMAX_SENTENCE)


print("train..:", len(train), round(len(train) / len(df_to_work),3) ) 
print("test...:", len(test), round(len(test) / len(df_to_work),3) )
print("val....:", len(val), round(len(val) / len(df_to_work),3) )
print("--" * 20) 
print("x_train..:", len(x_train[-1]), ) 
print("x_test...:", len(x_test[-1]), ) 
print("x_val....:", len(x_val[-1]), ) 
# train = train.reset_index(drop=True)
# train = train.reset_index(inplace=True)
# train = train.copy()

# df_to_work.groupby
df_to_work.groupby(['overall_rating']).count()
# b2wCorpus.describe()

NameError: name 'model_w2v' is not defined

In [14]:
# vocab = set()
# def sentence_to_nilc_index_token(text, stem=False):
#     # Traduzindo os tokens do B2W para o index do NILC
# #     tokens = text.lower().split() # Pegar um tokenizer decente...
#     tokens = tknzr.tokenize(text.lower())
#     for t in tokens:
#         vocab.add(t)
#     return 1

# df_to_work.review_text.apply(lambda x: sentence_to_nilc_index_token(x))
# vectorization.predict([["acho oque isso é otimo"]])[0]
# df_to_work['review_text_clean'] = df_to_work.review_text.apply(lambda x: vectorization.predict([[x]])[0])

df_to_work.review_text[0:10].apply(lambda x: vectorization.predict([[x]])[0])

0    [48, 1409, 12, 4, 54, 19, 65, 2, 494, 102, 12,...
1    [38, 160, 1, 383, 67, 66, 143, 655, 5, 3128, 0...
2    [2412, 20, 696, 3, 917, 292, 611, 1, 4054, 127...
3    [39, 300, 504, 192, 5, 570, 12, 3344, 1040, 7,...
4    [4, 19, 30, 18, 28, 29, 40, 268, 5, 129, 4, 53...
5    [32, 6, 38, 209, 20, 147, 3128, 43, 284, 3, 38...
6    [6, 600, 15, 12, 114, 6800, 374, 176, 56, 2671...
7    [6, 32, 26, 49, 196, 3150, 11, 737, 12, 495, 0...
8    [2, 507, 3, 2707, 3, 2, 2239, 10, 25, 466, 21,...
9    [39, 6, 69, 30, 94, 3, 4, 40, 125, 14931, 21, ...
Name: review_text, dtype: object

In [52]:
# vocab = set()
# vocab.add(2)
vocab
'produto.comprei qualidade.exatamente'.split('.')

['produto', 'comprei qualidade', 'exatamente']

In [33]:
dados = list(range(19))
lote_size = 3
lote_count = int(np.ceil(len(dados)/ lote_size))
print("qnt dados..:", len(dados))
print("lote size..:", lote_size)
print("lote count..:", lote_count)
for i in range(0,lote_count):
    print(f"Pegando lote {i} de {lote_count}:", end="")
    print(dados[ i*lote_size : i*lote_size+lote_size ] )

qnt dados..: 19
lote size..: 3
lote count..: 7
Pegando lote 0 de 7:[0, 1, 2]
Pegando lote 1 de 7:[3, 4, 5]
Pegando lote 2 de 7:[6, 7, 8]
Pegando lote 3 de 7:[9, 10, 11]
Pegando lote 4 de 7:[12, 13, 14]
Pegando lote 5 de 7:[15, 16, 17]
Pegando lote 6 de 7:[18]


In [34]:
# from keras import Sequential
# from keras.utils import Sequence
# from keras.layers import LSTM, Dense, Masking
# import numpy as np
from tensorflow.keras import layers
# model = tf.keras.Sequential([
from tensorflow import keras

def get_lstm_model(dropout_prob=0.0):
    embedding_layer = model_w2v.get_keras_embedding()
#     embedding_layer.trainable = True

    model = keras.Sequential()
    model.add(layers.Input(shape=(TAMMAX_SENTENCE, )))
    model.add(embedding_layer)
    model.add(layers.LSTM(64))
    model.add(layers.Dropout(dropout_prob))
    model.add(keras.layers.Dense(5, activation='softmax'))
    model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
    return model

model = get_lstm_model()
model.summary()


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 50)            10000000  
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                29440     
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 325       
Total params: 10,029,765
Trainable params: 29,765
Non-trainable params: 10,000,000
_________________________________________________________________


In [35]:
# name = 'm1_lstm_drop0.0'
# model = get_lstm_model(dropout_prob=0.5)
# es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
# mc = ModelCheckpoint('../model_data/' + name + 'best_model.h5', monitor='val_loss', mode='min', save_best_only=True)
# history = model.fit(x_train, y_train, batch_size=64, epochs=100, validation_data=(x_val, y_val), callbacks=[es, mc])
# save_history(history, name)
# model.evaluate(x_test, y_test)
# display_loss_plot(history, name)
# display_acc_plot(history, name)

In [36]:
# Ver lista06

# Ler aqui pro batch generator:
#     https://datascience.stackexchange.com/questions/48796/how-to-feed-lstm-with-different-input-array-sizes

# Seu código aqui

QNT_EPOCAS_TREINO = 10


model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
    x_train, y_train, batch_size=32, epochs=QNT_EPOCAS_TREINO, validation_data=(x_val, y_val)
)

loss, accuracy = model.evaluate(x=x_test,y=y_test)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Epoch 1/10

KeyboardInterrupt: 

# Comparações 

# Dropout 0.0

### embedding_layer.trainable = True
```
Epoch 1/10
2482/2482 [==============================] - 383s 154ms/step - loss: 1.0851 - accuracy: 0.5380 - val_loss: 0.9881 - val_accuracy: 0.5780
Epoch 2/10
2482/2482 [==============================] - 400s 161ms/step - loss: 0.9654 - accuracy: 0.5859 - val_loss: 0.9020 - val_accuracy: 0.6110
Epoch 3/10
2482/2482 [==============================] - 391s 157ms/step - loss: 0.9116 - accuracy: 0.6084 - val_loss: 0.8640 - val_accuracy: 0.6291
Epoch 4/10
2482/2482 [==============================] - 412s 166ms/step - loss: 0.8630 - accuracy: 0.6309 - val_loss: 0.7967 - val_accuracy: 0.6648
Epoch 5/10
2482/2482 [==============================] - 404s 163ms/step - loss: 0.8152 - accuracy: 0.6520 - val_loss: 0.7535 - val_accuracy: 0.6912
Epoch 6/10
2482/2482 [==============================] - 425s 171ms/step - loss: 0.7649 - accuracy: 0.6772 - val_loss: 0.6945 - val_accuracy: 0.7154
Epoch 7/10
2482/2482 [==============================] - 478s 193ms/step - loss: 0.7166 - accuracy: 0.6992 - val_loss: 0.6500 - val_accuracy: 0.7313
Epoch 8/10
2482/2482 [==============================] - 492s 198ms/step - loss: 0.6680 - accuracy: 0.7196 - val_loss: 0.6138 - val_accuracy: 0.7464
Epoch 9/10
2482/2482 [==============================] - 481s 194ms/step - loss: 0.6248 - accuracy: 0.7370 - val_loss: 0.5679 - val_accuracy: 0.7631
Epoch 10/10
2482/2482 [==============================] - 472s 190ms/step - loss: 0.5873 - accuracy: 0.7511 - val_loss: 0.5323 - val_accuracy: 0.7761
2482/2482 [==============================] - 29s 12ms/step - loss: 0.5323 - accuracy: 0.7761
Loss:  0.532257080078125
Accuracy:  0.7761353850364685
```



### embedding_layer.trainable = False
```
Epoch 1/10
2482/2482 [==============================] - 90s 36ms/step - loss: 1.2016 - accuracy: 0.4954 - val_loss: 1.1219 - val_accuracy: 0.5238
Epoch 2/10
2482/2482 [==============================] - 88s 35ms/step - loss: 1.1006 - accuracy: 0.5322 - val_loss: 1.0658 - val_accuracy: 0.5467
Epoch 3/10
2482/2482 [==============================] - 89s 36ms/step - loss: 1.0550 - accuracy: 0.5500 - val_loss: 1.0191 - val_accuracy: 0.5655
Epoch 4/10
2482/2482 [==============================] - 92s 37ms/step - loss: 1.0298 - accuracy: 0.5603 - val_loss: 1.0086 - val_accuracy: 0.5682
Epoch 5/10
2482/2482 [==============================] - 94s 38ms/step - loss: 1.0102 - accuracy: 0.5693 - val_loss: 0.9865 - val_accuracy: 0.5761
Epoch 6/10
2482/2482 [==============================] - 91s 37ms/step - loss: 0.9939 - accuracy: 0.5742 - val_loss: 0.9819 - val_accuracy: 0.5804
Epoch 7/10
2482/2482 [==============================] - 88s 35ms/step - loss: 0.9813 - accuracy: 0.5799 - val_loss: 0.9750 - val_accuracy: 0.5857
Epoch 8/10
2482/2482 [==============================] - 87s 35ms/step - loss: 0.9705 - accuracy: 0.5850 - val_loss: 0.9909 - val_accuracy: 0.5775
Epoch 9/10
2482/2482 [==============================] - 84s 34ms/step - loss: 0.9597 - accuracy: 0.5883 - val_loss: 0.9482 - val_accuracy: 0.5949
Epoch 10/10
2482/2482 [==============================] - 85s 34ms/step - loss: 0.9503 - accuracy: 0.5919 - val_loss: 0.9388 - val_accuracy: 0.5954
2482/2482 [==============================] - 26s 11ms/step - loss: 0.9388 - accuracy: 0.5954

No teste:
Loss:  0.9387642741203308
Accuracy:  0.5953942537307739
```


###  embedding_layer.trainable = False e o UNK sendo -19999
```
Epoch 1/10
2482/2482 [==============================] - 138s 56ms/step - loss: 1.2058 - accuracy: 0.4917 - val_loss: 1.1347 - val_accuracy: 0.5199
Epoch 2/10
2482/2482 [==============================] - 129s 52ms/step - loss: 1.1054 - accuracy: 0.5303 - val_loss: 1.0637 - val_accuracy: 0.5500
Epoch 3/10
2482/2482 [==============================] - 128s 52ms/step - loss: 1.0628 - accuracy: 0.5495 - val_loss: 1.0425 - val_accuracy: 0.5602
Epoch 4/10
2482/2482 [==============================] - 125s 51ms/step - loss: 1.0364 - accuracy: 0.5590 - val_loss: 1.0186 - val_accuracy: 0.5630
Epoch 5/10
 143/2482 [>.............................] - ETA: 1:31 - loss: 1.0195 - accuracy: 0.5660
 ```

###  [embedding_layer.trainable = False] e o [UNK = pos -19999] (nltk tokenizer twitter)
``` 
Epoch 1/10
2482/2482 [==============================] - 187s 75ms/step - loss: 1.1684 - accuracy: 0.5067 - val_loss: 1.0854 - val_accuracy: 0.5410
Epoch 2/10
2482/2482 [==============================] - 172s 69ms/step - loss: 1.0489 - accuracy: 0.5517 - val_loss: 1.0070 - val_accuracy: 0.5755
Epoch 3/10
2482/2482 [==============================] - 169s 68ms/step - loss: 0.9994 - accuracy: 0.5731 - val_loss: 0.9720 - val_accuracy: 0.5846
Epoch 4/10
2482/2482 [==============================] - 129s 52ms/step - loss: 0.9737 - accuracy: 0.5837 - val_loss: 0.9497 - val_accuracy: 0.5915
Epoch 5/10
2482/2482 [==============================] - 87s 35ms/step - loss: 0.9521 - accuracy: 0.5920 - val_loss: 0.9395 - val_accuracy: 0.5960
Epoch 6/10
2482/2482 [==============================] - 86s 35ms/step - loss: 0.9380 - accuracy: 0.5963 - val_loss: 0.9266 - val_accuracy: 0.6064
Epoch 7/10
2482/2482 [==============================] - 119s 48ms/step - loss: 0.9238 - accuracy: 0.6011 - val_loss: 0.8999 - val_accuracy: 0.6130
Epoch 8/10
2482/2482 [==============================] - 132s 53ms/step - loss: 0.9126 - accuracy: 0.6051 - val_loss: 0.8931 - val_accuracy: 0.6152
Epoch 9/10
2482/2482 [==============================] - 119s 48ms/step - loss: 0.9021 - accuracy: 0.6116 - val_loss: 0.8847 - val_accuracy: 0.6161
Epoch 10/10
2482/2482 [==============================] - 125s 50ms/step - loss: 0.8907 - accuracy: 0.6145 - val_loss: 0.8713 - val_accuracy: 0.6226
2482/2482 [==============================] - 30s 12ms/step - loss: 0.8713 - accuracy: 0.6226
Loss:  0.8712893724441528
Accuracy:  0.6226407885551453
```

In [None]:
import matplotlib.pyplot as plt


acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

y_pred[0]
# y_test[0]

In [None]:
# 
plt.plot(epochs, acc, 'r', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

In [None]:
plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()