<a href="https://colab.research.google.com/github/viniciusrpb/116319_estruturasdedados/blob/main/polarity_classification_attention_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install -q tensorflow

In [38]:
import tensorflow as tf
from keras.models import Model
from keras.layers import Input,Dense, Activation,Dropout, Embedding,Flatten,LSTM,Bidirectional, Layer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.preprocessing import sequence
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from collections import Counter

In [48]:
class SelfAttention(Layer):
    def __init__(self, units):
        super(SelfAttention, self).__init__()
        self.units = units
        self.Wq = Dense(units, use_bias=False)  # Projeção das queries
        self.Wk = Dense(units, use_bias=False)  # Projeção das keys
        self.Wv = Dense(units, use_bias=False)  # Projeção das values

    def call(self, inputs):
        # Projeção para Q, K, V
        Q = self.Wq(inputs)
        K = self.Wk(inputs)
        V = self.Wv(inputs)

        # Produto escalar entre Q e K transposto
        attention_scores = tf.matmul(Q, K, transpose_b=True)

        # Normalização pela raiz quadrada da dimensão de K
        d_k = tf.cast(tf.shape(K)[-1], tf.float32)
        scaled_attention_scores = attention_scores / tf.sqrt(d_k)

        # Aplicação da softmax para obter pesos de atenção
        attention_weights = tf.nn.softmax(scaled_attention_scores, axis=-1)

        # Aplicação dos pesos de atenção sobre V
        attention_output = tf.matmul(attention_weights, V)

        return attention_output

In [18]:
df_train = pd.read_csv('https://raw.githubusercontent.com/viniciusrpb/cic0269_natural_language_processing/main/datasets/corpora/tweets_polarity_classification/twitter-2013train-A.txt', names=["id","polarity","text"],sep='\t',encoding="UTF-8",low_memory = False)
df_valid = pd.read_csv('https://raw.githubusercontent.com/viniciusrpb/cic0269_natural_language_processing/main/datasets/corpora/tweets_polarity_classification/twitter-2013dev-A.txt', names=["id","polarity","text"],sep='\t',encoding="UTF-8",low_memory = False)
df_test = pd.read_csv('https://raw.githubusercontent.com/viniciusrpb/cic0269_natural_language_processing/main/datasets/corpora/tweets_polarity_classification/twitter-2013test-A.txt', names=["id","polarity","text"],sep='\t',encoding="UTF-8",low_memory = False)

df_train.head()

Unnamed: 0,id,polarity,text
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...


In [19]:
df_train.drop(labels=['id'],axis=1)
df_valid.drop(labels=['id'],axis=1)
df_test.drop(labels=['id'],axis=1)

Unnamed: 0,polarity,text
0,positive,"@jjuueellzz down in the Atlantic city, ventnor..."
1,positive,Musical awareness: Great Big Beautiful Tomorro...
2,neutral,On Radio786 100.4fm 7:10 Fri Oct 19 Labour ana...
3,negative,"Kapan sih lo ngebuktiin,jan ngomong doang Susa..."
4,neutral,"Excuse the connectivity of this live stream, f..."
...,...,...
3542,negative,Khaleda Zia's present India visit may have a b...
3543,neutral,"FYI, golf fans: @jameslepp will join Moj on We..."
3544,negative,@__Aniko you think mr.Calle let practice with ...
3545,positive,Don't hide under your desk! It's just a salsa ...


In [20]:
df_train['polarity'] = pd.Categorical(df_train['polarity'])
df_valid['polarity'] = pd.Categorical(df_valid['polarity'])
df_test['polarity'] = pd.Categorical(df_test['polarity'])

X_train = df_train['text']

y_train = df_train['polarity'].cat.codes
y_valid = df_valid['polarity'].cat.codes
y_test = df_test['polarity'].cat.codes

In [22]:
num_classes = 3

y_train_enc = to_categorical(y_train,3)
y_valid_enc = to_categorical(y_valid,3)
y_test_enc = to_categorical(y_test,3)

In [24]:
num_classes = 3

results = Counter()
df_train['text'].str.lower().str.split().apply(results.update)
df_valid['text'].str.lower().str.split().apply(results.update)
total_vocab_size = len(results)

In [28]:
soma = 0

for item in df_train['text']:
  soma += len(item)
media = soma/len(df_train['text'])
print(f'Em média, cada tweet possui {media} palavras.')

Em média, cada tweet possui 117.61730689797604 palavras.


Hiperparametros

In [29]:
vocab_size = 1000 # make the top list of words (common words)
embedding_dim = 40
max_length = 40
bs= 32
trunc_type = 'post'
padding_type = 'pre'
oov_tok = '<OOV>' # OOV = Out of Vocabulary

In [31]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(df_train['text'])

In [32]:
word_index = tokenizer.word_index
len(word_index)

24029

In [33]:
train_sequences = tokenizer.texts_to_sequences(df_train['text'])

valid_sequences = tokenizer.texts_to_sequences(df_valid['text'])

X_train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

X_valid_padded = pad_sequences(valid_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

Verificando se está tudo correto

In [35]:
print(df_train['text'].iloc[3])
print(X_train_padded[3])

Iranian general says Israel\u2019s Iron Dome can\u2019t deal with their missiles (keep talking like that and we may end up finding out)
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   1
 262   1  21   1   1  56  36 591  20 162   1 426 625  57  24   8  35  25
 248  51   1  33]


In [36]:
X_train_padded.shape, X_valid_padded.shape

((9684, 40), (1654, 40))

In [37]:
y_train = to_categorical(y_train)
y_valid = to_categorical(y_valid)

Segue o modelo LSTM

In [49]:
inputs = Input(shape=(max_length,))
embedding = Embedding(input_dim=vocab_size, output_dim=64)(inputs)

lstm_out = Bidirectional(LSTM(64, return_sequences=True))(embedding)

# Aplicando a Self-Attention como camada
attention_out = SelfAttention(units=64)(lstm_out)

# Pooling Global para compactar a saída da atenção


# Camada fully connected para classificação
dense = Dense(64, activation='relu')(attention_out)
dropout = Dropout(0.5)(dense)
outputs = Dense(1, activation='sigmoid')(dropout)

model = Model(inputs=inputs, outputs=outputs)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())

None


In [None]:
from tensorflow.keras.optimizers import SGD,Adam
#sgd = SGD(learning_rate=0.05)
#model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

opt = Adam(learning_rate=0.0001, decay=1e-6)
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

#model.compile(optimizer='adam',
#                   loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
#                   metrics=['accuracy'])

In [None]:
model.fit(X_train_padded,y_train,validation_data=(X_valid_padded,y_valid),epochs=15,batch_size=bs)

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_'+metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, 'val_'+metric])
    plt.figure(figsize=(16, 8))
    plt.subplot(1, 2, 1)
    plot_graphs(history, 'accuracy')
    plt.ylim(None, 1)
    #plt.xticks(range(0,20))
    plt.subplot(1, 2, 2)
    plot_graphs(history, 'loss')
    plt.ylim(0, None)
    #plt.xticks(range(0,20))

In [None]:
y_test

In [None]:
# import classification_report
from sklearn.metrics import classification_report

_, test_acc = model.evaluate(X_test_padded, to_categorical(y_test), verbose=0)
print(test_acc)

y_pred = model.predict(X_test_padded, batch_size=bs, verbose=2)

# get the class with highest probability for each sample
y_pred = np.argmax(y_pred, axis=-1)

# get the classification report
print(classification_report(y_test, y_pred))