In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




In [2]:
import pandas as pd
import tensorflow as tf

# Download the Twitter sentiment training CSV; get_file hands back the local
# path of the downloaded file, which is then read with pandas.
url = "https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv"
csv_path = tf.keras.utils.get_file(fname="twitter_sentiment.csv", origin=url)

# Keep only the text and target columns, in this order.
df = pd.read_csv(csv_path).loc[:, ["tweet", "label"]]

In [3]:
# Settings
vocab_size = 10000   # passed to the Tokenizer as num_words
max_length = 100     # every sequence is padded/truncated to this length
oov_token = "<OOV>"  # placeholder token for out-of-vocabulary words

# Tokenizer: build the word index from all tweets
tokenizer = Tokenizer(oov_token=oov_token, num_words=vocab_size)
tokenizer.fit_on_texts(df["tweet"])

# Turn the texts into integer sequences, then pad/truncate at the end
sequences = tokenizer.texts_to_sequences(df["tweet"])
padded = pad_sequences(
    sequences,
    padding="post",
    truncating="post",
    maxlen=max_length,
)

# Example: the first tweet at each stage of the pipeline
print("Texto original:", df["tweet"].iat[0])
print("Secuencia:", sequences[0])
print("Secuencia padded:", padded[0])

Texto original:  @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run
Secuencia: [2, 35, 5, 253, 12, 1, 8, 12, 21, 2754, 98, 6656, 96, 256, 257, 96, 8137, 481]
Secuencia padded: [   2   35    5  253   12    1    8   12   21 2754   98 6656   96  256
  257   96 8137  481    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Features are the padded sequences; targets are the "label" column.
X, y = padded, df["label"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

print("Train:", X_train.shape, y_train.shape)
print("Test:", X_test.shape, y_test.shape)

Train: (25569, 100) (25569,)
Test: (6393, 100) (6393,)


In [9]:
from tensorflow.keras.layers import Embedding, Input, Bidirectional, LSTM, Dense, Layer
from tensorflow.keras.models import Model

In [10]:
class Attention(Layer):
    """Parameter-free attention pooling over axis 1 of the input.

    The layer has no trainable weights. For every position along axis 1 it
    computes ``tanh(inputs)``, normalises those scores with a softmax taken
    along axis 1 (independently for each remaining index), and returns the
    weighted sum of ``inputs`` along that axis. In this notebook it reduces
    the BiLSTM output of shape (batch, 100, 128) to (batch, 128).

    Args:
        **kwargs: Standard ``Layer`` keyword arguments (e.g. ``name``,
            ``dtype``, ``trainable``), forwarded to the base class.
    """

    def __init__(self, **kwargs):
        # Forwarding **kwargs lets callers name the layer and lets Keras
        # rebuild it from its config (from_config passes name/trainable/dtype),
        # which a zero-argument __init__ would reject with a TypeError.
        super().__init__(**kwargs)

    def call(self, inputs):
        """Return the attention-weighted sum of ``inputs`` along axis 1."""
        # Squash activations to (-1, 1) before using them as scores.
        score = tf.nn.tanh(inputs)
        # Normalise along axis 1 so the weights sum to 1 over that axis.
        weights = tf.nn.softmax(score, axis=1)
        context = weights * inputs
        # Collapse axis 1, leaving one pooled vector per sample.
        return tf.reduce_sum(context, axis=1)

def build_bilstm_attention(vocab_size=10000, embedding_dim=64, max_length=100, lstm_units=64):
    """Build an (uncompiled) BiLSTM + attention binary classifier.

    Architecture: token ids -> Embedding -> Bidirectional LSTM returning the
    full sequence -> Attention pooling -> single sigmoid unit.

    Args:
        vocab_size: Input dimension of the embedding layer.
        embedding_dim: Size of each embedding vector.
        max_length: Length of the integer input sequences.
        lstm_units: Units of the LSTM in each direction. Defaults to 64,
            the value that was previously hard-coded.

    Returns:
        A ``Model`` mapping inputs of shape (batch, max_length) to an output
        of shape (batch, 1) produced by a sigmoid activation.
    """
    inputs = Input(shape=(max_length,))
    x = Embedding(vocab_size, embedding_dim)(inputs)
    # return_sequences=True keeps every timestep so the attention layer can
    # pool over the whole sequence rather than only the final state.
    x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
    x = Attention()(x)
    outputs = Dense(1, activation='sigmoid')(x)
    model = Model(inputs, outputs)
    return model

# Build the model with its default hyper-parameters and compile it for
# binary classification.
bilstm_model = build_bilstm_attention()
bilstm_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
bilstm_model.summary()

# Training: 5 epochs, with the held-out split used as validation data
bilstm_model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
)



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 64)           640000    
                                                                 
 bidirectional (Bidirection  (None, 100, 128)          66048     
 al)                                                             
                                                                 
 attention (Attention)       (None, 128)               0         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 706177 (2.69 MB)
Trainable params: 706177 (2.69 MB)
Non-trainable params: 0 (0.00 Byte)
________________________

<keras.src.callbacks.History at 0x23a88c99e10>