In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




In [2]:
import pandas as pd
import tensorflow as tf

# Source CSV for the tweet dataset; get_file fetches it into the local Keras
# cache and returns the path of the cached copy.
url = "https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv"
csv_path = tf.keras.utils.get_file(fname="twitter_sentiment.csv", origin=url)

# Load the file and keep only the text column and its label, in that order.
df = pd.read_csv(csv_path).loc[:, ["tweet", "label"]]

In [3]:
# Preprocessing settings
vocab_size = 10000   # handed to Tokenizer as num_words
max_length = 100     # every sequence is padded/truncated to this many ids
oov_token = "<OOV>"  # stand-in token for out-of-vocabulary words

tweet_texts = df["tweet"]

# Tokenizer: build the word index from the tweets
tokenizer = Tokenizer(oov_token=oov_token, num_words=vocab_size)
tokenizer.fit_on_texts(tweet_texts)

# Turn each tweet into a list of integer ids, then pad/truncate at the end
# so that all rows share the same length.
sequences = tokenizer.texts_to_sequences(tweet_texts)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding="post",
    truncating="post",
)

# Example: the first tweet at each stage of the pipeline
first = 0
print("Texto original:", tweet_texts.iloc[first])
print("Secuencia:", sequences[first])
print("Secuencia padded:", padded[first])

Texto original:  @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run
Secuencia: [2, 35, 5, 253, 12, 1, 8, 12, 21, 2754, 98, 6656, 96, 256, 257, 96, 8137, 481]
Secuencia padded: [   2   35    5  253   12    1    8   12   21 2754   98 6656   96  256
  257   96 8137  481    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Features are the padded id sequences; targets are the binary labels.
X = padded
y = df["label"]

# Hold out 20% of the rows for evaluation. stratify=y keeps the label
# proportions the same in both partitions, so an uneven class balance cannot
# skew the held-out metrics; random_state pins the shuffle for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train:", X_train.shape, y_train.shape)
print("Test:", X_test.shape, y_test.shape)

Train: (25569, 100) (25569,)
Test: (6393, 100) (6393,)


In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM

In [11]:
def build_lstm(vocab_size=10000, embedding_dim=64, max_length=100,
               lstm_units=64, mask_zero=True):
    """Build an uncompiled Embedding -> LSTM -> sigmoid binary classifier.

    Args:
        vocab_size: Size of the embedding table (largest token id + 1).
        embedding_dim: Width of each token embedding vector.
        max_length: Number of token ids per input sequence.
        lstm_units: Width of the LSTM hidden state.
        mask_zero: When True, token id 0 is treated as padding and those
            timesteps are masked out for the LSTM. The notebook right-pads
            sequences with 0, so without masking the LSTM's final state is
            computed after running over the trailing pad steps.

    Returns:
        A ``Sequential`` model that outputs one probability per sequence.
        The caller is responsible for calling ``compile``.
    """
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_length,
                  mask_zero=mask_zero),
        LSTM(lstm_units),
        Dense(1, activation='sigmoid')
    ])
    return model

# Seed Python, NumPy and TensorFlow before the model is created so that
# weight initialisation and batch shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Reuse the preprocessing settings so the model always matches the tokenizer
# and padding configuration instead of relying on duplicated defaults.
lstm_model = build_lstm(vocab_size=vocab_size, max_length=max_length)
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.summary()

# Training
# NOTE(review): the held-out test split doubles as validation data here, so
# the reported validation metrics are not an independent test estimate.
# Keeping the History object makes the per-epoch metrics available afterwards.
history = lstm_model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))




''



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 64)           640000    
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 673089 (2.57 MB)
Trainable params: 673089 (2.57 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


''