In [17]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Input, Embedding, LSTM, Bidirectional,
    Dense, Dropout, Layer
)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow.keras.backend as K

In [18]:
data = []

with open("train.txt", "r", encoding="utf-8") as f:
    for line in f:
        text, label = line.strip().rsplit(";", 1)
        data.append([text, label])

df = pd.DataFrame(data, columns=["text", "label"])
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [19]:
le = LabelEncoder()
df["label"] = le.fit_transform(df["label"])

num_classes = len(le.classes_)
print("Classes:", le.classes_)

Classes: ['anger' 'fear' 'joy' 'love' 'sadness' 'surprise']


In [20]:
VOCAB_SIZE = 30000
MAX_LEN = 150

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(df["text"])

sequences = tokenizer.texts_to_sequences(df["text"])
padded_sequences = pad_sequences(
    sequences,
    maxlen=MAX_LEN,
    padding="post",
    truncating="post"
)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)

In [22]:
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)

class_weights = dict(enumerate(class_weights))
class_weights

{0: np.float64(1.2352827639451844),
 1: np.float64(1.3763440860215055),
 2: np.float64(0.4972804972804973),
 3: np.float64(2.045381911153723),
 4: np.float64(0.5714795963925351),
 5: np.float64(4.6681254558716265)}

In [23]:
!wget -q http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

In [24]:
embedding_dim = 100
embeddings_index = {}

with open("glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = vector

In [25]:
embedding_matrix = np.zeros((VOCAB_SIZE, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i < VOCAB_SIZE and word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

In [26]:
class Attention(Layer):
    def build(self, input_shape):
        self.W = self.add_weight(
            name="att_weight",
            shape=(input_shape[-1], 1),
            initializer="normal"
        )
        super().build(input_shape)

    def call(self, inputs):
        score = K.tanh(K.dot(inputs, self.W))
        attention_weights = K.softmax(score, axis=1)
        context = inputs * attention_weights
        return K.sum(context, axis=1)

In [27]:
input_layer = Input(shape=(MAX_LEN,))

embedding = Embedding(
    VOCAB_SIZE,
    embedding_dim,
    weights=[embedding_matrix],
    trainable=False
)(input_layer)

x = Bidirectional(LSTM(128, return_sequences=True))(embedding)
x = Attention()(x)

x = Dense(128, activation="relu")(x)
x = Dropout(0.5)(x)

output = Dense(num_classes, activation="softmax")(x)

model = Model(inputs=input_layer, outputs=output)

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

model.summary()

In [28]:
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True
)

history = model.fit(
    X_train,
    y_train,
    epochs=15,
    batch_size=64,
    validation_split=0.1,
    class_weight=class_weights,
    callbacks=[early_stop]
)

Epoch 1/15
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 598ms/step - accuracy: 0.1872 - loss: 1.7956 - val_accuracy: 0.4320 - val_loss: 1.6070
Epoch 2/15
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 616ms/step - accuracy: 0.3417 - loss: 1.5797 - val_accuracy: 0.5883 - val_loss: 1.1476
Epoch 3/15
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 608ms/step - accuracy: 0.6068 - loss: 0.9500 - val_accuracy: 0.7445 - val_loss: 0.7415
Epoch 4/15
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 631ms/step - accuracy: 0.7459 - loss: 0.6156 - val_accuracy: 0.7617 - val_loss: 0.6552
Epoch 5/15
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 644ms/step - accuracy: 0.8149 - loss: 0.4686 - val_accuracy: 0.8359 - val_loss: 0.4404
Epoch 6/15
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 619ms/step - accuracy: 0.8604 - loss: 0.3393 - val_accuracy: 0.8422 - val_loss: 0.4425
Epoc

In [29]:
y_pred = np.argmax(model.predict(X_test), axis=1)

print(classification_report(
    y_test,
    y_pred,
    target_names=le.classes_
))

[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 174ms/step
              precision    recall  f1-score   support

       anger       0.88      0.89      0.89       432
        fear       0.87      0.85      0.86       387
         joy       0.96      0.86      0.90      1072
        love       0.70      0.91      0.79       261
     sadness       0.93      0.92      0.93       933
    surprise       0.67      0.95      0.78       115

    accuracy                           0.89      3200
   macro avg       0.83      0.90      0.86      3200
weighted avg       0.90      0.89      0.89      3200



In [30]:
def predict_emotion(text):
    seq = tokenizer.texts_to_sequences([text])
    pad = pad_sequences(seq, maxlen=MAX_LEN, padding="post")
    pred = model.predict(pad)
    return le.inverse_transform([np.argmax(pred)])[0]

predict_emotion("I feel extremely lonely and helpless today")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 780ms/step


'sadness'

In [32]:
model.save("emotion_model.keras")

import pickle

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

print("Model, tokenizer, and label encoder saved successfully!")

Model, tokenizer, and label encoder saved successfully!
