In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import joblib
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import  Embedding, LSTM, GRU, SimpleRNN, Dense, Dropout

In [2]:
data = pd.read_csv('/content/tweet_emotions.csv')
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [3]:
data.shape

(40000, 3)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   content    40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


In [5]:
data.isnull().sum()

Unnamed: 0,0
tweet_id,0
sentiment,0
content,0


In [6]:
data.drop("tweet_id",axis=1,inplace = True)

In [7]:
data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [8]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    return text

In [9]:
data['content'] = data['content'].apply(clean_text)

In [10]:
label_encoder = LabelEncoder()
data['sentiment'] = label_encoder.fit_transform(data['sentiment'])
y = tf.keras.utils.to_categorical(data['sentiment'])

In [11]:
max_words = 10000
max_len = 50
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['content'])
X = tokenizer.texts_to_sequences(data['content'])
X = pad_sequences(X, maxlen=max_len)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(13, activation='softmax'))

In [14]:
model.summary()

In [15]:
# model compile
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [16]:
history = model.fit(X_train,y_train,validation_data=(X_test, y_test),
                    epochs=15,batch_size=64)

Epoch 1/15
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 129ms/step - accuracy: 0.2490 - loss: 2.1628 - val_accuracy: 0.3374 - val_loss: 1.9301
Epoch 2/15
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 126ms/step - accuracy: 0.3858 - loss: 1.8365 - val_accuracy: 0.3475 - val_loss: 1.9081
Epoch 3/15
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 122ms/step - accuracy: 0.4515 - loss: 1.6808 - val_accuracy: 0.3394 - val_loss: 1.9607
Epoch 4/15
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 122ms/step - accuracy: 0.5071 - loss: 1.5203 - val_accuracy: 0.3260 - val_loss: 2.0275
Epoch 5/15
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 121ms/step - accuracy: 0.5532 - loss: 1.3827 - val_accuracy: 0.3137 - val_loss: 2.1431
Epoch 6/15
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 125ms/step - accuracy: 0.5987 - loss: 1.2412 - val_accuracy: 0.3047 - val_loss: 2.3332
Epoch 7/15

In [17]:
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 28ms/step - accuracy: 0.2678 - loss: 4.1037
Test Accuracy: 0.2685000002384186


In [18]:
def predict_emotion(text):
    text = clean_text(text)
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len)
    pred = model.predict(padded)
    return label_encoder.inverse_transform([np.argmax(pred)])[0]
predict_emotion("I feel very sad and lonely today")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 191ms/step


'sadness'

In [19]:
def predict_emotion(text):
    text = clean_text(text)
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len)
    pred = model.predict(padded)
    return label_encoder.inverse_transform([np.argmax(pred)])[0]
predict_emotion("cant fall asleep")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step


'neutral'

In [20]:
model.save("emotion_detection_model.h5")



In [25]:
preprocessor = {
    "tokenizer": tokenizer,
    "label_encoder": label_encoder,
    "max_len": max_len
}

In [26]:
joblib.dump(preprocessor, "preprocessor.pkl")

['preprocessor.pkl']