In [2]:
import kagglehub

# Download latest version
# dataset_download returns the local directory that holds the dataset files;
# in the recorded run it resolved to /kaggle/input/sentiment-analysis-dataset.
path = kagglehub.dataset_download("abhi8923shriv/sentiment-analysis-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/sentiment-analysis-dataset


# Libraries

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Activation,Convolution1D, Flatten, Dropout,Embedding,Input,Multiply
from sklearn.preprocessing import LabelEncoder
import pandas as pd 
import numpy as np


In [4]:
# Read the training split from the directory kagglehub returned instead of a
# hard-coded /kaggle/input/... location, so the notebook also works when the
# dataset is cached somewhere else. The latin1 encoding is kept from the
# original read (presumably the CSV is not valid UTF-8 -- confirm).
train = pd.read_csv(f"{path}/train.csv", encoding='latin1')

In [5]:
# Preview the first rows to check the column layout (text, sentiment, ...).
train.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [6]:
# --- Labels: turn the sentiment strings into integer class ids ---
label = train["sentiment"].astype(str)
labels_keys = label.unique()
encoder = LabelEncoder().fit(label)
integer_labels = encoder.transform(label)

# --- Text: learn a word -> index vocabulary over every training tweet ---
docs = train["text"].astype(str)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)

In [7]:
# The position of each class in this array is the integer id it was encoded to.
print(encoder.classes_)


['negative' 'neutral' 'positive']


In [8]:
# Peek at the encoded labels (one integer id per training row).
integer_labels

array([1, 0, 0, ..., 2, 2, 1])

In [9]:
# Show the ten lowest-index vocabulary entries as "word:index" lines.
first_entries = list(tokenizer.word_index.items())[:10]
for word, index in first_entries:
    print(f"{word}:{index}")

i:1
to:2
the:3
a:4
my:5
it:6
you:7
and:8
is:9
in:10


In [10]:
# Targets: one-hot rows, one column per sentiment class.
y_train = to_categorical(integer_labels)

# Features: a binary bag-of-words row per tweet.
# NOTE(review): with the unrestricted vocabulary this is a very wide matrix
# (27481 x 26599 in the recorded run), so watch memory when re-running.
X_train = tokenizer.texts_to_matrix(docs, mode='binary')


In [11]:
# Report (rows, columns) for the feature matrix, then for the target matrix.
for array in (X_train, y_train):
    print(array.shape)

(27481, 26599)
(27481, 3)


In [13]:
# Training steps
# Rebuild the features with the tokenizer limited to max_words, so the feature
# matrix fed to the model has max_words columns.
max_words = 1000

# Missing tweets become empty strings instead of the literal token "nan",
# matching how the test split is prepared before prediction.
docs = train["text"].fillna("").astype(str)
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(docs)  # fit on the training text only

# Encode the sentiment strings as integer ids, then one-hot them for the
# softmax output layer.
label = train["sentiment"].astype(str)
encoder = LabelEncoder()
integer_labels = encoder.fit_transform(label)

X_train = tokenizer.texts_to_matrix(docs, mode='binary')
y_train = to_categorical(integer_labels)

In [15]:
# Take the layer sizes from the arrays that are actually fed to the model, so
# the network cannot drift out of sync with the feature-building cell.
input_dim = X_train.shape[1]
nb_classes = y_train.shape[1]

# Feature-wise attention: a softmax layer yields one weight per input feature,
# and the element-wise product re-weights the bag-of-words vector.
inputs = Input(shape=(input_dim,))
attention = Dense(input_dim, activation='softmax', name='attention')(inputs)
attention_prod = Multiply(name='attention_prod')([inputs, attention])

# Classification head: one hidden ReLU layer, then per-class probabilities.
attention_prod = Dense(256)(attention_prod)
attention_prod = Activation('relu')(attention_prod)
output = Dense(nb_classes, activation='softmax')(attention_prod)

# Targets are one-hot (to_categorical), hence categorical cross-entropy.
model = Model(inputs=inputs, outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print summary
model.summary()

In [16]:
# Fit for 10 epochs in mini-batches of 32, with a 0.1 validation split and
# per-epoch shuffling switched off.
print("Training...")
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.1,
    shuffle=False,
    verbose=1,
)

Training...
Epoch 1/10
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.4108 - loss: 1.0801 - val_accuracy: 0.5507 - val_loss: 0.9373
Epoch 2/10
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5847 - loss: 0.8839 - val_accuracy: 0.6253 - val_loss: 0.8557
Epoch 3/10
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6664 - loss: 0.7625 - val_accuracy: 0.6508 - val_loss: 0.8285
Epoch 4/10
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7168 - loss: 0.6715 - val_accuracy: 0.6548 - val_loss: 0.8298
Epoch 5/10
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7567 - loss: 0.5950 - val_accuracy: 0.6515 - val_loss: 0.8527
Epoch 6/10
[1m773/773[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7933 - loss: 0.5257 - val_accuracy: 0.6530 - val_loss: 0.8868
Epoch 7/10
[1m7

<keras.src.callbacks.history.History at 0x784ecfabda80>

In [70]:
# Persist the trained model to disk.
# NOTE(review): this cell's execution count (70) is out of sequence with its
# neighbours (16 and 17), so confirm which training run `model` held when the
# file was written. The file name is left exactly as-is in case other code
# loads it by this name.
model.save("sentinment best model.h5")

In [17]:
# Read the test split from the directory kagglehub returned instead of a
# hard-coded /kaggle/input/... location, so the notebook also works when the
# dataset is cached somewhere else. The latin1 encoding is kept from the
# original read.
test = pd.read_csv(f"{path}/test.csv", encoding='latin1')

In [18]:
# Map each model output index back to its sentiment name. Built from the
# fitted LabelEncoder so the ids always match the ones used to build y_train;
# with the classes printed earlier this yields
# {0: "negative", 1: "neutral", 2: "positive"}.
index_to_emotion = {idx: str(name) for idx, name in enumerate(encoder.classes_)}


In [20]:
# Classify one test tweet and print it next to its ground-truth label.
sample_idx = 10  # position of the test row to inspect

# Missing text/labels become empty strings before being cast to str.
docs_test = test["text"].fillna("").astype(str)
sent_test = test["sentiment"].fillna("").astype(str)

# DON'T call tokenizer.fit_on_texts(docs_test) again: the test text must be
# encoded with the vocabulary learned from the training data.
# Both lookups are positional, so text and label always come from the same row
# even if the frame's index is not 0..n-1.
test_doc = docs_test.iloc[sample_idx]
test_sent = sent_test.iloc[sample_idx]
test_vector = tokenizer.texts_to_matrix([test_doc], mode='binary')

# Predict using the model
print("=================original sentence=======================")
print(test_doc)
print("=================original_sentinment=======================")
print(test_sent)
print("=================predicted_sentinment=======================")
prediction = model.predict(test_vector)
# One row was passed in, so row 0 holds the class scores for the sample;
# argmax over that row gives the predicted class id.
predicted_idx = int(np.argmax(prediction[0]))
print(index_to_emotion[predicted_idx])


 and within a short time of the last clue all of them
neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
neutral
