In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

In [2]:
# Load the dataset
# Adjust encoding if necessary, depending on the data's format
data = pd.read_csv('Copy of Sentiment.csv', encoding='ISO-8859-1')  # Use 'latin-1', 'ISO-8859-1' or encoding='cp1252'

# Data Cleaning

In [3]:
# Replace NaN (missing) values in 'text' column with empty strings
# This ensures all values are strings and prevents errors when applying text operations
data['text'] = data['text'].fillna('').astype(str)

# Convert text to lowercase to normalize the data for text processing

In [4]:
# We apply this to the 'text' column
data['text'] = data['text'].apply(lambda x: x.lower())

# Tokenization - Convert the text data into sequences of tokens (words)

In [5]:
# Here we use the 'text' column for sentiment analysis
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(data['text'])

In [6]:
# Convert texts to sequences of integers
sequences = tokenizer.texts_to_sequences(data['text'])

# Padding - Ensure all sequences are of the same length for model input

In [7]:
# Pad sequences to the maximum length, truncating longer ones and padding shorter ones
padded_sequences = pad_sequences(sequences, maxlen=50, padding='post', truncating='post')

# Label Encoding - Convert the 'sentiment' column (target variable) into numeric labels

In [8]:
# This is necessary because machine learning models can't work directly with text labels
sentiment_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
data['sentiment'] = data['sentiment'].map(sentiment_mapping)

In [9]:
# Convert target variable (sentiment) into NumPy array
labels = np.array(data['sentiment'])

# Train-Test Split - Split the data into training and test sets

In [10]:
# 80% training and 20% test data
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Build the GRU model for sentiment analysis

In [11]:
model = Sequential()

# Embedding layer to convert word indices into dense vectors of fixed size
model.add(Embedding(input_dim=10000, output_dim=64))

# GRU layer for sequential processing of text data
model.add(GRU(64, return_sequences=False))

# Add a Dropout layer to prevent overfitting
model.add(Dropout(0.5))

# Dense layer for output
model.add(Dense(3, activation='softmax'))  # Output layer for 3 classes: positive, neutral, negative

# Compile the model

In [12]:
# Use categorical crossentropy for multi-class classification and Adam optimizer
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model

In [13]:
# Train for 10 epochs with a batch size of 32
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 33ms/step - accuracy: 0.4056 - loss: 1.0884 - val_accuracy: 0.4057 - val_loss: 1.0868
Epoch 2/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 27ms/step - accuracy: 0.4037 - loss: 1.0878 - val_accuracy: 0.4057 - val_loss: 1.0869
Epoch 3/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 27ms/step - accuracy: 0.4009 - loss: 1.0895 - val_accuracy: 0.4057 - val_loss: 1.0867
Epoch 4/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 27ms/step - accuracy: 0.3948 - loss: 1.0910 - val_accuracy: 0.4057 - val_loss: 1.0867
Epoch 5/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 27ms/step - accuracy: 0.4013 - loss: 1.0893 - val_accuracy: 0.4057 - val_loss: 1.0867
Epoch 6/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 26ms/step - accuracy: 0.4060 - loss: 1.0871 - val_accuracy: 0.4057 - val_loss: 1.0867
Epoch 7/10
[1m6

# Evaluate the model

In [14]:
# Evaluate the performance of the model on the test data
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_acc}')

[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.4091 - loss: 1.0873
Test Accuracy: 0.40567582845687866


# Make Predictions

In [15]:
# Predict sentiment on new data (X_test as example)
predictions = model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)

[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step


In [16]:
# Show some predictions
for i in range(5):
    print(f"Text: {data['text'][i]}")
    print(f"True Sentiment: {y_test[i]}, Predicted Sentiment: {predicted_classes[i]}")
    print()

Text:  i`d have responded, if i were going
True Sentiment: 2, Predicted Sentiment: 1

Text:  sooo sad i will miss you here in san diego!!!
True Sentiment: 1, Predicted Sentiment: 1

Text: my boss is bullying me...
True Sentiment: 1, Predicted Sentiment: 1

Text:  what interview! leave me alone
True Sentiment: 0, Predicted Sentiment: 1

Text:  sons of ****, why couldn`t they put them on the releases we already bought
True Sentiment: 2, Predicted Sentiment: 1

