In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt

# Load dataset
data = pd.read_csv("Corona_NLP_test.csv")

In [32]:
data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [34]:
#Encode Sentiment labels
label_encoder = LabelEncoder()
data['Sentiment'] = label_encoder.fit_transform(data['Sentiment'])

# Drop any missing values
data = data.dropna()


In [35]:

# Prepare input (X) and output (y)
x = data['OriginalTweet']
y = data['Sentiment'].values  # Labels (0, 1, 2...)

# Download stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:

# Preprocess text
ps = PorterStemmer()
corpus = []
for tweet in x:
    review = re.sub('[^a-zA-Z]', ' ', tweet)  # Remove non-alphabet characters
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)


In [37]:

# Tokenization
voc_size = 50000
tokenizer = Tokenizer(num_words=voc_size, oov_token="<OOV>")
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)

# Padding
sent_length = 30
final_input = pad_sequences(sequences, padding='pre', maxlen=sent_length)


In [38]:

# Convert to NumPy array
final_output = np.array(y)

# Split into training & testing sets
x_train, x_test, y_train, y_test = train_test_split(final_input, final_output, test_size=0.33, random_state=42)


In [39]:

# Model
dim = 40
model = Sequential()
model.add(Embedding(voc_size, dim, input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(80))
model.add(Dropout(0.3))
model.add(Dense(len(set(y)), activation='softmax'))  # Adjust output layer dynamically




In [40]:
# Compile model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [44]:
# Train model
model.fit(x_train, y_train, batch_size=50, epochs=100, validation_data=(x_test, y_test))

Epoch 1/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 49ms/step - accuracy: 0.9844 - loss: 0.0682 - val_accuracy: 0.3943 - val_loss: 3.0825
Epoch 2/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 0.9855 - loss: 0.0493 - val_accuracy: 0.3994 - val_loss: 3.1542
Epoch 3/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - accuracy: 0.9944 - loss: 0.0251 - val_accuracy: 0.3953 - val_loss: 3.1933
Epoch 4/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 62ms/step - accuracy: 0.9914 - loss: 0.0285 - val_accuracy: 0.3963 - val_loss: 3.4754
Epoch 5/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 56ms/step - accuracy: 0.9940 - loss: 0.0238 - val_accuracy: 0.3963 - val_loss: 3.3166
Epoch 6/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 0.9975 - loss: 0.0149 - val_accuracy: 0.3984 - val_loss: 3.4260
Epoch 7/100
[1m40/40[0m [

<keras.src.callbacks.history.History at 0x7de5f8ecb9d0>

In [45]:
# Predict classes
y_pred = np.argmax(model.predict(x_test), axis=1)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step


In [46]:
# Accuracy score
val = metrics.accuracy_score(y_test, y_pred)
print("Accuracy is =", str(val * 100) + " %")

Accuracy is = 36.567926455566905 %
