In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('twitter_training.csv')

In [3]:
data.columns = ['ID', 'Brand', 'Sentiment', 'Phrase']

In [4]:
data.head()

Unnamed: 0,ID,Brand,Sentiment,Phrase
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         74681 non-null  int64 
 1   Brand      74681 non-null  object
 2   Sentiment  74681 non-null  object
 3   Phrase     73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [6]:
data.isna().sum()

Unnamed: 0,0
ID,0
Brand,0
Sentiment,0
Phrase,686


In [7]:
data = data.dropna()

In [8]:
data.isna().sum()

Unnamed: 0,0
ID,0
Brand,0
Sentiment,0
Phrase,0


In [10]:
data['Sentiment'].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['Sentiment'] = le.fit_transform(data['Sentiment'])

In [12]:
le.classes_

array(['Irrelevant', 'Negative', 'Neutral', 'Positive'], dtype=object)

In [14]:
#Feature Generation Using Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

In [20]:
#tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer()
text_tf= tf.fit_transform(data['Phrase'])

In [30]:
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import nltk

In [31]:
# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [32]:
# Rename columns to match given dataset
data.rename(columns={'Brand': 'candidate', 'Sentiment': 'sentiment', 'Phrase': 'text'}, inplace=True)


In [33]:
# Keep relevant columns
data = data[['candidate', 'sentiment', 'text']]


In [34]:
data

Unnamed: 0,candidate,sentiment,text
0,Borderlands,3,I am coming to the borders and I will kill you...
1,Borderlands,3,im getting on borderlands and i will kill you ...
2,Borderlands,3,im coming on borderlands and i will murder you...
3,Borderlands,3,im getting on borderlands 2 and i will murder ...
4,Borderlands,3,im getting into borderlands and i can murder y...
...,...,...,...
74676,Nvidia,3,Just realized that the Windows partition of my...
74677,Nvidia,3,Just realized that my Mac window partition is ...
74678,Nvidia,3,Just realized the windows partition of my Mac ...
74679,Nvidia,3,Just realized between the windows partition of...


In [39]:
data['sentiment'].unique()

array([3, 2, 1, 0])

In [35]:
# Text Preprocessing Function
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

In [36]:
data['text'] = data['text'].astype(str).apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].astype(str).apply(clean_text)


In [37]:
# Tokenization
tokenizer = Tokenizer(num_words=20000)  # Use top 20,000 words
tokenizer.fit_on_texts(data['text'])
X = tokenizer.texts_to_sequences(data['text'])

In [40]:
# Padding sequences
max_len = 50  # Fixed sequence length
X = pad_sequences(X, maxlen=max_len)

# Encode target variable
y = to_categorical(data['sentiment'], num_classes=4)  #  4 sentiment classes

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [44]:
# Build LSTM Model
model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=max_len),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(4, activation='softmax')  # 3 sentiment classes
])

In [45]:
# Compile Model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [46]:
# Train Model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)


Epoch 1/5
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 157ms/step - accuracy: 0.4925 - loss: 1.1483 - val_accuracy: 0.7323 - val_loss: 0.7020
Epoch 2/5
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 165ms/step - accuracy: 0.7958 - loss: 0.5508 - val_accuracy: 0.7927 - val_loss: 0.5561
Epoch 3/5
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 158ms/step - accuracy: 0.8579 - loss: 0.3792 - val_accuracy: 0.8057 - val_loss: 0.5255
Epoch 4/5
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 158ms/step - accuracy: 0.8871 - loss: 0.3027 - val_accuracy: 0.8186 - val_loss: 0.5112
Epoch 5/5
[1m740/740[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 166ms/step - accuracy: 0.9001 - loss: 0.2598 - val_accuracy: 0.8296 - val_loss: 0.5160


In [47]:
# Evaluate Model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - accuracy: 0.8294 - loss: 0.5106
Test Accuracy: 0.8293
