In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!cp '/content/drive/MyDrive/twitter_sentiment_analysis.zip' 'my_data'

In [3]:
!unzip 'my_data' -d 'my_data2'

Archive:  my_data
  inflating: my_data2/twitter_training.csv  
  inflating: my_data2/twitter_validation.csv  


In [23]:
import pandas as pd

train_data = pd.read_csv('my_data2/twitter_training.csv', header=None)
test_data = pd.read_csv('my_data2/twitter_validation.csv', header=None)

train_data.columns = ['id', 'entity', 'sentiment', 'tweet']
test_data.columns = ['id', 'entity', 'sentiment', 'tweet']

In [24]:

sentiment_map = {'Positive':2, 'Neutral':1, 'Negative':0}


train_data['sentiment'] = train_data['sentiment'].map(sentiment_map)
test_data['sentiment'] = test_data['sentiment'].map(sentiment_map)

train_data.dropna(subset=['sentiment'], inplace=True)
test_data.dropna(subset=['sentiment'], inplace=True)


In [25]:
import re
def clean_text(text):
  text = re.sub(r'[http\S+|www\S+|https\S+]', '', text)
  text = re.sub(r'@\w+|#\w+', '', text)
  text = re.sub(r'[^a-zA-Z\s]', '', text)
  text = ' '.join(text.split())
  text = text.lower()
  return text


In [29]:

train_data['tweet'] = train_data['tweet'].astype(str).apply(clean_text)
test_data['tweet'] = test_data['tweet'].astype(str).apply(clean_text)


# Drop empty tweets (optional but recommended)
train_data = train_data[train_data['tweet'].str.strip() != '']
test_data = test_data[test_data['tweet'].str.strip() != '']

y_train = train_data['sentiment'].astype(int).values
y_val = test_data['sentiment'].astype(int).values



In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words = 10000, oov_token='<OOV>')
tokenizer.fit_on_texts(train_data['tweet'])

X_train = tokenizer.texts_to_sequences(train_data['tweet'])
X_val = tokenizer.texts_to_sequences(test_data['tweet'])

X_train = pad_sequences(X_train, padding='post', truncating='post', maxlen=50)
X_val = pad_sequences(X_val, padding='post', truncating='post', maxlen=50)


In [30]:
import numpy as np
print(np.unique(y_train))
print(np.unique(y_val))

[]
[]


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM

model = Sequential([
    Embedding(input_dim=1000, output_dim=64, input_length=50),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')
])

model.compile(optimizer='adam', metrics=['accuracy'], loss='sparse_categorical_crossentropy')





In [None]:
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_val, y_val))

Epoch 1/5
[1m1157/1157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - accuracy: 0.3047 - loss: nan - val_accuracy: 0.2663 - val_loss: nan
Epoch 2/5
[1m1157/1157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.2984 - loss: nan - val_accuracy: 0.2663 - val_loss: nan
Epoch 3/5
[1m1157/1157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7ms/step - accuracy: 0.3043 - loss: nan - val_accuracy: 0.2663 - val_loss: nan
Epoch 4/5
[1m1157/1157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - accuracy: 0.3051 - loss: nan - val_accuracy: 0.2663 - val_loss: nan
Epoch 5/5
[1m1157/1157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - accuracy: 0.3062 - loss: nan - val_accuracy: 0.2663 - val_loss: nan
