In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bilstm-sentiment-analysis/task3/train3.csv
/kaggle/input/bilstm-sentiment-analysis/task3/test3.csv


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
import re


In [2]:
# CUDA / PyTorch environment switches, set in a single update call.
# NOTE(review): these look like debugging switches rather than optimizations —
# CUDA_LAUNCH_BLOCKING=1 is documented to make CUDA kernel launches synchronous,
# and TORCH_DISTRIBUTED_DEBUG is a PyTorch setting. Confirm both are wanted for
# a TensorFlow training run.
import os

os.environ.update({
    'CUDA_LAUNCH_BLOCKING': '1',
    'TORCH_DISTRIBUTED_DEBUG': 'INFO',
})

In [3]:
# Check GPU availability and type.
# torch.cuda.get_device_name(0) raises when no CUDA device is present, so the
# device name is only queried after confirming CUDA is available; this lets the
# cell run on a CPU-only session instead of aborting the notebook.
import torch

cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)
if cuda_available:
    print("GPU Device:", torch.cuda.get_device_name(0))
else:
    print("GPU Device:", "none (CUDA not available)")
# device_count() returns 0 rather than raising when there is no GPU
print("Number of GPUs:", torch.cuda.device_count())

CUDA Available: True
GPU Device: Tesla T4
Number of GPUs: 2


In [4]:
# Load the training CSV, discard rows containing NaN and re-encode the labels.
# Original label -> class index used by the model: -1 -> 0, 0 -> 1, 1 -> 2
label_map = {-1: 0, 0: 1, 1: 2}

df = (
    pd.read_csv('/kaggle/input/bilstm-sentiment-analysis/task3/train3.csv')
    .dropna()
    .assign(category=lambda frame: frame['category'].map(label_map))
)


In [8]:
# Text preprocessing
def preprocess_text(text):
    """Normalize one piece of text before tokenization.

    The input is converted with ``str`` and lower-cased, then a fixed series
    of regex substitutions is applied in order: URLs, ``@mentions`` and
    ``#hashtags`` are deleted, every character that is neither a word
    character nor whitespace is removed, and runs of whitespace collapse to a
    single space. Leading and trailing whitespace is stripped last.

    Args:
        text: Any value; non-strings are converted with ``str`` first.

    Returns:
        The cleaned, lower-case string.
    """
    cleaned = str(text).lower()

    # (pattern, replacement, flags) — order matters: mentions and hashtags must
    # go before the punctuation pass strips their '@' / '#' markers.
    substitutions = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),  # URLs
        (r'@\w+', '', 0),                                # user mentions
        (r'#\w+', '', 0),                                # hashtags
        (r'[^\w\s]', '', 0),                             # punctuation
        (r'\s+', ' ', 0),                                # extra whitespace
    )
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return cleaned.strip()

# Clean every entry of the 'Text' column with preprocess_text
df['Text'] = df['Text'].map(preprocess_text)

In [9]:
# Pull the cleaned texts and encoded labels out of the frame as arrays
texts, labels = df['Text'].values, df['category'].values

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)




In [10]:
# Tokenization settings
max_words = 10000  # passed to Tokenizer as num_words (vocabulary cap)
max_len = 100      # sequence length used when padding

# The vocabulary is fitted on the training texts only, then both splits are
# converted to integer sequences with that same tokenizer.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train, X_test = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)



In [12]:
# Pad (or truncate) every sequence to exactly max_len token ids
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

# Convert labels to one-hot vectors.
# The class count is given explicitly: without num_classes, to_categorical
# infers the width from the largest label present in each array, so a split
# that happens to lack the highest class would get a different width from the
# other split and from the model's 3-unit softmax output.
num_classes = 3  # label ids 0, 1, 2 produced by label_map
y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)

In [13]:
# Build BiLSTM model.
# The input shape is declared with an explicit Input instead of the Embedding
# layer's `input_length` argument, which Keras 3 deprecates; the model is still
# built with a fixed (max_len,) input.
model = Sequential([
    tf.keras.Input(shape=(max_len,)),
    Embedding(max_words, 128),                       # token id -> 128-dim vector
    Bidirectional(LSTM(64, return_sequences=True)),  # keeps the full sequence for the next LSTM
    Bidirectional(LSTM(32)),                         # reduces the sequence to one vector
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')  # 3 classes: negative, neutral, positive
])



In [14]:
# Early stopping watches validation loss with a patience of 3 epochs and
# restores the best weights seen when training stops.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Adam optimizer with categorical cross-entropy (labels are one-hot), tracking accuracy
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [15]:
# Training settings, gathered in one place so they are easy to scan and tweak
fit_kwargs = dict(
    epochs=10,
    batch_size=64,
    validation_split=0.2,        # fraction of the training arrays used for validation
    callbacks=[early_stopping],
    verbose=1,
)

# Train inside an explicit '/GPU:0' device scope
with tf.device('/GPU:0'):
    history = model.fit(X_train, y_train, **fit_kwargs)

# Score the trained model on the held-out test split
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"\nTest accuracy: {test_accuracy:.4f}")

Epoch 1/10
[1m1543/1543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 50ms/step - accuracy: 0.7800 - loss: 0.5277 - val_accuracy: 0.9626 - val_loss: 0.1293
Epoch 2/10
[1m1543/1543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 45ms/step - accuracy: 0.9676 - loss: 0.1191 - val_accuracy: 0.9687 - val_loss: 0.1136
Epoch 3/10
[1m1543/1543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 45ms/step - accuracy: 0.9756 - loss: 0.0924 - val_accuracy: 0.9708 - val_loss: 0.1101
Epoch 4/10
[1m1543/1543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 45ms/step - accuracy: 0.9814 - loss: 0.0680 - val_accuracy: 0.9710 - val_loss: 0.1119
Epoch 5/10
[1m1543/1543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 45ms/step - accuracy: 0.9856 - loss: 0.0516 - val_accuracy: 0.9701 - val_loss: 0.1300
Epoch 6/10
[1m1543/1543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 45ms/step - accuracy: 0.9891 - loss: 0.0386 - val_accuracy: 0.9644 - val_loss: 0.1519
[1m

In [16]:
def predict_sentiment(text):
    """Classify a single piece of raw text with the trained model.

    The text goes through the same pipeline as the training data:
    ``preprocess_text``, the fitted ``tokenizer``, then padding to
    ``max_len``. The class with the highest predicted probability is returned
    as a label name.

    Args:
        text: Raw text to classify.

    Returns:
        One of ``'negative'``, ``'neutral'`` or ``'positive'``.
    """
    # Position in this tuple is the class index (0, 1, 2)
    class_names = ('negative', 'neutral', 'positive')

    cleaned = preprocess_text(text)
    encoded = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=max_len)
    probabilities = model.predict(encoded)

    # A single input row, so the flat argmax is the winning class index
    return class_names[np.argmax(probabilities)]

In [17]:
# Spot-check the model on a few hand-picked example tweets
example_tweets = [
    "when modi promised minimum government maximum governance",
    "talk all the nonsense and continue all the drama",
    "what did just say vote for modi welcome bjp"
]

for tweet in example_tweets:
    # Predict first so the model's progress output precedes the printed result
    sentiment = predict_sentiment(tweet)
    print(f"\nTweet: {tweet}", f"Predicted sentiment: {sentiment}", sep="\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 387ms/step

Tweet: when modi promised minimum government maximum governance
Predicted sentiment: neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step

Tweet: talk all the nonsense and continue all the drama
Predicted sentiment: neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step

Tweet: what did just say vote for modi welcome bjp
Predicted sentiment: positive
