In [7]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import classification_report

# Read the data
file_path = r'C:\Users\Admin\Desktop\Utkarsh\Dataset\Twitter_Data.csv'
df = pd.read_csv(file_path)

# Change dependent variable to categorical
label_encoder = LabelEncoder()
df['category'] = label_encoder.fit_transform(df['category'])

# Missing value analysis and drop null values
df.dropna(inplace=True)

# Text cleaning
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def clean_text(text):
    # Remove symbols and convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    
    # Join the words back into a sentence
    cleaned_text = ' '.join(filtered_words)
    
    return cleaned_text

# Apply text cleaning
df['clean_text'] = df['clean_text'].apply(clean_text)

# Create a new column for sentence length
df['sentence_length'] = df['clean_text'].apply(lambda x: len(x.split()))

# Split data into dependent (X) and independent (y) dataframe
X = df['clean_text']
y = df['category']

# Operations on text data
# One-hot encoding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_encoded = tokenizer.texts_to_sequences(X)

# Adding padding from the front side
X_padded = pad_sequences(X_encoded, padding='pre')

# Build LSTM model
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 50  # Adjust as needed
input_length = X_padded.shape[1]

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length))
model.add(LSTM(units=100))
model.add(Dropout(0.2))
model.add(Dense(units=1, activation='sigmoid'))  # Updated output units to 1 for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Dummy variable creation for dependent variable
y_dummy = pd.get_dummies(y)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_dummy, test_size=0.2, random_state=42)

y_train_binary = y_train.values.argmax(axis=1)
y_test_binary = y_test.values.argmax(axis=1)

# Train the model
model.fit(X_train, y_train_binary, epochs=5, batch_size=64, validation_data=(X_test, y_test_binary))

# Normalize predictions
y_pred = model.predict(X_test)
y_pred_binary = [1 if pred >= 0.5 else 0 for pred in y_pred]

# Measure performance metrics and accuracy
print(classification_report(y_test.values.argmax(axis=1), y_pred_binary))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      7127
           1       0.34      1.00      0.50     10960
           2       0.00      0.00      0.00     14506
           3       0.00      0.00      0.00         3

    accuracy                           0.34     32596
   macro avg       0.08      0.25      0.13     32596
weighted avg       0.11      0.34      0.17     32596



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
