In [1]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional, Dropout, Attention
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset: paths are relative to the notebook's working directory
train_file_path = "train.csv"
test_file_path = "test.csv"

# Keep only the tweet text and its subtask_a tag; rows missing either value are dropped
used_columns = ['cleaned_tweet', 'subtask_a']
train_df = pd.read_csv(train_file_path)[used_columns].dropna()
test_df = pd.read_csv(test_file_path)[used_columns].dropna()

# Convert labels (OFF -> 1, every other tag -> 0)
train_df['label'] = (train_df['subtask_a'] == 'OFF').astype('int64')
test_df['label'] = (test_df['subtask_a'] == 'OFF').astype('int64')

# Shared resources read by preprocess_text
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = {*stopwords.words('english')}  # set => constant-time membership checks

def preprocess_text(text):
    """Normalise one tweet into a space-separated string of processed tokens.

    Steps, in order: lowercase the text, delete every character that is not
    an ASCII letter or whitespace (digits and punctuation included), split on
    whitespace, drop tokens found in ``stop_words``, then stem each remaining
    token and pass the stem through the lemmatizer.

    Args:
        text: The raw tweet; must be a ``str`` (``.lower()`` is called on it).

    Returns:
        The processed tokens joined by single spaces; an empty string when no
        token survives.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    processed = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        # Stem first, then lemmatize the stem; this order defines the vocabulary
        # the tokenizer and model are fitted on.
        processed.append(lemmatizer.lemmatize(stemmer.stem(token)))
    return ' '.join(processed)

# Replace the stored tweet text of both splits with its preprocessed form
for split_df in (train_df, test_df):
    split_df['cleaned_tweet'] = split_df['cleaned_tweet'].apply(preprocess_text)

# Tokenization: the vocabulary is fitted on the training tweets only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['cleaned_tweet'])
train_sequences = tokenizer.texts_to_sequences(train_df['cleaned_tweet'])
test_sequences = tokenizer.texts_to_sequences(test_df['cleaned_tweet'])

# Padding sequences: both splits are padded at the end (padding='post') to the
# length of the longest training sequence
max_length = max(map(len, train_sequences))
X_train = pad_sequences(train_sequences, maxlen=max_length, padding='post')
X_test = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Label vectors as numpy arrays
y_train = train_df['label'].to_numpy()
y_test = test_df['label'].to_numpy()

# Balanced class weights, one per distinct training label.
# The dict is keyed by position (0, 1, ...) in the array returned by
# compute_class_weight, which lines up with the 0/1 labels built above.
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(enumerate(class_weights))

# Define the stacked BiLSTM classifier (two bidirectional LSTM layers feeding a
# small dense head; no attention layer is part of this stack)
vocab_size = len(tokenizer.word_index) + 1  # +1 leaves index 0 free for padding
embedding_dim = 100  # Trainable Embedding layer
model = Sequential([
    # mask_zero=True marks the padded positions (index 0) so the recurrent
    # layers skip them instead of treating padding as real tokens.
    Embedding(vocab_size, embedding_dim, input_length=max_length, trainable=True, mask_zero=True),
    SpatialDropout1D(0.3),
    # First BiLSTM keeps the full sequence so the second one can consume it.
    Bidirectional(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
    # Last recurrent layer returns a single vector per tweet, so the dense head
    # yields one probability per sample (shape (batch, 1)) rather than one
    # probability per timestep.
    Bidirectional(LSTM(64, return_sequences=False, dropout=0.3, recurrent_dropout=0.3)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

# Binary target (1 = 'OFF', 0 = otherwise) => sigmoid output with binary cross-entropy
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Train the model with early stopping
# The callback must be created in this cell: model.fit references it below, and
# relying on a value left over in the kernel raises NameError on a fresh run.
# Training stops once val_loss has not improved for 3 epochs, and the weights
# from the best epoch are restored.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# NOTE(review): validation_data is the test split, so early stopping picks the
# final weights using test data and any score reported on X_test afterwards is
# optimistic. Consider holding out part of the training data for validation.
model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=64,
    validation_data=(X_test, y_test),
    class_weight=class_weight_dict,
    callbacks=[early_stopping],
)

# Save the model
# NOTE(review): only the network is saved here; the fitted tokenizer and
# max_length needed by predict_text are not persisted by this cell.
model.save("safespeakmodel.h5")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


In [45]:
# Function to predict new text
def predict_text(text, threshold=0.5):
    """Classify a raw text string as "Harmful" or "Safe" with the trained model.

    The text goes through the same pipeline as the training data:
    ``preprocess_text``, the fitted ``tokenizer``, then post-padding to
    ``max_length``.

    Args:
        text: Raw input string.
        threshold: Model score at or above which the text is labelled
            "Harmful". Defaults to 0.5, the cut-off that was previously
            hard-coded, so existing single-argument calls are unchanged.

    Returns:
        "Harmful" if the model score is >= ``threshold``, otherwise "Safe".
    """
    text = preprocess_text(text)
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')
    # One input row is sent, so [0] selects that row and the next [0] its score.
    prediction = model.predict(padded_sequence)[0][0]
    return "Harmful" if prediction >= threshold else "Safe"

# Example prediction: classify one sample sentence and print the label
test_text = "Courtney Quinn is a fashion and makeup blogger from NYC. She shares her colorful world in her blog called Color Me Courtney, where you can also find various makeup tutorials, lifestyle posts, and more"
predicted_label = predict_text(test_text)
print(f"Prediction: {predicted_label}")

Prediction: Safe
