In [6]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional, Dropout
from sklearn.model_selection import train_test_split

# Locations of the two dataset splits (CSV files)
train_file_path = "train.csv"
test_file_path = "test.csv"


def _select_and_label(raw_frame):
    """Return the modelling columns of ``raw_frame`` plus a binary ``label``.

    Only ``cleaned_tweet`` and ``subtask_a`` are kept, and rows with a missing
    value in either of them are dropped. ``label`` is 1 where ``subtask_a``
    equals 'OFF' and 0 for every other value.
    """
    kept = raw_frame[['cleaned_tweet', 'subtask_a']].dropna()
    is_offensive = kept['subtask_a'] == 'OFF'
    return kept.assign(label=is_offensive.astype('int64'))


# Read both splits first, then apply the same preparation to each
train_raw = pd.read_csv(train_file_path)
test_raw = pd.read_csv(test_file_path)
train_df = _select_and_label(train_raw)
test_df = _select_and_label(test_raw)

# Build the vocabulary from the training tweets only, then encode both splits
# as lists of integer word indices
tokenizer = Tokenizer()
train_texts = train_df['cleaned_tweet']
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_df['cleaned_tweet'])

# Pad at the end ('post') so every row is as long as the longest training sequence
max_length = max(map(len, train_sequences))
X_train = pad_sequences(train_sequences, maxlen=max_length, padding='post')
X_test = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Targets as standalone numpy arrays (copied, so they do not alias the frames)
y_train = train_df['label'].to_numpy(copy=True)
y_test = test_df['label'].to_numpy(copy=True)

# Reproducibility: seed NumPy and TensorFlow before any layer is created so
# that weight initialisation and dropout are repeatable across
# "Restart Kernel -> Run All" executions.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Define improved BiLSTM Model
# One embedding row per word index, plus one extra row for index 0
# (assumed to be the padding value produced by pad_sequences - TODO confirm).
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100  # Trainable Embedding layer
# NOTE(review): newer Keras releases reportedly deprecate `input_length`;
# confirm against the installed version before removing it.
model = Sequential([
    # Learned word vectors for sequences of length `max_length`
    Embedding(vocab_size, embedding_dim, input_length=max_length, trainable=True),
    SpatialDropout1D(0.3),
    # First recurrent layer returns the full sequence so it can feed the second one
    Bidirectional(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
    # Second recurrent layer collapses the sequence into a single vector
    Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    # Single sigmoid unit: score for the positive (label 1) class
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# Hold out a stratified slice of the *training* data for per-epoch validation.
# The test set must stay unseen during training; using it as validation data
# would make the reported validation metrics an optimistic estimate.
VALIDATION_FRACTION = 0.1
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    y_train,
    test_size=VALIDATION_FRACTION,
    random_state=42,
    stratify=y_train,
)
model.fit(X_fit, y_fit, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Single, final evaluation on the untouched test set. The model is compiled
# with one metric (accuracy), so evaluate() yields the loss and that metric.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss: {test_loss:.4f} - Test accuracy: {test_accuracy:.4f}")

# Save the model
# NOTE(review): only the network is written here; `tokenizer` and `max_length`
# are also needed to score new text and are not persisted by this cell.
model.save("safespeak_model.h5")




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
# Function to predict new text
def predict_text(text, threshold=0.5):
    """Classify a single piece of text as "Harmful" or "Safe".

    The text is lower-cased and every character that is not an ASCII letter
    or whitespace is removed. The result is encoded with the notebook-level
    ``tokenizer``, post-padded to ``max_length`` and scored by ``model``.

    Args:
        text: The raw string to classify.
        threshold: Minimum model score for the "Harmful" verdict. Defaults to
            0.5, the cut-off that used to be hard-coded.

    Returns:
        "Harmful" if the model score is at least ``threshold``, else "Safe".
    """
    normalized = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    encoded = tokenizer.texts_to_sequences([normalized])
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    # verbose=0 stops Keras from printing a progress bar on every call
    score = model.predict(padded, verbose=0)[0][0]
    return "Harmful" if score >= threshold else "Safe"

# Score one sample sentence and show the resulting verdict
test_text = "You are black and ugly. Move out"
verdict = predict_text(test_text)
print(f"Prediction: {verdict}")

Prediction: Harmful
