In [None]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional, Dropout
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Offensive-language classifier: load data, tokenize, train a stacked BiLSTM,
# evaluate once on the test set and save the model.
#
# The names `tokenizer`, `max_length` and `model` are kept at module level
# because the prediction cell further down reads them.
# ---------------------------------------------------------------------------

# --- Configuration ---------------------------------------------------------
train_file_path = "train.csv"
test_file_path = "test.csv"

SEED = 42                     # seeds NumPy, TensorFlow and the validation split
VALIDATION_FRACTION = 0.1     # share of the training rows held out for validation
MAX_EPOCHS = 50               # upper bound; early stopping usually ends sooner
BATCH_SIZE = 64
EARLY_STOPPING_PATIENCE = 3   # epochs without val_loss improvement before stopping

# Seed the RNGs so repeated runs are comparable (as far as the backend allows).
np.random.seed(SEED)
tf.random.set_seed(SEED)

# --- Load data -------------------------------------------------------------
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

# Keep only the text and the label column, and drop rows where either is missing.
train_df = train_df[['cleaned_tweet', 'subtask_a']].dropna().copy()
test_df = test_df[['cleaned_tweet', 'subtask_a']].dropna().copy()
assert not train_df.empty, "training data is empty after dropping missing rows"

# Binary target: 1 for 'OFF', 0 for every other value of `subtask_a`.
train_df['label'] = (train_df['subtask_a'] == 'OFF').astype('int64')
test_df['label'] = (test_df['subtask_a'] == 'OFF').astype('int64')

# --- Tokenize and pad ------------------------------------------------------
# The vocabulary is learned from the training texts only; the test texts are
# merely mapped through it. `astype(str)` guards against non-string cells
# (e.g. a tweet that was parsed as a number by `read_csv`).
train_texts = train_df['cleaned_tweet'].astype(str)
test_texts = test_df['cleaned_tweet'].astype(str)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
X_train = tokenizer.texts_to_sequences(train_texts)
X_test = tokenizer.texts_to_sequences(test_texts)

# Every sequence is padded (or truncated) to the longest training sequence.
max_length = max(len(seq) for seq in X_train)
X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_length, padding='post')

y_train = np.array(train_df['label'])
y_test = np.array(test_df['label'])

# Hold out part of the *training* data for validation so that the test set is
# never used to monitor training or to pick the best epoch.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    y_train,
    test_size=VALIDATION_FRACTION,
    stratify=y_train,
    random_state=SEED,
)

# --- Model -----------------------------------------------------------------
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is the padding id
embedding_dim = 100
model = Sequential([
    # `input_length` is intentionally not passed: it is deprecated in Keras 3
    # and the layers below do not need a static sequence length.
    Embedding(vocab_size, embedding_dim, trainable=True),
    SpatialDropout1D(0.3),
    Bidirectional(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
    Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# --- Train -----------------------------------------------------------------
# Stop once validation loss stops improving and roll back to the best epoch
# instead of always running all MAX_EPOCHS epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=EARLY_STOPPING_PATIENCE,
    restore_best_weights=True,
)

model.fit(
    X_fit,
    y_fit,
    epochs=MAX_EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

# --- Evaluate and save -----------------------------------------------------
# The test set is touched exactly once, after training has finished.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss: {test_loss:.4f} - test accuracy: {test_accuracy:.4f}")

model.save("safespeak_model.h5")




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
def predict_text(text, threshold=0.5):
    """Classify one piece of text as "Harmful" or "Safe".

    The text is lower-cased, stripped of every character that is not an ASCII
    letter or whitespace, converted to token ids with the module-level
    ``tokenizer``, post-padded to ``max_length`` and scored by the
    module-level ``model``.

    Args:
        text: The raw string to classify.
        threshold: Minimum model score for the text to be labelled
            "Harmful". Defaults to 0.5, the cut-off that was previously
            hard-coded.

    Returns:
        "Harmful" if the model score is greater than or equal to
        ``threshold``, otherwise "Safe".
    """
    text = text.lower()
    # Remove digits, punctuation and any other non-letter, non-whitespace character.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    sequence = tokenizer.texts_to_sequences([text])
    # Pad the same way as the training data so the input layout matches.
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')
    # predict() returns one row per input with a single sigmoid output; take that scalar.
    prediction = model.predict(padded_sequence)[0][0]
    return "Harmful" if prediction >= threshold else "Safe"

# Smoke-test the classifier on a single hand-written sentence.
test_text = "You are black and ugly. Move out"
predicted_label = predict_text(test_text)
print(f"Prediction: {predicted_label}")

Prediction: Harmful
