In [1]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from sklearn.model_selection import train_test_split

# Toy corpus for demonstration purposes only (replace with a real dataset).
# Each entry pairs a sentence with its label: 0 = Safe, 1 = Harmful.
_samples = [
    ("You are amazing!", 0),
    ("I hate you!", 1),
    ("Great job on your work.", 0),
    ("You are so stupid!", 1),
    ("Fantastic effort!", 0),
    ("Get lost, idiot!", 1),
]

# Column-oriented view of the pairs above, then the DataFrame built from it.
data = {
    "text": [sentence for sentence, _ in _samples],
    "label": [flag for _, flag in _samples],
}
df = pd.DataFrame(data)

# Text preprocessing
def clean_text(text):
    """Lower-case ``text`` and strip everything except the letters a-z/A-Z and whitespace.

    Digits, punctuation and any other symbols are removed outright (not
    replaced by a space); whitespace is left exactly as it was.
    """
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z\s]', '', lowered)

# Normalise every sentence once; the raw "text" column is left untouched.
df['cleaned_text'] = df['text'].apply(clean_text)

# Tokenization
# oov_token reserves a vocabulary index for words that were never seen while
# fitting. Without it, unknown words are silently dropped from the sequence,
# so a sentence made only of new words would reach the model as pure padding.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned_text'])
X = tokenizer.texts_to_sequences(df['cleaned_text'])
X = pad_sequences(X, padding='post')  # pad on the right up to the longest sentence
y = np.array(df['label'])

# Splitting dataset
# stratify=y keeps both classes represented in the train and the test
# partition, which an unstratified split cannot guarantee on so few rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define the model
# Seed the NumPy and TensorFlow global RNGs so that training runs are
# repeatable (the train/test split is already seeded via random_state=42).
np.random.seed(42)
tf.random.set_seed(42)

vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is used for padding
embedding_dim = 64
max_length = X.shape[1]  # padded sequence length produced by pad_sequences

# Embedding -> dropout over embedding channels -> single LSTM -> sigmoid output
# giving one score per input sequence.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# The held-out split is passed as validation data so its loss/accuracy are
# reported after every epoch.
model.fit(X_train, y_train, epochs=10, batch_size=4, validation_data=(X_test, y_test))

# Save the model
model.save("safespeak.h5")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [2]:
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

# Load the trained model
# The training cell writes the network to "safespeak.h5"; load that same file
# (not "safespeak_model.h5") so this cell works after a fresh Restart & Run All.
model = load_model("safespeak.h5")

# Define text preprocessing function
def clean_text(text):
    """Return ``text`` lower-cased with all non-letter, non-whitespace characters removed.

    Mirrors the cleaning applied to the training sentences so that text
    scored at inference time is prepared the same way.
    """
    return re.sub(r'[^a-zA-Z\s]', '', text.lower())  # Remove special characters

# Function to predict if content is offensive
def predict_text(text, threshold=0.5):
    """Classify a piece of text as "Harmful" or "Safe".

    The text is cleaned with ``clean_text``, converted to a token sequence,
    padded to the training sequence length and scored by the model.

    Relies on the notebook-level ``tokenizer``, ``X`` and ``model`` objects,
    so the training cell must have been run in the same kernel.

    Args:
        text: Raw input string.
        threshold: Minimum model score for the "Harmful" label. Defaults to
            0.5, the cut-off that was previously hard-coded.

    Returns:
        "Harmful" if the model score is >= ``threshold``, otherwise "Safe".
    """
    text = clean_text(text)  # Apply the same preprocessing
    sequence = tokenizer.texts_to_sequences([text])  # Tokenize
    padded_sequence = pad_sequences(sequence, maxlen=X.shape[1], padding='post')  # Pad sequence
    # verbose=0 keeps the Keras progress bar out of the cell output.
    prediction = float(model.predict(padded_sequence, verbose=0)[0][0])
    return "Harmful" if prediction >= threshold else "Safe"

# Smoke-test the classifier on a sentence it has not seen before
test_text = """ What the fuck """
predicted_label = predict_text(test_text)
print(f"Prediction: {predicted_label}")


Prediction: Harmful
