In [3]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [7]:
df = pd.read_csv('./dataset/clean_data.csv').head(1000)
df.rename(columns={'is_profane': 'label'}, inplace=True)
# df.rename(columns=[{"is_profane": 'label'}], inplace=True)

df.head()

Unnamed: 0,label,text
0,0,Then go to the village pump and suggest they c...
1,1,ANTI GREEK NATIONALIS -WIKIPEDIA \r\n\r\nHi Al...
2,1,Dis hoe wasnt dis violent on Lottery Ticket 😂😂
3,0,It is better for Atabay not helping the banned...
4,0,"""is in CamelCase. """"SiCKO"""" is not CamelCase,..."


In [8]:

# Step 1: Prepare the dataset
# data = {
#     'text': [
#         'This is a clean sentence.',
#         'Another example of a clean sentence.',
#         'This sentence contains a badword.',
#         'Another badword in this sentence.'
#     ],
#     'label': [0, 0, 1, 1]  # 0 for clean, 1 for profanity
# }

# df = pd.DataFrame(data)

# Step 2: Preprocess the text
MAX_NB_WORDS = 50000  # max number of words for the tokenizer
MAX_SEQUENCE_LENGTH = 100  # max length of each sequence
EMBEDDING_DIM = 100  # dimension of the GloVe embeddings

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(df['text'].values)
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
y = df['label'].values

# Step 3: Load pre-trained embeddings
GLOVE_DIR = "./dataset/glove.6B.100d.txt"  # Update this path to where GloVe is stored
embeddings_index = {}
with open(GLOVE_DIR, encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print(f'Found {len(embeddings_index)} word vectors.')

# Step 4: Create an embedding matrix
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Step 5: Build the model
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                    EMBEDDING_DIM,
                    weights=[embedding_matrix],
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

# Step 6: Train and evaluate the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), verbose=2)

# Evaluate the model
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))

# Optional: Predict new sentences
new_sentences = ['This is a new sentence with a badword.', 'Clean sentence here.']
new_sequences = tokenizer.texts_to_sequences(new_sentences)
new_padded_sequences = pad_sequences(new_sequences, maxlen=MAX_SEQUENCE_LENGTH)
predictions = (model.predict(new_padded_sequences) > 0.5).astype("int32")
print("Predictions for new sentences:", predictions)


Found 8905 unique tokens.
Found 400000 word vectors.




None
Epoch 1/5
12/12 - 5s - 435ms/step - accuracy: 0.7973 - loss: 0.4898 - val_accuracy: 0.8520 - val_loss: 0.3690
Epoch 2/5
12/12 - 1s - 108ms/step - accuracy: 0.8293 - loss: 0.3943 - val_accuracy: 0.8720 - val_loss: 0.3054
Epoch 3/5
12/12 - 1s - 106ms/step - accuracy: 0.8560 - loss: 0.3390 - val_accuracy: 0.9080 - val_loss: 0.2526
Epoch 4/5
12/12 - 1s - 115ms/step - accuracy: 0.8813 - loss: 0.3014 - val_accuracy: 0.9200 - val_loss: 0.2580
Epoch 5/5
12/12 - 1s - 112ms/step - accuracy: 0.8787 - loss: 0.2860 - val_accuracy: 0.9240 - val_loss: 0.2354
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 68ms/step
              precision    recall  f1-score   support

           0       0.93      0.99      0.96       213
           1       0.88      0.57      0.69        37

    accuracy                           0.92       250
   macro avg       0.90      0.78      0.82       250
weighted avg       0.92      0.92      0.92       250

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m