In [7]:
# pip install tensorflow nltk scikit-learn

In [8]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK resources
# NOTE(review): `word_tokenize` is imported above but is never called in the
# cells visible here (tokenisation is done by the Keras `Tokenizer`) — confirm
# whether this download and the nltk imports are still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ssharma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# Load the first 1000 rows of the cleaned dataset.
# `nrows` stops parsing after 1000 records instead of reading the whole CSV
# and then throwing most of it away with `.head(1000)`.
df = pd.read_csv('./dataset/clean_data.csv', nrows=1000)

# Fail early, with a readable message, if the columns the training cell
# relies on are missing from the file.
assert {'text', 'is_profane'} <= set(df.columns), "expected 'text' and 'is_profane' columns"

# Preview the frame (rich display as the cell's last expression).
df.head()

Unnamed: 0,is_profane,text
0,0,Then go to the village pump and suggest they c...
1,1,ANTI GREEK NATIONALIS -WIKIPEDIA \r\n\r\nHi Al...
2,1,Dis hoe wasnt dis violent on Lottery Ticket 😂😂
3,0,It is better for Atabay not helping the banned...
4,0,"""is in CamelCase. """"SiCKO"""" is not CamelCase,..."


In [10]:
# Seed Python, NumPy and TensorFlow together so weight initialisation, dropout
# masks and batch shuffling are repeatable across Restart & Run All.
seed = 42
tf.keras.utils.set_random_seed(seed)

# Model inputs: raw text and the binary target.
# Missing text is replaced by an empty string because the Keras Tokenizer
# calls string methods on every entry and would fail on a NaN float.
sentences = df['text'].fillna('').astype(str)
print(sentences)

labels = df['is_profane']
print(labels)

# Tokenize and pad sequences
max_words = 5000   # vocabulary cap: only the most frequent words get an index
max_len = 50       # every sequence is padded/truncated to this many tokens

tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=seed
)

# Build LSTM model
embedding_dim = 128

model = Sequential()
# `input_length` is deprecated in Keras 3 and only triggers a warning there;
# the sequence length is inferred from the data on the first call.
model.add(Embedding(max_words, embedding_dim))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# The model must be compiled before `fit`; without this the cell fails on a
# fresh kernel. Binary cross-entropy matches the single sigmoid output, and
# 'accuracy' is the metric unpacked from `model.evaluate` below.
# NOTE(review): the optimizer used for the recorded run is not visible in the
# notebook; 'adam' is assumed — confirm.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
batch_size = 32
epochs = 5

# NOTE: the held-out split doubles as validation data here, so the per-epoch
# val_* numbers and the final evaluation below are computed on the same rows.
history = model.fit(X_train, np.array(y_train), epochs=epochs, batch_size=batch_size,
                    validation_data=(X_test, np.array(y_test)), verbose=2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, np.array(y_test), verbose=2)
print(f'Accuracy: {accuracy*100:.2f}%')


0      Then go to the village pump and suggest they c...
1      ANTI GREEK NATIONALIS -WIKIPEDIA \r\n\r\nHi Al...
2         Dis hoe wasnt dis violent on Lottery Ticket 😂😂
3      It is better for Atabay not helping the banned...
4      "is in CamelCase.  ""SiCKO"" is not CamelCase,...
                             ...                        
995     like fuccck man why yall bitches wearing wigs...
996      I just wanna travel the world with my bitches 😩
997    FUCK YOU BITCH\r\nKiss my ass, you dickless tr...
998    depends what bbg stands for.\r\n\r\nAlso, was ...
999    " (UTC)\r\n\r\nHey, do you prefer ""useful vol...
Name: text, Length: 1000, dtype: object
0      0
1      1
2      1
3      0
4      0
      ..
995    1
996    1
997    1
998    0
999    0
Name: is_profane, Length: 1000, dtype: int64
Epoch 1/5




25/25 - 4s - 176ms/step - accuracy: 0.8087 - loss: 0.5313 - val_accuracy: 0.8150 - val_loss: 0.4380
Epoch 2/5
25/25 - 1s - 40ms/step - accuracy: 0.8350 - loss: 0.3839 - val_accuracy: 0.8450 - val_loss: 0.3653
Epoch 3/5
25/25 - 1s - 39ms/step - accuracy: 0.8750 - loss: 0.3087 - val_accuracy: 0.8600 - val_loss: 0.3220
Epoch 4/5
25/25 - 1s - 39ms/step - accuracy: 0.9212 - loss: 0.2128 - val_accuracy: 0.8900 - val_loss: 0.2960
Epoch 5/5
25/25 - 1s - 41ms/step - accuracy: 0.9638 - loss: 0.1270 - val_accuracy: 0.9000 - val_loss: 0.2844
7/7 - 0s - 20ms/step - accuracy: 0.9000 - loss: 0.2844
Accuracy: 90.00%


In [11]:
# Make predictions on new data
def predict_profanity(sentence):
    """Score a single sentence with the trained profanity model.

    Relies on the notebook-level ``tokenizer``, ``max_len`` and ``model``
    created in the training cell.

    Args:
        sentence: Raw text to classify.

    Returns:
        The model's sigmoid output for the sentence (element ``[0][0]`` of the
        prediction array).
    """
    # Encode with the fitted tokenizer; wrap in a list because it expects a batch.
    encoded = tokenizer.texts_to_sequences([sentence])
    # Pad/truncate to the same length the model was trained on.
    model_input = pad_sequences(encoded, maxlen=max_len)
    scores = model.predict(model_input)
    # Echo the raw prediction array before returning the scalar.
    print(scores)
    return scores[0][0]


In [12]:
# Test the model
test_sentences = [
    "I love you",
    "You are so dumb",
    "This is fantastic",
    "You are a fool",
    "Biatch is hoe",
    "ANTI GREEK NATIONALIS -WIKIPEDIA",
    "Dis hoe wasnt dis violent on Lottery Ticket",
]

testData = [
    "Hi Bruv, Hope you are doing well",
    "Love this product",
    "She's a hoe",
    'Suck water from bottle',
    "write the article ibtch",
    "Idiot, A cup of cum for you! DRINK IT ALL PLZ",
    "A cup of cum for you! DRINK IT ALL PLZ",
]

# Score both batches in order (first `test_sentences`, then `testData`) and
# print one summary line per sentence.
for batch in (test_sentences, testData):
    for sample in batch:
        score = predict_profanity(sample)
        print(f'Sentence: "{sample}" - Profanity Probability: {score:.2f}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 337ms/step
[[0.60901487]]
Sentence: "I love you" - Profanity Probability: 0.61
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[[0.41655546]]
Sentence: "You are so dumb" - Profanity Probability: 0.42
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[[0.08622228]]
Sentence: "This is fantastic" - Profanity Probability: 0.09
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[[0.6747503]]
Sentence: "You are a fool" - Profanity Probability: 0.67
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[[0.50418556]]
Sentence: "Biatch is hoe" - Profanity Probability: 0.50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[[0.31595683]]
Sentence: "ANTI GREEK NATIONALIS -WIKIPEDIA" - Profanity Probability: 0.32
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[[0.8961495]]
Sentence: "Dis hoe wasnt dis viol