# BUILD AN NLP NEURAL NETWORK - BY YERIKO VARGAS

This Python script demonstrates a basic framework for Natural Language Processing (NLP) using NLTK and for Deep Learning using Keras.

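In the NLP section, we tokenize the sentences into words and convert them into padded integer sequences using the Keras `Tokenizer`. Note that the NLTK helpers (`word_tokenize`, `stopwords`) are imported, but stop-word filtering and lemmatization are not applied in the code shown below.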

In the Deep Learning section, we preprocessed the data by splitting it into training and testing sets. We then built a simple neural network model, compiled it, trained it, and finally evaluated its accuracy.

In [None]:
#                                      Text Input
#                                           |
#                                      Tokenization
#                                           |
#                                  Data Preprocessing
#                                           |
#                                  Neural Network Model
#                                           |
#                                       Training
#                                           |
#                                     Evaluation
#                                           |
#                                   Predict Bad Words



# Natural Language Processing (NLP) Basic Framework using NLTK
# -------------------------------------------------------------
# Library Import
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Words treated as offensive (extend as needed).
# NOTE: nothing below reads this list; the labels are assigned by hand.
bad_words = ['badword1', 'badword2']

# Toy corpus with one label per sentence: 0 = clean text, 1 = contains a bad word.
sentences = ["This is a clean sentence", "This sentence contains a badword1"]
labels = np.array([0, 1])

# Encode every sentence as a list of vocabulary indices; "<OOV>" is the
# placeholder token for out-of-vocabulary words.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Pad at the end of each sequence so all rows share the same length.
padded_sequences = pad_sequences(sequences, padding="post")

# Hold out 20% of the samples for evaluation.
split = train_test_split(padded_sequences, labels, test_size=0.2)
x_train, x_test, y_train, y_test = split

# Fully connected binary classifier: two hidden ReLU layers of 12 units and a
# single sigmoid output unit. The input width is the padded sequence length.
feature_count = x_train.shape[1]
network_layers = [
    Dense(12, input_shape=(feature_count,), activation='relu'),
    Dense(12, activation='relu'),
    Dense(1, activation='sigmoid'),
]
model = Sequential(network_layers)

# Configure the optimisation objective and the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training portion.
model.fit(x_train, y_train, batch_size=10, epochs=15)

# Score on the held-out portion; evaluate() yields the loss followed by the
# accuracy metric requested in compile().
test_loss, accuracy = model.evaluate(x_test, y_test)
print(f"Model accuracy: {accuracy*100:.2f}%")

# Prediction function
def predict_bad_words(sentence: str) -> bool:
    """Classify a single sentence as containing bad words or not.

    The sentence is encoded with the module-level ``tokenizer``, padded or
    truncated to the width of the training matrix (``x_train.shape[1]``),
    and scored by the module-level ``model``.

    Args:
        sentence: Raw text to classify.

    Returns:
        True when the model's output for the sentence is above 0.5,
        otherwise False.
    """
    sequence = tokenizer.texts_to_sequences([sentence])
    # Force the same width as the training data so the input matches the
    # shape the model's first layer was built with.
    padded_sequence = pad_sequences(sequence, padding="post", maxlen=x_train.shape[1])
    prediction = model.predict(padded_sequence)
    # Exactly one sentence was passed and the model has a single output unit,
    # so select that one score explicitly instead of relying on the implicit
    # truth value of a whole array.
    return bool(prediction[0][0] > 0.5)

# Test prediction function
# Run the classifier once on a sample sentence and report the verdict.
sample_sentence = "This is a sentence with badword1"
result = predict_bad_words(sample_sentence)
print("Contains bad words:", result)

