In [1]:
import nltk

# Fetch the NLTK data packages this script asks for, in a fixed order.
# NOTE(review): 'stopwords' is not referenced anywhere in the visible code --
# confirm whether it is still needed.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' when tokenizing -- confirm against the installed version.
for _nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_nltk_resource)

from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


# Toy labelled corpus used for the demo below.
# Each entry is a (sentence, label) pair; the label is either the string
# "profane" or "non-profane", and the two classes alternate through the list.
texts = [
    ("I hate you", "profane"),
    ("This is a neutral sentence", "non-profane"),
    ("You are stupid", "profane"),
    ("This is a positive review", "non-profane"),
    ("He is an idiot", "profane"),
    ("The weather is nice today", "non-profane"),
]

def tokenize_and_lemmatize(text):
    """Return *text* lower-cased, tokenized, lemmatized and re-joined.

    The string is lower-cased and split with ``nltk.word_tokenize``; every
    token is then passed through ``WordNetLemmatizer.lemmatize`` with its
    default arguments, and the results are joined with single spaces.

    Args:
        text: The raw sentence to normalize (must support ``.lower()``).

    Returns:
        A single space-separated string of lemmatized tokens.
    """
    words = nltk.word_tokenize(text.lower())
    # A fresh lemmatizer is created on every call, as in the original code.
    wnl = WordNetLemmatizer()
    return ' '.join(map(wnl.lemmatize, words))

# --- Data preparation -------------------------------------------------------
# Two parallel lists: the normalized sentences and their string labels.
processed_texts = [tokenize_and_lemmatize(sentence) for sentence, _ in texts]
labels = [tag for _, tag in texts]

# --- Feature extraction -----------------------------------------------------
# Bag-of-words counts over the normalized sentences.
# NOTE(review): the vectorizer is fitted on every sentence *before* the
# train/test split below, so its vocabulary also covers the held-out rows.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(processed_texts)

# --- Target encoding --------------------------------------------------------
# 1 marks 'profane'; every other label maps to 0.
y = [int(tag == 'profane') for tag in labels]

# --- Train / test split -----------------------------------------------------
# Fixed random_state keeps the split reproducible between runs.
# NOTE(review): `texts` holds only six sentences, so the held-out part is
# tiny and the accuracy printed below is not a meaningful estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Model ------------------------------------------------------------------
# Multinomial Naive Bayes, chosen for simplicity; fit() returns the estimator.
classifier = MultinomialNB().fit(X_train, y_train)

# --- Evaluation -------------------------------------------------------------
# Predict the held-out rows and report the fraction classified correctly.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ssharma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ssharma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ssharma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 1.00
