In [3]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re

In [5]:
df = pd.read_csv("diffusion_db_unaltered.csv")
df['nsfw'] = (df['image_nsfw'] >= 0.7) | (df['prompt_nsfw'] >= 0.7)

In [7]:
explicit_words = [
    'Sex', 'Nudity', 'Blood', 'Loli', 'Lolicon', 'Shota', 'Shotacon', 'Gore', 'Bloody', 'Nude',
    'S3x', 'Porn', 'Boobs', 'Damn', 'Shit', 'Fuck', 'Sensual', 'Kill', 'Murder', 'Assault', 
    'Torture', 'Suicide', 'Stab', 'Bomb', 'Shoot', 'Decapitate', 'Hentai', 'Orgasm', 'Rape', 
    'Hang', 'Burn', 'Overdose', 'Rob', 'Gun', 'Exposed', 'Exposing', 'Expose', 'Flirty', 'Flirt', 
    'Bra', 'Cleavage', 'Piercing', 'Goth', 'Leggings', 'Lingerie', 'Hot', 'Body', 'Breasts', 
    'Nude', 'Abs', 'intertwined', 'Dripping', 'super exposing', 'skin texture', 'navel', 'tight', 
    'thighhigh', 'Massacre', 'Brutal', 'Armed', 'Terrorist', 'Bombing', 
    'Dead', 'Full-Body', 'Showing-Piercing', 'riot', 'violence', 'intricate', 'shooting'
]

In [9]:
def count_explicit_words(text, explicit_list):
    if not isinstance(text, str):
        return 0  
    text = text.lower()
    return sum(len(re.findall(rf'\b{re.escape(word)}\b', text)) for word in explicit_list)

In [12]:
df['prompt'] = df['prompt'].fillna("")  # Fix NaNs
df['Explicit_Word_Count'] = df['prompt'].apply(lambda x: count_explicit_words(x, explicit_words))

In [13]:
X = df['prompt']
y = df['nsfw'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
X_train = X_train.fillna("")  
X_test = X_test.fillna("")

In [15]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [25]:
boost_factor = 3
important_word_indices = [i for i, word in enumerate(vectorizer.get_feature_names_out()) if word in explicit_words]

# Convert important_word_indices into a sparse multiplier matrix
from scipy.sparse import csr_matrix

multiplier = np.ones(X_train_vect.shape[1])
multiplier[important_word_indices] = boost_factor

# Apply boosting without converting to a dense array
X_train_vect = X_train_vect.multiply(multiplier)
X_test_vect = X_test_vect.multiply(multiplier)


In [27]:
clf = MultinomialNB(alpha=0.5)
clf.fit(X_train_vect, y_train)

In [29]:
y_pred = clf.predict(X_test_vect)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97    376650
           1       0.68      0.07      0.12     23350

    accuracy                           0.94    400000
   macro avg       0.81      0.53      0.55    400000
weighted avg       0.93      0.94      0.92    400000



In [31]:
def classify_prompt(prompt, vectorizer, model):
    return {1: "Harmful", 0: "Harmless"}[model.predict(vectorizer.transform([prompt]))[0]]

In [37]:
user_prompt = input("Enter your prompt: ")
print(f"Prediction: {classify_prompt(user_prompt, vectorizer, clf)}")

Enter your prompt:  President Donal Trump licking Elon Musk's toes.


Prediction: Harmless


In [39]:
df

Unnamed: 0,prompt,image_nsfw,prompt_nsfw,nsfw,Explicit_Word_Count
0,"a portrait of a female robot made from code, v...",0.554853,0.001621,False,1
1,a portrait of a female robot made from a cloud...,0.153645,0.000707,False,1
2,"only memories remain, trending on artstation",0.062496,0.000425,False,0
3,dream swimming pool with nobody,0.030799,0.003586,False,0
4,a dog doing weights. epic oil painting.,0.181035,0.030822,False,0
...,...,...,...,...,...
1999995,david bowie giving a piggy back ride to ziggy ...,0.152736,0.022676,False,0
1999996,david bowie giving a piggy back ride to ziggy ...,0.127884,0.014115,False,0
1999997,funny computer,0.093476,0.000463,False,0
1999998,hilarious witty computing machine,0.124379,0.002547,False,0
