In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer



In [2]:
df = pd.read_csv("diffusion_db_unaltered.csv")
df

Unnamed: 0,prompt,image_nsfw,prompt_nsfw
0,"a portrait of a female robot made from code, v...",0.554853,0.001621
1,a portrait of a female robot made from a cloud...,0.153645,0.000707
2,"only memories remain, trending on artstation",0.062496,0.000425
3,dream swimming pool with nobody,0.030799,0.003586
4,a dog doing weights. epic oil painting.,0.181035,0.030822
...,...,...,...
1999995,david bowie giving a piggy back ride to ziggy ...,0.152736,0.022676
1999996,david bowie giving a piggy back ride to ziggy ...,0.127884,0.014115
1999997,funny computer,0.093476,0.000463
1999998,hilarious witty computing machine,0.124379,0.002547


In [3]:
df['nsfw'] = (df['image_nsfw'] >= 0.7) | (df['prompt_nsfw'] >= 0.7)
df

Unnamed: 0,prompt,image_nsfw,prompt_nsfw,nsfw
0,"a portrait of a female robot made from code, v...",0.554853,0.001621,False
1,a portrait of a female robot made from a cloud...,0.153645,0.000707,False
2,"only memories remain, trending on artstation",0.062496,0.000425,False
3,dream swimming pool with nobody,0.030799,0.003586,False
4,a dog doing weights. epic oil painting.,0.181035,0.030822,False
...,...,...,...,...
1999995,david bowie giving a piggy back ride to ziggy ...,0.152736,0.022676,False
1999996,david bowie giving a piggy back ride to ziggy ...,0.127884,0.014115,False
1999997,funny computer,0.093476,0.000463,False
1999998,hilarious witty computing machine,0.124379,0.002547,False


In [4]:
explicit_words = [
    'Sex', 
    'Nudity', 
    'Blood', 
    'Loli', 
    'Lolicon', 
    'Shota', 
    'Shotacon', 
    'Gore', 
    'Bloody', 
    'Nude',
    'S3x',
    'Porn',
    'Boobs',
    'Damn',
    'Shit',
    'Fuck',
    'Sensual',
    'Kill',
    'Murder',
    'Assault',
    'Torture',
    'Suicide',
    'Stab',
    'Bomb',
    'Shoot',
    'Decapitate',
    'Hentai',
    'Orgasm',
    'Rape',
    'Hang',
    'Burn',
    'Overdose',
    'Rob',
    'Gun',
    'Exposed',
    'Exposing',
    'Expose',
    'Flirty',
    'Flirt',
    'Bra',
    'Cleavage',
    'Piercing',
    'Goth',
    'Leggings',
    'Lingerie',
    'Hot',
    'Body',
    'Breasts',
    'Nude',
    'Abs',
    'intertwined',
    'Dripping',
    'super exposing',
    'skin texture',
    'navel',
    'tight',
    'thighhigh',
    'Full Body',
    'Only Towel',
    'Massacre',
    'Brutal',
    'Armed',
    'Terrorist',
    'Bombing',
    'Dead',
    'Full-Body',
    'Showing Piercing',
    'riot',
    'violence',
    'intricate',
    'shooting',
    
    
]  

In [5]:
def count_explicit_words(text, explicit_list):
    if not isinstance(text, str):
        return 0  
    text = text.lower()  
    return sum(len(re.findall(rf'\b{re.escape(word)}\b', text)) for word in explicit_list)

In [6]:
df['Explicit_Word_Count'] = df['prompt'].apply(lambda x: count_explicit_words(x, explicit_words))
df

Unnamed: 0,prompt,image_nsfw,prompt_nsfw,nsfw,Explicit_Word_Count
0,"a portrait of a female robot made from code, v...",0.554853,0.001621,False,1
1,a portrait of a female robot made from a cloud...,0.153645,0.000707,False,1
2,"only memories remain, trending on artstation",0.062496,0.000425,False,0
3,dream swimming pool with nobody,0.030799,0.003586,False,0
4,a dog doing weights. epic oil painting.,0.181035,0.030822,False,0
...,...,...,...,...,...
1999995,david bowie giving a piggy back ride to ziggy ...,0.152736,0.022676,False,0
1999996,david bowie giving a piggy back ride to ziggy ...,0.127884,0.014115,False,0
1999997,funny computer,0.093476,0.000463,False,0
1999998,hilarious witty computing machine,0.124379,0.002547,False,0


In [7]:
df.iloc[0]['prompt']

'a portrait of a female robot made from code, very intricate details, octane render, 8 k, trending on artstation '

In [8]:
X = df['prompt']
y = (df['prompt_nsfw'] >= 0.7).astype(int)

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.8, random_state=42)

print(f"Training size: {len(X_train)}, Testing size: {len(X_test)}")


Training size: 400000, Testing size: 1600000


In [81]:
X_train = X_train.fillna("")
X_test = X_test.fillna("")

In [83]:
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [85]:
feature_names = np.array(vectorizer.get_feature_names_out())


important_words = explicit_words  # Words to emphasize
boost_factor = 100  # Increase their weight

# Find indices of those words in the vectorized output
important_word_indices = [i for i, word in enumerate(feature_names) if word in important_words]

# Apply boost
X_train_vect[:, important_word_indices] *= boost_factor
X_test_vect[:, important_word_indices] *= boost_factor

In [87]:
clf = MultinomialNB(alpha=.5)

In [89]:
clf.fit(X_train_vect, y_train)

In [91]:
y_pred = clf.predict(X_test_vect)

In [93]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [95]:
y_pred_labels = pd.Series(y_pred).map({1: "Harmful", 0: "Harmless"})
y_pred_labels

0          Harmless
1          Harmless
2          Harmless
3          Harmless
4          Harmless
             ...   
1599995    Harmless
1599996    Harmless
1599997    Harmless
1599998    Harmless
1599999    Harmless
Length: 1600000, dtype: object

In [97]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [98]:
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Accuracy: 0.99
Precision: 0.46
Recall: 0.04
F1 Score: 0.07


In [105]:
def classify_prompt(prompt, vectorizer, model):
    prompt_vect = vectorizer.transform([prompt])  # Transform input text
    prediction = model.predict(prompt_vect)[0]  # Get prediction (0 or 1)
    return "Harmful" if prediction == 1 else "Harmless"

# Example usage
user_prompt = input("Enter your prompt: ")  # Take user input
result = classify_prompt(user_prompt, vectorizer, clf)  # Classify it

print(f"Prediction: {result}")

Enter your prompt:  President Donal Trump licking Elon Musk's toes.


Prediction: Harmless
