# Klasifikasi Teks Erotis (Pornoteks) Bahasa Indonesia

In [1]:
import pandas as pd
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Dropout, Conv1D
from tensorflow.keras.layers import TextVectorization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [2]:
dataset = pd.read_csv("dataset500.csv")

X = dataset["text"]
y = dataset["label"]

In [3]:
Text_train, Text_temp, Label_train, Label_temp = train_test_split(X, y, test_size=0.4, random_state=42)

Text_val, Text_test, Label_val, Label_test = train_test_split(Text_temp, Label_temp, test_size=0.5, random_state=42)

In [4]:
# Create a LabelEncoder
label_encoder = LabelEncoder()

# Fit the encoder on your labels and transform them
Label_train = label_encoder.fit_transform(Label_train)
Label_val = label_encoder.transform(Label_val)
Label_test = label_encoder.transform(Label_test)

# 0: NEGATIF
# 1: POSITIF

In [5]:
train_dataset = tf.data.Dataset.from_tensor_slices((Text_train, Label_train))
val_dataset = tf.data.Dataset.from_tensor_slices((Text_val, Label_val))
test_dataset = tf.data.Dataset.from_tensor_slices((Text_test, Label_test))

In [6]:
embedding_layer = tf.keras.layers.Embedding(1000, 5)

In [7]:
result = embedding_layer(tf.constant([1, 2, 3]))
result.numpy()

array([[ 3.34560610e-02, -4.64357510e-02, -1.48817785e-02,
         4.64683771e-03, -7.69165903e-03],
       [-6.45349175e-03,  1.01177469e-02, -5.72475046e-03,
        -9.78541374e-03, -9.81241465e-06],
       [ 4.89111580e-02,  4.52224277e-02,  1.60588138e-02,
         4.64906357e-02, -5.66937774e-03]], dtype=float32)

In [8]:
result = embedding_layer(tf.constant([[0, 1, 2], [3, 4, 5]]))
result.shape

TensorShape([2, 3, 5])

In [9]:
# Update the train_dataset, val_dataset, test_dataset specifications
batch_size = 32  # You can set the batch size to your desired value

train_dataset = train_dataset.batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

In [10]:
train_dataset = train_dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
val_dataset = val_dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
test_dataset = test_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

In [11]:
for text_batch, label_batch in train_dataset.take(1):
  for i in range(10):
    print("Text: ", text_batch.numpy()[i])
    print("Label:", label_batch.numpy()[i])

Text:  b'mas kocok mas eghh mas yang dalam kocok terus selangkanganku aduhh eghh mas enakk'
Label: 0
Text:  b'sementara tangan kananku mulai menggerayangi memek  yang sudah mulai basahaku usap-usap bibir memek tante dengan lembut hingga desahan-desahan menggairahkan semakin keras dari bibirnya'
Label: 0
Text:  b'pemodelan topik merupakan metode untuk menemukan tema utama yang mencakup koleksi dokumen besar dan tidak terstruktur yang dapat menyusun dataset sesuai dengan tema yang ditemukan di dalamnya'
Label: 1
Text:  b'tahap ini dilakukan dengan mencari, menggali dan mempelajari informasi yang berhubungan dengan skripsi ini. informasi didapat melalui buku-buku referensi atau sumber-sumber yang berkaitan dengan skripsi ini, baik dari text book maupun internet'
Label: 1
Text:  b'Banyak juga wanita yang lebih menyukai rangsangan seksual pada klitoris secara langsung atau manual'
Label: 1
Text:  b'Makanan yang mengandung CoQ10 dan likopen juga dapat membantu meningkatkan libido'
Label: 1
T

In [12]:
for i, label in enumerate(label_encoder.classes_):
  print("Label", i, "corresponds to", label)

Label 0 corresponds to negatif
Label 1 corresponds to positif


In [13]:
embedding_dim = 16
vocab_size = 10000
sequence_length = 100

vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)

In [14]:
train_text = train_dataset.map(lambda text, labels: text)
vectorize_layer.adapt(train_text)
print(len(vectorize_layer.get_vocabulary()))

3590


In [15]:
# Retrieve a batch (of 32 reviews and labels) from the dataset.
text_batch, label_batch = next(iter(train_dataset))
first_text, first_label = text_batch[0], label_batch[0]
print("Text:", first_text)
print("Label:", first_label)

Text: tf.Tensor(b'mas kocok mas eghh mas yang dalam kocok terus selangkanganku aduhh eghh mas enakk', shape=(), dtype=string)
Label: tf.Tensor(0, shape=(), dtype=int64)


In [16]:
print("'int' vectorized text:",
      vectorize_layer(first_text).numpy())

'int' vectorized text: [ 134  821  134  504  134    3   18  821   35 1012 1463  504  134 3026
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [17]:
print("2000 ---> ", vectorize_layer.get_vocabulary()[2000])
print("205 ---> ", vectorize_layer.get_vocabulary()[205])
print("Vocabulary size: {}".format(len(vectorize_layer.get_vocabulary())))

2000 --->  pergumulan
205 --->  menjilati
Vocabulary size: 3590


In [18]:
model = Sequential([
  vectorize_layer,
  Embedding(vocab_size, embedding_dim, name="embedding"),
  GlobalAveragePooling1D(),
  Dropout(0.1),
  Dense(16, activation='relu'),
  Dropout(0.1),
  Dense(1, activation="sigmoid")
])

In [19]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',  # Use binary cross-entropy for binary classification
              metrics=['accuracy'])

In [20]:
model.fit(
    train_dataset,  # Shuffle and batch the training data
    validation_data=val_dataset,  # Batch the validation data
    epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7984cd5e6350>

In [21]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVe  (None, 100)               0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense (Dense)               (None, 16)                272       
                                                                 
 dropout_1 (Dropout)         (None, 16)                0

In [22]:
model.save("my_model", save_format='tf')

In [23]:
from google.colab import files

!zip -r saved_model.zip my_model
files.download("saved_model.zip")

  adding: my_model/ (stored 0%)
  adding: my_model/keras_metadata.pb (deflated 87%)
  adding: my_model/assets/ (stored 0%)
  adding: my_model/saved_model.pb (deflated 80%)
  adding: my_model/variables/ (stored 0%)
  adding: my_model/variables/variables.data-00000-of-00001 (deflated 55%)
  adding: my_model/variables/variables.index (deflated 56%)
  adding: my_model/fingerprint.pb (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [24]:
loaded = tf.saved_model.load('my_model')

In [25]:
true_labels = []
for text_batch, label_batch in test_dataset:
  for i in range(len(label_batch)):
    true_labels.append(label_batch.numpy()[i])

print(len(true_labels))

200


In [26]:
examples = [
    "ngentot yuk",
    "Uhh kegedean tuh",
    "rasanya pen bet dah gua sedot tuh gunung",
    "Kaka Buka Dong Bajunya",
    "Love yang mau remas t3t3knya",
    "Seru nih kenyot nenen nya",
    "ngentot yu",
    "toket",
    "Kumpulan cerita sex terbaru, cersex bergambar, kisah hot, cerita dewasa, cerita mesum, cerpen ngentot, cerita selingkuh dan cerita seks pemerkosaan",
    "bacapres 2024 adu debat di TVONE",
    "aku remas t3t3knya",
    "owhh hhahh nghhahh goyang yang keras! hhahh ennnakkkhh mmhh enakkkhterus katryn bilang, mau spank nggak? katanya",
    "saya jambak rambutnya yang panjang dan saya tarik sampai bisa saya remas payudaranya yang besar",
    "enak banget hari ini aku bercinta sama pacarku",
    "gigitannya yang kecil-kecil ini yang membuat saya meringis kenikmatan campur kesakitan yang bikin saya takluk sama dia"
]

predictions = model.predict(test_dataset)

# Define a threshold for classification (e.g., 0.5)
threshold = 0.5

# Use a conditional statement to get the predicted labels as strings
predicted_labels = [1 if p >= threshold else 0 for p in predictions]
print(len(predicted_labels))

accuracy = accuracy_score(true_labels, predicted_labels)
print(accuracy)
# Print the predicted labels
# for example, label in zip(Text_test, predicted_labels):
    # print(f"Input Text: {Text_test}")
    # print(f"Predicted Label: {label}\n")

200
0.97
