# Data Wrangling


In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns


# bbc-text.csv
!gdown --id 1KLn3NLLv2rng2vV_8Ys3Yn53iahv9shd

Downloading...
From (original): https://drive.google.com/uc?id=1KLn3NLLv2rng2vV_8Ys3Yn53iahv9shd
From (redirected): https://drive.google.com/uc?id=1KLn3NLLv2rng2vV_8Ys3Yn53iahv9shd&confirm=t&uuid=ecddecb3-ccbe-4672-bfe6-cf88ba9f9643
To: /content/Suicide_Detection.csv
100% 167M/167M [00:01<00:00, 148MB/s]


In [68]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [74]:
data = pd.read_csv('Suicide_Detection.csv')
display(data.head())

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [75]:
data.info()
display(data.isna().sum())
print("Jumlah duplikasi: ", data.duplicated().sum())
display(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232074 entries, 0 to 232073
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  232074 non-null  int64 
 1   text        232074 non-null  object
 2   class       232074 non-null  object
dtypes: int64(1), object(2)
memory usage: 5.3+ MB


Unnamed: 0,0
Unnamed: 0,0
text,0
class,0


Jumlah duplikasi:  0


Unnamed: 0.1,Unnamed: 0
count,232074.0
mean,174152.863518
std,100500.425362
min,2.0
25%,87049.25
50%,174358.5
75%,261285.75
max,348110.0


In [None]:
# vocab_size = 1000
# embedding_dim = 16
# max_length = 120
# trunc_type='post'
# padding_type='post'
# oov_tok = "<OOV>"
# training_portion = .8

def preprocess_data(df):
    # Encoding labels (Binary)
    le = LabelEncoder()
    df['class'] = le.fit_transform(df['class'])

    # Tokenization
    max_words = 5000
    max_len = 200
    tokenizer = Tokenizer(num_words=max_words, lower=True)
    tokenizer.fit_on_texts(df['text'])

    X = tokenizer.texts_to_sequences(df['text'])
    X = pad_sequences(X, maxlen=max_len)

    y = df['class'].values

    return X, y, tokenizer, le

def create_binary_lstm_model(input_shape):
    model = Sequential([
        # Embedding layer
        Embedding(5000, 128, input_length=input_shape),

        # LSTM Layers
        LSTM(128, return_sequences=True),
        Dropout(0.3),
        LSTM(64),

        # Dense Layers
        Dense(64, activation='relu'),
        Dropout(0.3),

        # Binary Output with Sigmoid
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',  # Binary Cross Entropy for Binary Classification
        metrics=['accuracy']
    )

    return model

def train_and_evaluate_model(X, y):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Early Stopping
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    )

    # Create and Train Model
    model = create_binary_lstm_model(X_train.shape[1])

    history = model.fit(
        X_train, y_train,
        epochs=10,
        batch_size=64,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=1
    )

    # Predict probabilities
    y_pred_proba = model.predict(X_test).flatten()
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Evaluation Metrics
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # ROC Curve
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Visualisasi
    plt.figure(figsize=(12,4))

    # Confusion Matrix
    plt.subplot(1,2,1)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')

    # ROC Curve
    plt.subplot(1,2,2)
    plt.plot(fpr, tpr, color='darkorange', lw=2,
             label=f'ROC Curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")

    plt.tight_layout()
    plt.show()

    return model, history

# Main Execution
def main():
    # Ganti dengan path dataset Anda di Google Colab
    file_path = '/content/suicide_detection.csv'

    # Load Dataset
    df = data

    # Preprocessing
    X, y, tokenizer, label_encoder = preprocess_data(df)

    # Train and Evaluate
    model, history = train_and_evaluate_model(X, y)

    # Plotting Training History
    plt.figure(figsize=(12,4))
    plt.subplot(1,2,1)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.subplot(1,2,2)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

# Jalankan fungsi main
main()



Epoch 1/10
[1m1636/2321[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m9:50[0m 862ms/step - accuracy: 0.8765 - loss: 0.3141

In [66]:
sentences = []
labels = []
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [52]:
with open("./Suicide_Detection.csv", 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        labels.append(row[0])
        sentence = row[1]
        for word in stopwords:
            token = " " + word + " "
            sentence = sentence.replace(token, " ")
        sentences.append(sentence)

In [53]:
train_size = int(len(sentences) * training_portion)

train_sentences = sentences[:train_size]
train_labels = labels[:train_size]

validation_sentences = sentences[train_size:]
validation_labels = labels[train_size:]

In [54]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)

validation_sequences = tokenizer.texts_to_sequences(validation_sentences)
validation_padded = pad_sequences(validation_sequences, padding=padding_type, maxlen=max_length)

In [55]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

In [62]:
def create_model(vocab_size, embedding_dim, max_length):
  model = models.Sequential([
  layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
  layers.GlobalAveragePooling1D(),
  layers.Dense(64, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dense(1, activation='sigmoid')
  ])

  # model = tf.keras.models.Sequential([
  #     tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
  #     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, return_sequences=True)),
  #     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
  #     tf.keras.layers.Dense(embedding_dim, activation='relu'),
  #     tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
  # ])

  model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=optimizers.Adam(),
                metrics=['accuracy'])

  return model

In [63]:
model = create_model(vocab_size, embedding_dim, max_length)
model.summary()

In [64]:
num_epochs = 30
history = model.fit(train_padded, training_label_seq,
                    epochs=num_epochs,
                    validation_data=(validation_padded, validation_label_seq))

Epoch 1/30
[1m5802/5802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4ms/step - accuracy: 9.5428e-06 - loss: -35893943468032.0000 - val_accuracy: 0.0000e+00 - val_loss: -1453374385946624.0000
Epoch 2/30
[1m5802/5802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 3ms/step - accuracy: 1.0016e-05 - loss: -1456139002707968.0000 - val_accuracy: 0.0000e+00 - val_loss: -13008637748314112.0000
Epoch 3/30
[1m5802/5802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 4ms/step - accuracy: 2.1205e-06 - loss: -8751889755668480.0000 - val_accuracy: 0.0000e+00 - val_loss: -48257655537401856.0000
Epoch 4/30
[1m5802/5802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 3ms/step - accuracy: 1.9419e-06 - loss: -28382808191795200.0000 - val_accuracy: 0.0000e+00 - val_loss: -125169563347189760.0000
Epoch 5/30
[1m5802/5802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 3ms/step - accuracy: 9.8171e-06 - loss: -68952006265405440.0000 - val_accuracy: 0.0000e+00 - val_