In [1]:
import pandas as pd 
import numpy as np
import re
import string

In [2]:
# Load the raw dataset from the project's data folder and preview it.
DATA_PATH = '../data/Suicide_Detection.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [3]:
# Flag every row that exactly repeats an earlier row, then count the flags.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

np.int64(0)

In [4]:
# Per-column count of missing values (isna is the canonical name for isnull).
df.isna().sum()

Unnamed: 0    0
text          0
class         0
dtype: int64

In [5]:
# Discard the leftover index column that came in with the CSV.
df = df.drop('Unnamed: 0', axis='columns')

In [6]:
# Preview the first rows to confirm only `text` and `class` remain.
df.head()

Unnamed: 0,text,class
0,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,Am I weird I don't get affected by compliments...,non-suicide
2,Finally 2020 is almost over... So I can never ...,non-suicide
3,i need helpjust help me im crying so hard,suicide
4,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


# Data Cleaning

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Matches every character that is not an ASCII letter or a period; compiled
# once instead of being re-parsed for each document.
_NON_LETTER_RE = re.compile('[^a-zA-Z.]')

# Lazily filled cache for the NLTK resources used by `clean`. Filling it on
# first use (not at definition time) keeps the cell runnable even if the NLTK
# data is downloaded after this cell has been executed.
_CLEAN_RESOURCES = {}


def _get_clean_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _CLEAN_RESOURCES:
        # Build both objects before storing either one, so a failure (e.g. a
        # missing NLTK corpus) leaves the cache empty and the next call retries.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _CLEAN_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _CLEAN_RESOURCES['stop_words'], _CLEAN_RESOURCES['lemmatizer']


def clean(doc):
    """Normalize one raw text document for modelling.

    Steps, in order:
      1. replace every character other than ASCII letters and '.' with a space,
      2. lowercase,
      3. tokenize with ``nltk.word_tokenize``,
      4. drop English stop words,
      5. lemmatize each remaining token with ``WordNetLemmatizer``.

    Periods are deliberately left in by step 1, so tokens such as '.' and
    '...' can appear in the result.

    Parameters
    ----------
    doc : str
        Raw text of a single document.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.

    Notes
    -----
    The stop-word set and the lemmatizer are created once and reused across
    calls; rebuilding them per document is pure overhead when this function
    is applied to every row of a DataFrame.
    """
    stop_words, lemmatizer = _get_clean_resources()
    # Strip digits / special characters, then normalize case.
    doc = _NON_LETTER_RE.sub(' ', doc).lower()
    tokens = nltk.word_tokenize(doc)
    # Filter stop words first, then lemmatize what is left.
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

In [8]:
# Run the cleaning pipeline over every post and keep the result in a new column.
df['clean_text'] = df['text'].apply(clean)
df.head()

Unnamed: 0,text,class,clean_text
0,Ex Wife Threatening SuicideRecently I left my ...,suicide,ex wife threatening suiciderecently left wife ...
1,Am I weird I don't get affected by compliments...,non-suicide,weird get affected compliment coming someone k...
2,Finally 2020 is almost over... So I can never ...,non-suicide,finally almost ... never hear bad year ever . ...
3,i need helpjust help me im crying so hard,suicide,need helpjust help im cry hard
4,"I’m so lostHello, my name is Adam (16) and I’v...",suicide,losthello name adam struggling year afraid . p...


In [9]:
# The raw text is no longer needed once `clean_text` exists. Reassign instead
# of mutating in place, matching how the index column was dropped earlier.
df = df.drop(columns=['text'])

In [10]:
# Preview the first rows of the frame after the raw `text` column was dropped.
df.head()

Unnamed: 0,class,clean_text
0,suicide,ex wife threatening suiciderecently left wife ...
1,non-suicide,weird get affected compliment coming someone k...
2,non-suicide,finally almost ... never hear bad year ever . ...
3,suicide,need helpjust help im cry hard
4,suicide,losthello name adam struggling year afraid . p...


# Splitting the Data

In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Model inputs are the cleaned posts.
x = df["clean_text"].values

# Encode the string labels as integers; `le` is kept so predictions can be
# mapped back to class names later.
le = LabelEncoder()
y = le.fit_transform(df["class"].values)

# Hold out 20% for testing, preserving the class ratio in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Tokenizing and Padding

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary cap and fixed sequence length fed to the network.
MAX_WORDS = 30000
MAX_LEN = 100

# Fit the vocabulary on the training split only; words outside it map to "<OOV>".
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(x_train)

# Turn both splits into integer id sequences with the fitted tokenizer.
x_train_seq, x_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (x_train, x_test)
)

# Bring every sequence to exactly MAX_LEN ids, padding at the end.
x_train_pad, x_test_pad = (
    pad_sequences(seq, maxlen=MAX_LEN, padding="post")
    for seq in (x_train_seq, x_test_seq)
)

# Embedding input size: the full vocabulary (+1 for the padding id 0), capped at MAX_WORDS.
vocab_size = min(len(tokenizer.word_index) + 1, MAX_WORDS)


# Building an LSTM Model

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Input

# Binary text classifier: embedding -> bidirectional LSTM -> dense head.
# The input shape is declared with an explicit Input instead of the Embedding
# `input_length` argument, which newer Keras releases deprecate and ignore;
# this way the model is still built as soon as it is constructed.
model = Sequential([
    Input(shape=(MAX_LEN,)),
    Embedding(vocab_size, 128),
    Bidirectional(LSTM(128)),
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dropout(0.3),
    # Single sigmoid unit: output is the probability of class index 1.
    Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])



In [15]:
from tensorflow.keras.callbacks import EarlyStopping
import pickle

from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Stop when validation loss has not improved for 2 consecutive epochs and
# roll the model back to the best weights seen.
es = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)

# "balanced" weights are inversely proportional to class frequency in y_train.
train_classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=train_classes,
    y=y_train
)

# Key each weight by its actual label value rather than by position, so the
# mapping stays correct even if the labels are not exactly 0..n-1.
class_weights = dict(zip(train_classes.tolist(), class_weights.tolist()))

history = model.fit(
    x_train_pad, y_train,
    validation_split=0.2,
    epochs=5,
    batch_size=128,
    class_weight=class_weights,
    # Previously `es` was created but never handed to fit(), so early
    # stopping and best-weight restoration silently never ran.
    callbacks=[es],
    verbose=1
)

Epoch 1/5
[1m1020/1161[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m32s[0m 233ms/step - accuracy: 0.8946 - loss: 0.2563

KeyboardInterrupt: 

In [None]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

# Evaluation

In [None]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

# Predicted probability of class index 1 for every test sample.
y_pred_prob = model.predict(x_test_pad)
# Only scores strictly above 0.65 are labelled 1; everything else is 0.
y_pred = (y_pred_prob > 0.65).astype(int)

# Compute the three evaluation artefacts first, then print them in order.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred, target_names=le.classes_)
test_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(test_report)
print(test_confusion)


[1m1451/1451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 28ms/step
Accuracy: 0.9430141118173004
              precision    recall  f1-score   support

 non-suicide       0.93      0.96      0.94     23208
     suicide       0.96      0.93      0.94     23207

    accuracy                           0.94     46415
   macro avg       0.94      0.94      0.94     46415
weighted avg       0.94      0.94      0.94     46415

[[22243   965]
 [ 1680 21527]]


# Model Saving

In [None]:
# Persist the trained network in HDF5 format.
model.save("../models/lstm_suicide_model.h5")

# Save the fitted preprocessing objects needed at inference time. The `with`
# blocks guarantee each file is flushed and closed, even if pickling fails;
# the bare open(...) used before left the handles for the garbage collector.
with open("../models/tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
with open("../models/label_encoder.pkl", "wb") as encoder_file:
    pickle.dump(le, encoder_file)

