# YZV 311E DATA MINING PROJECT
# <strong>Detection Of Sucidal Texts</strong>
## Zehra Demir
## Nurbanu Gök

### Importing the necessary libraries

In [16]:
import numpy as np
import pandas as pd

### Reading the data file

In [17]:
df = pd.read_csv("data/Suicide_Detection_50k.csv")

In [18]:
df.head()


Unnamed: 0,text,class
0,How do you shower? May you tell me how you sho...,non-suicide
1,How do I prevent suicide before it even starts...,suicide
2,Suicidal ThoughtsI haven't gone 1 day without ...,suicide
3,"Ignore, just checkin somethin' Just checking i...",non-suicide
4,i’m a busy man 😂😂😂😂😂😂😂😂 jk all i do is go on r...,non-suicide


In [4]:
df.shape

(50000, 2)

## Data Preprocessing

In [5]:
df.isna().sum()

text     0
class    0
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
df.dtypes

text     object
class    object
dtype: object

### Some work has been done when reducing the data set to be able to fit it in GitHub repository. This is why we do not have any null or duplicate values now. You can find the details in data_reducing.ipynb file.

### We can now proceed with the text preprocessing part.

## Text Preprocessing

### Remove URLs etc.

In [19]:
import re

In [20]:
def clean(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # Remove mentions/handles
    #text = re.sub(r'[^A-Za-z0-9]+', ' ', text) 
    return text
df['text'] = df['text'].apply(clean)

### Lowercasing the text data

In [21]:
df['text'] = df['text'].str.lower()
df.head()

Unnamed: 0,text,class
0,how do you shower? may you tell me how you sho...,non-suicide
1,how do i prevent suicide before it even starts...,suicide
2,suicidal thoughtsi haven't gone 1 day without ...,suicide
3,"ignore, just checkin somethin' just checking i...",non-suicide
4,i’m a busy man 😂😂😂😂😂😂😂😂 jk all i do is go on r...,non-suicide


#

### Emoji and Emoticon Handling

In [11]:
pip install emoji

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import emoji


#### Converting the emojis

In [13]:


def convert_emojis_to_text(text):
    return emoji.demojize(text)

#df['text'] = df['text'].apply(convert_emojis_to_text)


In [14]:
df.head(5)

Unnamed: 0,text,class
0,how do you shower? may you tell me how you sho...,non-suicide
1,how do i prevent suicide before it even starts...,suicide
2,suicidal thoughtsi haven't gone 1 day without ...,suicide
3,"ignore, just checkin somethin' just checking i...",non-suicide
4,i’m a busy man 😂😂😂😂😂😂😂😂 jk all i do is go on r...,non-suicide


#### ... or removing the emojis

In [12]:
pip install unidecode

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
pip install clean-text

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
from cleantext import clean

In [23]:
def clean_emojis(text):

    return clean(text, no_emoji=True)

df['text'] = df['text'].apply(clean_emojis)

In [24]:
df.head()

Unnamed: 0,text,class
0,how do you shower? may you tell me how you sho...,non-suicide
1,how do i prevent suicide before it even starts...,suicide
2,suicidal thoughtsi haven't gone 1 day without ...,suicide
3,"ignore, just checkin somethin' just checking i...",non-suicide
4,i'm a busy man jk all i do is go on reddit,non-suicide


### Remove Punctuations

In [25]:
import string

In [26]:
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

df["text"] = df["text"].apply(remove_punctuation)

### Remove Stop Words

In [27]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kullanıcı\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Example usage:
df['text'] = df['text'].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kullanıcı\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kullanıcı\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
df.head()

Unnamed: 0,text,class
0,shower may tell shower step step please im wei...,non-suicide
1,prevent suicide even startsi headed towards co...,suicide
2,suicidal thoughtsi havent gone 1 day without t...,suicide
3,ignore checkin somethin checking alt enough ka...,non-suicide
4,im busy man jk go reddit,non-suicide


### Stemming

In [30]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

stemmer1=PorterStemmer()
stemmer2=SnowballStemmer("english")
stemmer3=LancasterStemmer()

In [26]:


def stem_words(text):
    words = word_tokenize(text)
    stemmed_words = [stemmer1.stem(word) for word in words]
    return ' '.join(stemmed_words)

# Example usage:
stem_words("I love to eating cookie")

'i love to eat cooki'

In [27]:
df.head()

Unnamed: 0,text,class
0,shower may tell shower step step please im wei...,non-suicide
1,prevent suicide even startsi headed towards co...,suicide
2,suicidal thoughtsi havent gone 1 day without t...,suicide
3,ignore checkin somethin checking alt enough ka...,non-suicide
4,im busy man jk go reddit,non-suicide


### Lemmatization

In [31]:
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download("omw-1.4")
#init the wordnet lemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kullanıcı\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kullanıcı\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [53]:
wnl = WordNetLemmatizer()

def lemmatize_words(text):
    words = word_tokenize(text)
    lemmatized_words = [wnl.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

# Example usage:
lemmatize_words("programmed programmers programming")

df["text"] = df["text"].apply(lemmatize_words)

## Splitting Data

In [None]:
from sklearn.model_selection import train_test_split

In [37]:

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['class'], test_size=0.2, random_state=42)

## Saving the Dataframe

In [38]:
train_df = pd.DataFrame({'text': X_train, 'class': y_train})
test_df = pd.DataFrame({'text': X_test, 'class': y_test})

# Save train and test DataFrames to CSV files
train_df.to_csv('train_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

## word2vec

In [40]:
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [42]:
# Tokenize the text data
tokenized_text = X_train.apply(lambda x: x.split())

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4 )


In [47]:
def document_vectorizer(doc, model):
    doc_vector = [model.wv[word] for word in doc if word in model.wv.index_to_key]
    return np.mean(doc_vector, axis=0) if doc_vector else np.zeros(model.vector_size)

# Apply the function to train and test data
X_train_w2v = tokenized_text.apply(lambda x: document_vectorizer(x, word2vec_model))
X_test_w2v = X_test.apply(lambda x: document_vectorizer(x.split(), word2vec_model))

# Train RandomForest Classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(list(X_train_w2v), y_train)

# Make Predictions and Evaluate
y_pred_w2v = rf_classifier.predict(list(X_test_w2v))
accuracy = accuracy_score(y_test, y_pred_w2v)

print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.8989


### Randomized Search for the model with word2vec

In [50]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [61]:

# Load and split the dataset


# Taking a small subset for faster experimentation
df_subset = df.sample(frac=0.2, random_state=42)

X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(df_subset['text'], df_subset['class'], test_size=0.2, random_state=42)

# Tokenize the text data
tokenized_text = X_train_small.apply(lambda x: x.split())

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

# Create a function to get the vector representation for each document
def document_vectorizer(doc, model):
    doc_vector = [model.wv[word] for word in doc if word in model.wv.index_to_key]
    return np.mean(doc_vector, axis=0) if doc_vector else np.zeros(model.vector_size)

# Apply the function to train and test data
X_train_w2v_small = tokenized_text.apply(lambda x: document_vectorizer(x, word2vec_model))
X_test_w2v_small = X_test_small.apply(lambda x: document_vectorizer(x.split(), word2vec_model))

# Hyperparameter tuning for RandomForest using RandomizedSearchCV
param_dist_rf = {
    'n_estimators':[100, 150, 200, 300]       ,#randint(100, 500),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 8],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
    'min_impurity_decrease': [0.0, 0.1, 0.2],
    'max_samples': [None, 0.5, 0.75, 1.0],
    'bootstrap': [True, False],
    
    
    
    
}

rf_classifier = RandomForestClassifier(random_state=42)

randomized_search_rf = RandomizedSearchCV(rf_classifier, param_distributions=param_dist_rf, n_iter=10, cv=3, scoring='accuracy', n_jobs=-1, random_state=42, verbose=2)
randomized_search_rf.fit(list(X_train_w2v_small), y_train_small)

# Get best parameters for RandomForest
best_params_rf = randomized_search_rf.best_params_
best_model_rf = randomized_search_rf.best_estimator_

# Evaluate on the test set
y_test_pred_rf = best_model_rf.predict(list(X_test_w2v_small))
accuracy_test_rf = accuracy_score(y_test_small, y_test_pred_rf)

print(f'Best Parameters for RandomForest: {best_params_rf}')
print(f'Test Accuracy for RandomForest: {accuracy_test_rf}')


Fitting 3 folds for each of 10 candidates, totalling 30 fits


15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\kullanıcı\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\kullanıcı\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\ensemble\_forest.py", line 383, in fit
    raise ValueError(
ValueError: `max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `max_sample=None`.

        nan 0.50724998 0.87050057        nan]


Best Parameters for RandomForest: {'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 2, 'min_impurity_decrease': 0.0, 'max_samples': None, 'max_features': 'log2', 'max_depth': None, 'criterion': 'entropy', 'bootstrap': False}
Test Accuracy for RandomForest: 0.8815


Applied randomized search a small part of the data because the dataset was too large. It was taking hours to for whole dataset. After that, I try the parameters for the whole train data.

In [None]:
rf_classifier_tuned = RandomForestClassifier(random_state=42, n_estimators=250, min_samples_split= 5,  min_impurity_decrease=0, max_features= "log2", min_samples_leaf=1, max_depth=None, n_jobs=-1, criterion = "entropy",  verbose=2)
rf_classifier_tuned.fit(list(X_train_w2v), y_train)

# Make Predictions and Evaluate
y_pred_w2v_tuned = rf_classifier_tuned.predict(list(X_test_w2v))
accuracy = accuracy_score(y_test, y_pred_w2v_tuned)

print(f'Test Accuracy: {accuracy}')

After randomizez search and trying different parameter combinations, the highest accuracy value was 0.9031 for:

**rf_classifier_tuned = RandomForestClassifier(random_state=42, n_estimators=250, min_samples_split= 5,  min_impurity_decrease=0, max_features= "log2", min_samples_leaf=1, max_depth=None, n_jobs=-1, criterion = "entropy",  verbose=2)**

In [73]:


# Display confusion matrix and classification report
conf_matrix_tfidf = confusion_matrix(y_test, y_pred_w2v_tuned)
class_report_tfidf = classification_report(y_test, y_pred_w2v_tuned)
print('Confusion Matrix (word2vec):')
#print(conf_matrix_tfidf)
print('\nClassification Report (word2vec):')
print(class_report_tfidf)

Confusion Matrix (word2vec):

Classification Report (word2vec):
              precision    recall  f1-score   support

 non-suicide       0.90      0.91      0.90      5044
     suicide       0.90      0.90      0.90      4956

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



The highest accuracy for now =  0.9013, 0.9031, 0.9016




## TF - IDF

In [None]:


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [39]:
# Convert text data to numerical format using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(X_train_tfidf.shape)

(40000, 67068)


### Saving TF-IDF dataframe

In [None]:
train_df_tfidf = pd.DataFrame({'text': X_train_tfidf, 'class': y_train})
test_df_tfidf = pd.DataFrame({'text': X_test_tfidf, 'class': y_test})

# Save train and test DataFrames to CSV files
train_df_tfidf.to_csv('train_data.csv', index=False)
test_df_tfidf.to_csv('test_data.csv', index=False)

### Preprocessor function for inputs

The main steps for cleaning and preprocess. For now, the vectorizer is tf-idf but it will be changed after investigatin the the other methods such as word2vec.

In [34]:
def preprocess(text):
    text = clean(text)
    text = text.lower()
    text = clean_emojis(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = lemmatize_words(text)
    tfidf_vectorizer = TfidfVectorizer()
    text = tfidf_vectorizer.fit_transform(text)
    
    return text

## Random Forest

In [7]:
import joblib

# Alttaki random forest tuning sonucu en yüksek şıkanla değiştirilecek

In [28]:
rf = RandomForestClassifier()
rf.fit(X_train_tfidf, y_train)

In [31]:
joblib.dump(rf, 'trained__rf_model.joblib')


['trained__rf_model.joblib']

In [8]:
# Load the trained model from the file
loaded_model = joblib.load('trained_model.joblib')

# Now you can use the loaded_model to make predictions


In [48]:
#y_pred_tfidf = rf.predict(X_test_tfidf)
y_pred_tfidf = loaded_model.predict(X_test_tfidf)

### Hyperparameter tuning

### Randomized Search

### Gridsearch

## Gaussian Naïve Bayes 

In [None]:
#X_train_dense = X_train_tfidf.toarray()
#X_test_dense = X_test_tfidf.toarray()

MemoryError: Unable to allocate 20.0 GiB for an array with shape (40000, 67068) and data type float64

In [40]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)

In [41]:
y_pred_tfidf = nb.predict(X_test_tfidf)

In [42]:
# Check accuracy
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print(f'Accuracy (TF-IDF): {accuracy_tfidf}')

# Display confusion matrix and classification report
conf_matrix_tfidf = confusion_matrix(y_test, y_pred_tfidf)
class_report_tfidf = classification_report(y_test, y_pred_tfidf)

print('Confusion Matrix (TF-IDF):')
print(conf_matrix_tfidf)
print('\nClassification Report (TF-IDF):')
print(class_report_tfidf)

Accuracy (TF-IDF): 0.8588
Confusion Matrix (TF-IDF):
[[3723 1321]
 [  91 4865]]

Classification Report (TF-IDF):
              precision    recall  f1-score   support

 non-suicide       0.98      0.74      0.84      5044
     suicide       0.79      0.98      0.87      4956

    accuracy                           0.86     10000
   macro avg       0.88      0.86      0.86     10000
weighted avg       0.88      0.86      0.86     10000

