## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

import joblib

from wordcloud import WordCloud
from collections import Counter

# Data Preprocessing

## Loading Data

Load the data, and have an overview of the data

In [None]:
# Load the raw dataset and take a first look at it
data = pd.read_csv('Suicide_Detection.csv')

print(data.head(),"\n")
# DataFrame.info() prints its report itself and returns None,
# so wrapping it in print() would emit a stray "None" line
data.info()
print("\nShape: ",data.shape)

# Encode the label as an integer: 1 for 'suicide', 0 for every other class value
data['is_suicide'] = (data['class'] == 'suicide').astype(int)

The 'Unnamed: 0' column carries no useful information here, so we can drop it.

In [None]:
# Rebind instead of dropping in place, and tolerate the column already being gone,
# so this cell can be re-run without raising KeyError
data = data.drop(columns='Unnamed: 0', errors='ignore')
data.head()

Check whether there are any null values

In [None]:
# Per-column count of missing values in the raw data
data.isna().sum()

### Features Engineering

In [None]:
# Number of whitespace-separated words in each post
data['total_words'] = [len(post.split()) for post in data['text']]
data.head()

## Data Visualization

Proportion of suicide and non-suicide thoughts

In [None]:
classCount = data["is_suicide"].value_counts()
print(classCount)

# value_counts() orders classes by frequency, not by label, so derive each wedge's
# label from the index instead of assuming that class 1 always comes first
class_names = {1: 'Suicide', 0: 'Not Suicide'}
pie_labels = [class_names[label] for label in classCount.index]

plt.figure(figsize=(15, 5))

# NOTE(review): only the second slot of the 1x2 grid is used - confirm the empty left panel is intended
plt.subplot(1, 2, 2)
plt.title('Mental Health Issues')
plt.pie(classCount, labels=pie_labels, autopct='%.0f%%')

# Add a legend outside the pie chart
plt.legend(title="Responses", loc="upper left", bbox_to_anchor=(1, 0.5))

plt.show()

## Text Processing

Lowercase, remove punctuation, remove stop words, and tokenize

In [None]:
# Convert texts to lower case
data['preprocessed_text'] = data['text'].str.lower()

# Remove punctuation (every run of characters that is neither a word character nor whitespace)
data['preprocessed_text'] = data['preprocessed_text'].str.replace(r'[^\w\s]+','',regex=True)

# Remove stop words
from nltk.corpus import stopwords

# A set gives constant-time membership tests; the filter below runs once per word of every row
# NOTE(review): punctuation is stripped before this step, so stop words that contain an
# apostrophe can no longer match their stripped form in the text - confirm this is acceptable
stop_words = set(stopwords.words('english'))
data['preprocessed_text'] = data['preprocessed_text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)

# Tokenize the words
data['preprocessed_text'] = data['preprocessed_text'].apply(word_tokenize)

### Lemmatization

In [None]:
# Lemmatize every token as a verb, then join the tokens back into one string per row
lemmatizer = WordNetLemmatizer()
data['preprocessed_text'] = data['preprocessed_text'].map(
    lambda tokens: ' '.join(lemmatizer.lemmatize(token, pos='v') for token in tokens)
)

data.head()

Check null values after preprocessing process

In [None]:
# Per-column count of missing values after preprocessing
data.isna().sum()

Save the preprocessed data to a new CSV file

In [None]:
data['preprocessed_text'] = data['preprocessed_text'].astype(str)
# index=False keeps the row index out of the file; otherwise it is read back
# as a spurious 'Unnamed: 0' column
data.to_csv('preprocessed.csv', index=False)

preprocessed_data = pd.read_csv('preprocessed.csv')
# Texts that preprocessing reduced to an empty string are read back as NaN;
# restore them so a later astype(str) does not turn them into the literal token "nan"
preprocessed_data['preprocessed_text'] = preprocessed_data['preprocessed_text'].fillna('')

preprocessed_data.head()

### Words Frequency

Show the words most commonly used when expressing suicidal thoughts

In [None]:
# Build one long string from every post labelled as suicidal and render it as a word cloud
preprocessed_data['preprocessed_text'] = preprocessed_data['preprocessed_text'].astype(str)

suicide_rows = preprocessed_data['is_suicide'] == 1
suicidal_thoughts = " ".join(preprocessed_data.loc[suicide_rows, 'preprocessed_text'])

wordcloud = WordCloud(
    max_words=300,
    width=1600,
    height=900,
    background_color='black',
    colormap='viridis',
).generate(suicidal_thoughts)

plt.figure(figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

Show the ranking of the most frequent words in suicidal thoughts

In [None]:
# Flatten every suicidal post into a single list of words
suicide_posts = preprocessed_data.loc[preprocessed_data['is_suicide'] == 1, 'preprocessed_text']
depression_words_ranking = [word for sentence in suicide_posts.to_list() for word in sentence.split()]

# Keep the 50 most frequent words together with their counts
top_words = Counter(depression_words_ranking).most_common(50)
df = pd.DataFrame(top_words, columns=['Word','Frequency'])

sns.set_context('notebook')
plt.figure(figsize=(18,8))
sns.barplot(x=df['Frequency'], y=df['Word'], palette='summer')
plt.title('Most commonly used words for suicidal thoughts')
plt.xlabel('Frequency')
plt.ylabel('Words')
plt.show()

Comparison of the original text and the preprocessed text

In [None]:
# Raw text next to its preprocessed form
preprocessed_data.loc[:, ['text', 'preprocessed_text']]

## Split Data

Split the dataset into training and test sets for fitting the models and evaluating their predictions

In [None]:
x,y = preprocessed_data['preprocessed_text'],preprocessed_data['is_suicide']

# Split the raw text first so the test rows never influence the vocabulary or IDF weights
text_train, text_test, y_train, y_test = train_test_split(x,y,test_size=0.20,random_state=3)

# Fit TF-IDF on the training text only, then apply the fitted transform to the test text
vectorizer = TfidfVectorizer(min_df=50,max_features=5000)
# Dense arrays are kept as before (GaussianNB does not accept sparse input)
X_train = vectorizer.fit_transform(text_train).toarray()
X_test = vectorizer.transform(text_test).toarray()

X_train.shape, X_test.shape

# Models

## Naive Bayes

Naive bayes with voting classifier

In [None]:
# Three Naive Bayes variants combined in one soft-voting ensemble
naive_bayes_gaussian = GaussianNB()
naive_bayes_bernoulli = BernoulliNB()
naive_bayes_multinomial = MultinomialNB()

nb_members = [
    ('GaussianNB', naive_bayes_gaussian),
    ('Bernoulli', naive_bayes_bernoulli),
    ('Multinomial', naive_bayes_multinomial),
]
VotingClassifiers = VotingClassifier(estimators=nb_members, voting='soft')
VotingClassifiers.fit(X_train, y_train)

# Evaluate on the held-out split
nb_prediction = VotingClassifiers.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_prediction)
nb_confusion_matrix = confusion_matrix(y_test, nb_prediction)

# Accuracy on the training data and on the test data
print("Training Score = ", VotingClassifiers.score(X_train, y_train))
print("Testing Score = ", VotingClassifiers.score(X_test, y_test))

# Confusion matrix followed by the per-class report
print("\n\nConfusion Matrix : ", nb_confusion_matrix)
print(classification_report(y_test, nb_prediction))

## Logistic Regression

In [None]:
# Logistic regression on the TF-IDF features; fit() returns the estimator itself
lr_classifier = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Evaluate on the held-out split
lr_prediction = lr_classifier.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_prediction)
lr_confusion_matrix = confusion_matrix(y_test, lr_prediction)

# Accuracy on the training data and on the test data
print("Training Score = ", lr_classifier.score(X_train, y_train))
print("Testing Score = ", lr_classifier.score(X_test, y_test))

# Confusion matrix followed by the per-class report
print("\nConfusion Matrix : ", lr_confusion_matrix)
print(classification_report(y_test, lr_prediction))

## Random Forest

In [None]:
%time

# random_forest_classifier = RandomizedSearchCV(
#     RandomForestClassifier(),
#     {
#         'n_estimators':[4,5],
#         'criterion':['entropy'],
#         'max_depth':range(1,4),'min_samples_split':range(2,5)
#     }, random_state=10
# )

random_forest_classifier = RandomForestClassifier(n_estimators=100,random_state=30)

random_forest_classifier.fit(X_train,y_train)

rf_prediction = random_forest_classifier.predict(X_test)
rf_confusion_matrix = confusion_matrix(y_test,rf_prediction)
rf_accuracy = accuracy_score(y_test,rf_prediction)

# Score for Training
print("Training Score = ",random_forest_classifier.score(X_train,y_train))

# Score for Testing
print("Testing Score = ",random_forest_classifier.score(X_test,y_test))

# Confusion Matrix
print("\nConfusion Matrix : ", rf_confusion_matrix)

print(classification_report(y_test,rf_prediction))

## Support Vector Machines (SVM)

In [None]:
# Linear-kernel support vector classifier; fit() returns the estimator itself
svm_classifier = SVC(kernel='linear').fit(X_train, y_train)

# Evaluate on the held-out split
svm_prediction = svm_classifier.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_prediction)
svm_confusion_matrix = confusion_matrix(y_test, svm_prediction)

# Accuracy on the training data and on the test data
print("Training Score = ", svm_classifier.score(X_train, y_train))
print("Testing Score = ", svm_classifier.score(X_test, y_test))

# Confusion matrix followed by the per-class report
print("\nConfusion Matrix : ", svm_confusion_matrix)
print(classification_report(y_test, svm_prediction))

## Decision Tree

In [None]:
%time

decision_tree_classifier = DecisionTreeClassifier(criterion='gini',splitter='random',min_samples_leaf=100,max_depth=6,random_state=0)
decision_tree_classifier.fit(X_train,y_train)

dt_prediction = decision_tree_classifier.predict(X_test)
dt_confusion_matrix = confusion_matrix(y_test,dt_prediction)
dt_accuracy = accuracy_score(y_test,dt_prediction)
                             
# Score for Training
print("Training Score = ",decision_tree_classifier.score(X_train,y_train))
                             
# Score for Testing
print("Testing Score = ",decision_tree_classifier.score(X_test,y_test))

# Confusion Matrix
print("\nConfusion Matrix : ", rf_confusion_matrix)

print(classification_report(y_test,rf_prediction))

# Conclusion

In [None]:
# One row per trained model, each paired with its own test-set accuracy.
# Both lists must have the same length, otherwise the DataFrame constructor raises ValueError.
model_ev = pd.DataFrame({
    'Model':['Naive Bayes','Logistic Regression','Random Forest','Support Vector Machine','Decision Tree'],
    'Accuracy':[nb_accuracy,lr_accuracy,rf_accuracy,svm_accuracy,dt_accuracy]
})

model_ev

Naive Bayes is the best-fit model for the given dataset, with the following scores:
- Training Score: 0.9024567408018418
- Testing Score: 0.8993723338551916

### Save Model

Save the model with the highest accuracy (Naive Bayes) to a joblib file

In [None]:
# Persist the fitted TF-IDF vectorizer next to the classifier: the model only accepts
# features produced by this exact vectorizer, so it cannot be reused without it
joblib.dump(vectorizer,'tfidf_vectorizer.joblib')
joblib.dump(VotingClassifiers,'naive_bayes_model.joblib')

In [None]:
def preprocess(word):
    """Turn one raw text into the TF-IDF feature row the trained classifiers expect.

    Applies the same steps as the training pipeline: lowercase, strip punctuation,
    drop stop words, tokenize, lemmatize each token as a verb, then vectorize with
    the already fitted ``vectorizer``.

    Parameters
    ----------
    word : str
        Raw input text (a single word, a sentence or a whole post).

    Returns
    -------
    numpy.ndarray
        Dense array with a single row, as produced by ``vectorizer.transform``.
    """
    import re  # local import: only this function needs it

    text = word.lower()
    # str.replace() treats its first argument literally, so a real regex substitution
    # is needed to strip punctuation the way the training pipeline does
    text = re.sub(r'[^\w\s]+', '', text)
    text = ' '.join(token for token in text.split() if token not in stop_words)
    # Training lemmatized tokens as verbs; stemming here would produce features
    # that do not match the fitted vocabulary
    tokens = [lemmatizer.lemmatize(token, pos='v') for token in word_tokenize(text)]
    return vectorizer.transform([' '.join(tokens)]).toarray()

def mental_health_issue_detector(text):
    """Print the given text and the class the voting classifier predicts for it.

    The text is converted to features with ``preprocess`` and passed to
    ``VotingClassifiers``; the first (and only) predicted label is printed.
    Labels follow the ``is_suicide`` encoding: 1 for suicide, 0 otherwise.

    Parameters
    ----------
    text : str
        Raw text to classify.
    """
    print("Input = ",text)
    features = preprocess(text)
    predicted_label = VotingClassifiers.predict(features)[0]
    print(predicted_label)

## User Input

In [None]:
# Try the detector on a single sample input
sample_text = "love"
mental_health_issue_detector(sample_text)