In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import *
from sklearn.feature_extraction.text import TfidfVectorizer

# Data Overview

In [2]:
df = pd.read_csv("Suicide_Detection.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232074 entries, 0 to 232073
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  232074 non-null  int64 
 1   text        232074 non-null  object
 2   class       232074 non-null  object
dtypes: int64(1), object(2)
memory usage: 5.3+ MB


In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [5]:
df = df.sample(n=9000, random_state=42) 

In [6]:
df.drop(columns = 'Unnamed: 0',inplace=True)

In [7]:
df.isnull().sum()

text     0
class    0
dtype: int64

In [8]:
df.duplicated().sum()

0

In [9]:
classCnt = df['class'].value_counts()
print(classCnt)

non-suicide    4548
suicide        4452
Name: class, dtype: int64


# Pre Processing


## From Upper Case To Lower Case

In [10]:
df['text']= df['text'].str.lower() # to lower case

## Remove punctuation

In [11]:
df['text'] = df['text'].str.replace(r'[^\w\s]+', '',regex = True) # remove any special characters or punctuation

The regular expression pattern [^\w\s]+ matches one or more occurrences of any character that is not a word character (i.e., a letter, digit, or underscore) or a whitespace character. The ^ character at the beginning of the pattern negates the character set, so that any character that does not match the set is replaced.

The replacement value is an empty string, which effectively removes any non-word and non-space characters from the text.

The regex=True parameter indicates that the regular expression pattern should be used for the replacement, rather than treating it as a plain string.

## Remove Stop Words

In [12]:
# A set gives constant-time membership tests; the list returned by NLTK would
# otherwise be scanned once for every word of every document.
stop_words = set(stopwords.words('english'))
# NOTE(review): punctuation has already been stripped at this point, so
# contractions (e.g. "dont") presumably no longer match apostrophe-bearing
# stop words -- confirm whether stop-word removal should run before
# punctuation removal.
df['text'] = df['text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)


## Tokenization

In [13]:
df['text'] = df['text'].apply(lambda x:nltk.word_tokenize(x)) # tokenization

## Stemming

In [14]:
ps = PorterStemmer() 

In [15]:
# Stem every token and re-join the tokens into one space-separated string,
# all in a single pass over the column.
df['text'] = df['text'].apply(lambda tokens: ' '.join(ps.stem(token) for token in tokens))

## Saving the cleaned dataset

In [16]:
# Persist the cleaned frame. With the default arguments the index is written
# too, which is why an "Unnamed: 0" column appears when the file is read back.
# NOTE(review): one `text` value comes back as null after the round trip --
# presumably a row whose text was empty after cleaning; confirm.
df.to_csv('file1.csv')

In [17]:
dfnew = pd.read_csv('file1.csv')
dfnew.head()
dfnew.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  9000 non-null   int64 
 1   text        8999 non-null   object
 2   class       9000 non-null   object
dtypes: int64(1), object(2)
memory usage: 211.1+ KB


In [18]:
dfnew.dropna(inplace=True)

In [19]:
x,y = dfnew['text'],dfnew['class']

## TF-IDF Vectorization

In [20]:
# Features are the cleaned text, labels are the class column.
x,y = dfnew['text'],dfnew['class']
# Keep at most 5000 terms, and only terms that occur in at least 50 documents.
vectorizer = TfidfVectorizer(min_df=50,max_features=5000)
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split performed later, so test documents influence the vocabulary and IDF
# weights -- consider fitting on the training split only.
x =  vectorizer.fit_transform(x).toarray()

## Saving the fitted TF-IDF vectorizer

In [21]:
import pickle
with open('tfidf.pkl', 'wb') as f:
    pickle.dump(vectorizer, f) 

# Machine Learning

## Splitting data

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=5)
X_train.shape,X_test.shape

((6299, 1244), (2700, 1244))

## Models

### 1

In [24]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

The VotingClassifier is a meta-estimator in scikit-learn that combines multiple machine learning models to improve the overall prediction accuracy. It works by aggregating the predictions of multiple models and choosing the class that has the highest probability based on the combined output of these models.

The VotingClassifier can be instantiated with several parameters, including:

- `estimators`: a list of tuples that define the individual models to include in the voting ensemble. Each tuple should contain a string identifier for the model and the instantiated model object itself.
- `voting`: the type of voting to use for combining the models' predictions. This can be one of two options: `'hard'` (majority vote over the predicted class labels) or `'soft'` (the class with the highest average, optionally weighted, predicted probability).
- `weights`: an optional list of weights, one per model. With `'hard'` voting they weight each model's vote; with `'soft'` voting they weight each model's predicted probabilities. If omitted, all models count equally.

In [25]:
nb = GaussianNB()
nb2 = BernoulliNB()
nb3 = MultinomialNB()

In [26]:
# Soft-voting ensemble over the three naive Bayes variants instantiated earlier.
nb_members = [
    ('GaussianNB', nb),
    ('BernoulliNB', nb2),
    ('MultinomialNB', nb3),
]
VotingClassifiers = VotingClassifier(estimators=nb_members, voting='soft')
VotingClassifiers.fit(X_train, y_train)

# Score on the data the ensemble was fitted on versus the held-out split.
train_acc = VotingClassifiers.score(X_train, y_train)
test_acc = VotingClassifiers.score(X_test, y_test)
print('Training score:',train_acc)
print('Testing score:',test_acc)

Training score: 0.9018891887601207
Testing score: 0.87


In [27]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

y_act=y_test
y_pred=VotingClassifiers.predict(X_test)
print(classification_report(y_act,y_pred))

              precision    recall  f1-score   support

 non-suicide       0.87      0.87      0.87      1387
     suicide       0.87      0.87      0.87      1313

    accuracy                           0.87      2700
   macro avg       0.87      0.87      0.87      2700
weighted avg       0.87      0.87      0.87      2700


### 2

In [28]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [29]:
svm = SVC()

In [30]:
svm.fit(X_train, y_train)

SVC()

In [31]:
y_train_pred = svm.predict(X_train)

In [32]:
train_report = classification_report(y_train, y_train_pred)
print("Training Set Report:")
print(train_report)

Training Set Report:
              precision    recall  f1-score   support

 non-suicide       0.98      0.98      0.98      3161
     suicide       0.98      0.98      0.98      3138

    accuracy                           0.98      6299
   macro avg       0.98      0.98      0.98      6299
weighted avg       0.98      0.98      0.98      6299


In [33]:
y_test_pred = svm.predict(X_test)

In [34]:
test_report = classification_report(y_test, y_test_pred)
print("Test Set Report:")
print(test_report)

Test Set Report:
              precision    recall  f1-score   support

 non-suicide       0.89      0.93      0.91      1387
     suicide       0.93      0.88      0.90      1313

    accuracy                           0.91      2700
   macro avg       0.91      0.91      0.91      2700
weighted avg       0.91      0.91      0.91      2700


In [35]:
print("SVC Model Parameters:")
print(svm.get_params())

SVC Model Parameters:
{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


- **Precision**: Precision is the ratio of true positive predictions to the total number of positive predictions. It measures the accuracy of positive predictions. Higher precision indicates fewer false positives. Precision is calculated for each class label.

- **Recall** (also known as sensitivity or true positive rate): Recall is the ratio of true positive predictions to the total number of actual positive instances. It measures the ability of the model to correctly identify positive instances. Higher recall indicates fewer false negatives. Recall is calculated for each class label.

- **F1-score**: The F1-score is the harmonic mean of precision and recall. It provides a balanced measure that takes both precision and recall into account. It is especially useful when you have imbalanced class distributions.

- **Support**: Support is the number of actual occurrences of each class in the test set. It represents the number of samples in the true response that have the corresponding class.

## Saving models

In [36]:
with open('VOTING_CLASSIFIER_model.pkl', 'wb') as f:
    pickle.dump(VotingClassifiers, f) 

In [37]:
with open('SVC_model.pkl', 'wb') as f:
    pickle.dump(svm, f) 

# Example

In [38]:
def preprocess(inp):
    """Clean one raw text string and turn it into a TF-IDF feature row.

    Steps: lower-case, strip punctuation, split on whitespace and drop stop
    words, Porter-stem each remaining word, then vectorise with the fitted
    ``vectorizer``.

    Parameters
    ----------
    inp : str
        Raw input text.

    Returns
    -------
    Dense array produced by ``vectorizer.transform([inp]).toarray()``
    (one row, since a single document is passed in).
    """
    import re  # local import: the notebook's import cell does not load `re`

    inp = inp.lower()  # convert to lower case
    # str.replace treats its first argument as a literal substring, so the
    # pattern must go through re.sub to be applied as a regular expression
    # (the training cell uses Series.str.replace with regex=True).
    inp = re.sub(r'[^\w\s]+', '', inp)  # remove punctuation
    words = [word for word in inp.split() if word not in stop_words]  # drop stop words
    inp = ' '.join(ps.stem(word) for word in words)  # stemming
    inputToModel = vectorizer.transform([inp]).toarray()  # transform to vector form
    return inputToModel

In [39]:
def app(input_text):
    """Echo a piece of text and print the SVM's predicted class for it.

    Parameters
    ----------
    input_text : str
        Raw text to classify.
    """
    print('Input : ',input_text)
    features = preprocess(input_text)  # cleaned and vectorised text
    label = svm.predict(features)[0]  # one document in, so take the first prediction
    print('Prediction : ', label)

In [40]:
app('i am tired of my life i want to end my life')

Input :  i am tired of my life i want to end my life
Prediction :  suicide


In [41]:
app('I am feeling soo well')

Input :  I am feeling soo well
Prediction :  non-suicide
