In [None]:
pip install nltk

In [None]:
# `pickle` is part of the Python standard library, so nothing needs to be installed;
# there is no installable `pickle` distribution and `pip install pickle` only errors out.

Importing dependencies

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
import pickle
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.calibration import calibration_curve

# Download every NLTK resource this notebook uses:
#   'wordnet', 'omw-1.4' -> data for WordNetLemmatizer
#   'stopwords'          -> the English stop-word list
#   'punkt', 'punkt_tab' -> tokenizer models needed by nltk.word_tokenize
#                           (which one is required depends on the NLTK version)
# nltk.download() reports a failed/unknown resource and returns False rather
# than raising, so requesting both tokenizer packages is safe.
for nltk_resource in ('wordnet', 'omw-1.4', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

stop_words = set(stopwords.words('english'))

Importing Main Dataset

In [None]:
# Load the raw comma-separated dataset and preview its first ten rows.
dataset = pd.read_csv(filepath_or_buffer='dataset.csv', sep=',')
dataset.head(n=10)

Preprocessing Data

1- Checking missing values

2- Checking distribution of classes

3- LowerCase

4- Punctuation

5- Stop Words

6- Tokenizing 

7- Lemmatizing

In [None]:
# Heatmap of the null mask; transposing puts one dataset column on each heatmap row.
null_mask = dataset.isnull().T
plt.figure(figsize=(15, 6))
sns.heatmap(null_mask, cmap="RdBu_r", cbar_kws={'label': 'Missing Data'})
# No missing values are visible in the plot.

In [None]:
# Number of samples per class.
classes_num = dataset['class'].value_counts()

# Bar colours for the count plot.
custom_palette = ['gold', 'lightcoral', 'lightskyblue', 'lightgreen']

# Draw the class distribution in the left slot of a 1x2 grid.
fig = plt.figure(figsize=(20, 5))
ax = fig.add_subplot(1, 2, 1)
sns.countplot(data=dataset, x='class', palette=custom_palette, ax=ax)

plt.show()


In [None]:
# Lower-case every document so that matching is case-insensitive.
lowercased_text = dataset['text'].str.lower()
dataset['text'] = lowercased_text
dataset['text']

In [None]:
# Delete every run of characters that is neither a word character nor whitespace.
punctuation_pattern = r'[^\w\s]+'
dataset['text'] = dataset['text'].str.replace(punctuation_pattern, '', regex=True)
dataset['text']

In [None]:
def remove_stop_words(text):
    """Return `text` with stop words removed.

    The text is split on whitespace; a word is dropped when its lower-cased
    form is in the module-level `stop_words` set. The surviving words are
    joined back together with single spaces.
    """
    kept_words = []
    for word in text.split():
        if word.lower() in stop_words:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

dataset['text'] = dataset['text'].apply(remove_stop_words)

In [None]:
dataset['text']

In [None]:
def tokenize_text(text):
    """Split `text` into a list of tokens using NLTK's word tokenizer."""
    tokens = nltk.word_tokenize(text)
    return tokens

In [None]:
dataset['text'] = dataset['text'].map(tokenize_text)

In [None]:
dataset['text']

In [None]:
# Single WordNet lemmatizer instance, used by lemmatize_sentence() and preprocess().
lemmatizer = WordNetLemmatizer()

def lemmatize_sentence(tokens):
    """Lemmatize each token and return the results joined by single spaces.

    `tokens` is an iterable of strings; each one is passed through the
    module-level `lemmatizer`.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

In [None]:
dataset['text'] = dataset['text'].map(lemmatize_sentence)

In [None]:
dataset.head() 

In [None]:
# index=False keeps the row index out of the file; otherwise it is written as an
# unnamed first column and comes back as an extra 'Unnamed: 0' column on reload.
dataset.to_csv('Clean Dataset.csv', index=False)

Importing the clean dataset

In [None]:
# Reload the cleaned corpus written by the previous section.
dataset = pd.read_csv(filepath_or_buffer='Clean Dataset.csv')
dataset

Removing missing values

In [None]:
print(dataset[dataset['text'].isnull()].index)

In [None]:
dataset.dropna(inplace=True)

In [None]:
print(dataset[dataset['text'].isnull()].index)

In [None]:
X = dataset['text']

# Encode the labels with an explicit mapping. Unlike replace(), map() yields a
# numeric Series directly (no reliance on implicit dtype downcasting) and turns
# any unexpected label into NaN, which is rejected below instead of silently
# ending up in y.
label_codes = {'suicide': 1, 'non-suicide': 0}
y = dataset['class'].map(label_codes)
if y.isnull().any():
    raise ValueError("Column 'class' contains values other than 'suicide' / 'non-suicide'")
y = y.astype(int)
X

Test Train Split

In [None]:
# Hold out 5% of the samples for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.05, random_state=4)
X_train, X_test, y_train, y_test = split_parts
len(X_train), len(X_test)

Term Frequency Inverse Document Frequency ( TF-IDF Vectorizer )

In [None]:
from sklearn.pipeline import Pipeline

max_features_values = [5000, 10000]

tfidf_vectorizer = TfidfVectorizer()

# A vectorizer on its own has no predict(), so 'accuracy' cannot be computed
# for it and a grid search over it cannot rank the candidates. Pairing it with
# a classifier lets every max_features value be scored by cross-validated
# accuracy.
search_pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('clf', MultinomialNB()),
])

param_grid = {
    'tfidf__max_features': max_features_values,
}

grid_search = GridSearchCV(search_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_max_features = grid_search.best_params_['tfidf__max_features']

print(f"Best max_features: {best_max_features}")

# The best pipeline is refit on all of X_train; keep only its fitted vectorizer.
best_tfidf_vectorizer = grid_search.best_estimator_.named_steps['tfidf']

# File name kept as-is because later cells load the vectorizer by this name.
with open('grid_search_15000_tfidf.pkl', 'wb') as vectorizer_file:
    pickle.dump(best_tfidf_vectorizer, vectorizer_file)

In [None]:
# Load the fitted vectorizer; the context manager closes the file handle.
# NOTE: only unpickle files produced by this notebook - pickle can execute arbitrary code.
with open('grid_search_15000_tfidf.pkl', 'rb') as vectorizer_file:
    best_tfidf_vectorizer = pickle.load(vectorizer_file)

In [None]:
# Project both splits onto the TF-IDF vocabulary of the loaded vectorizer.
X_train_tfidf, X_test_tfidf = (
    best_tfidf_vectorizer.transform(split) for split in (X_train, X_test)
)

# Row count of each matrix = number of documents in the split.
num_train_samples = X_train_tfidf.shape[0]
num_test_samples = X_test_tfidf.shape[0]

print(f"Number of training samples: {num_train_samples}")
print(f"Number of test samples: {num_test_samples}")

In [None]:
# Materialise the sparse TF-IDF matrices as dense numpy arrays.
X_train_dense, X_test_dense = X_train_tfidf.toarray(), X_test_tfidf.toarray()

Modeling Section

In [None]:
# Soft-voting ensemble of three Naive Bayes variants.
nb = GaussianNB()
nb2 = BernoulliNB()
nb3 = MultinomialNB()
VotingClassifiers = VotingClassifier(estimators=[('GaussianNB', nb),('BernoulliNB',nb2), ('MultinomialNB', nb3)], voting='soft')

VotingClassifiers.fit(X_train_dense, y_train)

# Persist the fitted ensemble; the context manager flushes and closes the file.
with open('Voting_classifier.pkl', 'wb') as model_file:
    pickle.dump(VotingClassifiers, model_file)

In [None]:
# Load the fitted ensemble; the context manager closes the file handle.
# NOTE: only unpickle files produced by this notebook - pickle can execute arbitrary code.
with open('Voting_classifier.pkl', 'rb') as model_file:
    VotingClassifiers = pickle.load(model_file)

In [None]:
# Mean accuracy of the ensemble on each split.
train_accuracy = VotingClassifiers.score(X_train_dense, y_train)
test_accuracy = VotingClassifiers.score(X_test_dense, y_test)
print('Training score:', train_accuracy)
print('Testing score:', test_accuracy)

Testing model on a New Dataset

In [None]:
new_dataset = pd.read_csv('test_suicide.csv')
new_dataset.head()

In [None]:
new_dataset['Tweet']

In [None]:
# Lower-case every tweet, as was done for the training text.
lowercased_tweets = new_dataset['Tweet'].str.lower()
new_dataset['Tweet'] = lowercased_tweets
new_dataset['Tweet']

In [None]:
# Use exactly the pattern applied to the training text: only punctuation is
# removed and digits are kept. The TF-IDF vocabulary was built from text that
# still contained digits, so stripping them here would normalise this set
# differently from the data the model was trained on.
new_dataset['Tweet'] = new_dataset['Tweet'].str.replace(r'[^\w\s]+', '', regex=True)
new_dataset['Tweet']

In [None]:
def remove__stop_words(text):
    """Return `text` with stop words removed; tolerant of missing values.

    Missing entries (None / NaN) become an empty string instead of being
    stringified into the literal tokens 'none' / 'nan'. Any other non-string
    value is converted with str() before splitting. A word is dropped when its
    lower-cased form is in the module-level `stop_words` set.
    """
    if pd.isna(text):
        return ''
    return ' '.join(word for word in str(text).split() if word.lower() not in stop_words)
 
new_dataset['Tweet'] = new_dataset['Tweet'].apply(remove__stop_words)

In [None]:
new_dataset['Tweet']

In [None]:
new_dataset['Tweet'] = new_dataset['Tweet'].map(tokenize_text)
new_dataset['Tweet']

In [None]:
new_dataset['Tweet'] = new_dataset['Tweet'].map(lemmatize_sentence)

In [None]:
new_dataset['Tweet']

In [None]:
# Index labels of rows whose tweet is missing (expected: none).
missing_tweet_mask = new_dataset['Tweet'].isnull()
print(new_dataset[missing_tweet_mask].index)

In [None]:
X_new = new_dataset['Tweet']
y_new = new_dataset['Suicide'].replace({'Potential Suicide post ':1,'Not Suicide post':0})
X_new

In [None]:
y_new.iloc[500:510]

In [None]:
# Reload both fitted artifacts; the context managers close the file handles.
# NOTE: only unpickle files produced by this notebook - pickle can execute arbitrary code.
with open('grid_search_15000_tfidf.pkl', 'rb') as vectorizer_file:
    best_tfidf_vectorizer = pickle.load(vectorizer_file)
with open('Voting_classifier.pkl', 'rb') as model_file:
    VotingClassifiers = pickle.load(model_file)

In [None]:
X_new_tfidf = best_tfidf_vectorizer.transform(X_new)
X_new_tfidf = X_new_tfidf.toarray()

In [None]:
print('Testing score:', VotingClassifiers.score(X_new_tfidf, y_new))

KNN Model 

In [None]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_dense = sc.fit_transform(X_train_dense)
X_test_dense = sc.transform(X_test_dense)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
classifier_KNN = KNeighborsClassifier(n_neighbors = 10, metric = 'minkowski', p = 2)
classifier_KNN.fit(X_train_dense, y_train)

In [None]:
print('Training score:', classifier_KNN.score(X_train_dense, y_train))
print('Testing score:', classifier_KNN.score(X_test_dense, y_test))

In [None]:
# We keep the Naive Bayes voting ensemble because it achieves the best accuracy.

Preprocessing Function for sentences

In [None]:
import re


def preprocess(element):
    """Turn one raw sentence into a dense TF-IDF row for the classifier.

    Steps: lower-case, remove punctuation, drop stop words, lemmatize, then
    vectorize with the module-level `best_tfidf_vectorizer`.

    Parameters
    ----------
    element : str
        The raw input sentence.

    Returns
    -------
    The dense array produced by `best_tfidf_vectorizer.transform([text]).toarray()`
    for the cleaned sentence.
    """
    element = element.lower()
    # str.replace() would look for the pattern as a literal substring and remove
    # nothing; re.sub() applies it as a regular expression.
    element = re.sub(r'[^\w\s]+', '', element)
    # Whitespace tokenisation followed by stop-word filtering.
    tokens = [word for word in element.split() if word not in stop_words]
    element = ' '.join(lemmatizer.lemmatize(token) for token in tokens)
    inputToModel = best_tfidf_vectorizer.transform([element]).toarray()
    return inputToModel

Function to predict preprocessed sentences

In [None]:
def predict_voting(input_text):
    """Print the sentence followed by the ensemble's verdict for it.

    The text is converted with `preprocess` and classified by the module-level
    `VotingClassifiers`; a predicted label of 1 is reported as a detected
    suicidal thought, anything else as none detected. Nothing is returned.
    """
    print('Input : ',input_text)
    features = preprocess(input_text)
    predicted_label = VotingClassifiers.predict(features)[0]
    if predicted_label == 1:
        verdict = 'Output : Suicidal thought detected'
    else:
        verdict = 'Output : No suicidal thoughts detected'
    print(verdict)

In [None]:
# Example call: a sentence expressing distress.
predict_voting("My life is so miserable and it's not getting better.")

In [None]:
# Example call: a neutral, everyday sentence.
predict_voting("Let's go for a hike !")

Confusion Matrix for Accuracy Visualization

In [None]:
# Predictions of the ensemble on the held-out split and the resulting confusion matrix.
y_pred = VotingClassifiers.predict(X_test_dense)
cm = confusion_matrix(y_test, y_pred)

class_names = ['No Suicide', 'Suicide']

plt.figure(figsize=(15, 6))
sns.set(font_scale=1.4)
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')

plt.show()

# Overall accuracy, shown as the cell's output.
accuracy_score(y_test, y_pred)

Calibration Curve

In [None]:
# A calibration curve compares a binary classifier's predicted probabilities
# with what actually happened: for each probability bin it plots the mean
# predicted probability against the observed fraction of positives. The closer
# the curve is to the diagonal, the better calibrated the model.

# Probability assigned to the positive class (second column of predict_proba).
y_prob = VotingClassifiers.predict_proba(X_test_dense)[:, 1]

prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10, strategy='uniform')

plt.figure(figsize=(15, 3))
ax = plt.gca()
ax.plot(prob_pred, prob_true, marker='o', linestyle='-', color='b', label='Calibration Curve')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfectly Calibrated')
ax.set_xlabel('Mean Predicted Probability')
ax.set_ylabel('Fraction of Positives')
ax.set_title('Calibration Curve')
ax.legend(loc='best')
ax.grid()
plt.show()


In [None]:
y_pred = VotingClassifiers.predict(X_test_dense)

In [None]:
print("Shape of y_test:", y_test.shape)
print("Shape of y_pred:", y_pred.shape)

In [None]:
y_test = pd.DataFrame(np.array(y_test).reshape(1,-1))
y_pred = pd.DataFrame(np.array(y_pred).reshape(1,-1))

In [None]:
print("Shape of y_test:", y_test.shape)
print("Shape of y_pred:", y_pred.shape)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
f1 = f1_score(y_test, y_pred, average='micro')

metrics_df = pd.DataFrame({
    'Metric': ['Precision', 'Recall', 'F1-Score'],
    'Score': [precision, recall, f1]
})

sns.set(style="whitegrid")

g = sns.catplot(x='Metric', y='Score', data=metrics_df, kind='bar', palette='Set2', height=5, aspect=2)

g.despine(left=True)
g.set_ylabels('Score')
g.set(ylim=(0, 1.0)) 
plt.title('Classification Metrics')

plt.show()
