# YZV 311E DATA MINING PROJECT
# <strong>Detection of Suicidal Texts</strong>
## Zehra Demir
## Nurbanu Gök

### Importing the necessary libraries

In [1]:
import numpy as np
import pandas as pd

### Reading the data file

In [2]:
df = pd.read_csv("data/Suicide_Detection_50k.csv")

In [3]:
df.head()


Unnamed: 0,text,class
0,How do you shower? May you tell me how you sho...,non-suicide
1,How do I prevent suicide before it even starts...,suicide
2,Suicidal ThoughtsI haven't gone 1 day without ...,suicide
3,"Ignore, just checkin somethin' Just checking i...",non-suicide
4,i’m a busy man 😂😂😂😂😂😂😂😂 jk all i do is go on r...,non-suicide


In [4]:
df.shape

(50000, 2)

## Data Preprocessing

In [5]:
df.isna().sum()

text     0
class    0
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
df.dtypes

text     object
class    object
dtype: object

### The data set was cleaned while it was being reduced to fit in the GitHub repository, which is why it contains no null or duplicate values now. You can find the details in the data_reducing.ipynb file.

### We can now proceed with the text preprocessing part.

## Text Preprocessing

### Remove URLs etc.

In [8]:
import re

In [9]:
def clean(text):
    """Strip URLs and @-mentions from a single text string.

    Parameters
    ----------
    text : str
        Raw post text.

    Returns
    -------
    str
        ``text`` with every token starting with ``http`` and every
        ``@handle`` removed. Whitespace around the removed tokens is left
        untouched, so consecutive spaces may remain.
    """
    # Drop everything from "http" up to the next whitespace character.
    text = re.sub(r'http\S+', '', text)
    # Handles may contain underscores (e.g. "@john_doe"); without "_" in the
    # character class the match stops early and leaves "_doe" in the text.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    return text
df['text'] = df['text'].apply(clean)

### Lowercasing the text data

In [11]:
df['text'] = df['text'].str.lower()
df.head()

Unnamed: 0,text,class
0,how do you shower may you tell me how you show...,non-suicide
1,how do i prevent suicide before it even starts...,suicide
2,suicidal thoughtsi haven t gone 1 day without ...,suicide
3,ignore just checkin somethin just checking if ...,non-suicide
4,i m a busy man jk all i do is go on reddit amp...,non-suicide


#

### Emoji and Emoticon Handling

In [None]:
pip install emoji

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
import emoji


#### Converting the emojis

In [None]:


def convert_emojis_to_text(text):
    """Return ``text`` after passing it through ``emoji.demojize``."""
    demojized = emoji.demojize(text)
    return demojized

#df['text'] = df['text'].apply(convert_emojis_to_text)


In [None]:
df.head(5)

Unnamed: 0,text,class
0,how do you shower may you tell me how you show...,non-suicide
1,how do i prevent suicide before it even starts...,suicide
2,suicidal thoughtsi haven t gone 1 day without ...,suicide
3,ignore just checkin somethin just checking if ...,non-suicide
4,i m a busy man jk all i do is go on reddit amp...,non-suicide


#### ... or removing the emojis

In [13]:
pip install unidecode

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
pip install clean-text

In [14]:
from cleantext import clean

In [15]:
def clean_emojis(text):
    """Remove emojis from ``text`` using the clean-text package.

    Parameters
    ----------
    text : str
        Text that may contain emoji characters.

    Returns
    -------
    str
        The result of ``cleantext.clean(text, no_emoji=True)``. All other
        options of that function are left at the package defaults.
    """
    # Import under a private alias inside the function: this notebook also
    # defines its own `clean(text)` helper, and whichever cell ran last owns
    # the global name `clean`. Resolving cleantext's function here keeps this
    # helper working regardless of cell execution order.
    from cleantext import clean as _cleantext_clean

    return _cleantext_clean(text, no_emoji=True)

df['text'] = df['text'].apply(clean_emojis)

In [None]:
df.head()

Unnamed: 0,text,class
0,how do you shower may you tell me how you show...,non-suicide
1,how do i prevent suicide before it even starts...,suicide
2,suicidal thoughtsi haven t gone 1 day without ...,suicide
3,ignore just checkin somethin just checking if ...,non-suicide
4,i m a busy man jk all i do is go on reddit amp...,non-suicide


### Remove Punctuations

In [16]:
import string

In [18]:
# Deletion table for every character in string.punctuation. It is identical
# for every row, so it is built once instead of on each call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Delete all ASCII punctuation characters from ``text``.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` with every character listed in ``string.punctuation``
        removed; all other characters (including whitespace) are unchanged.
    """
    return text.translate(_PUNCTUATION_TABLE)

df["text"] = df["text"].apply(remove_punctuation)

### Remove Stop Words

In [20]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kullanıcı\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

# Filled on first use. The English stop-word list does not change between
# rows, so it is loaded once rather than on every call.
_ENGLISH_STOP_WORDS = None


def remove_stopwords(text):
    """Remove English stop words from ``text``.

    The text is tokenized with ``word_tokenize``; every token whose
    lower-cased form appears in NLTK's English stop-word list is dropped and
    the remaining tokens are joined with single spaces.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        The surviving tokens joined by ``' '``.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return ' '.join(
        word for word in word_tokenize(text)
        if word.lower() not in _ENGLISH_STOP_WORDS
    )

# Example usage:
df['text'] = df['text'].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kullanıcı\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kullanıcı\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
df.head()

Unnamed: 0,text,class
0,shower may tell shower step step please weirdo...,non-suicide
1,prevent suicide even starts headed towards con...,suicide
2,suicidal thoughtsi gone 1 day without thinking...,suicide
3,ignore checkin somethin checking alt enough ka...,non-suicide
4,busy man jk go reddit amp x200b,non-suicide


### Stemming

In [26]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

# Three candidate NLTK stemmers, instantiated side by side.
stemmer1=PorterStemmer()  # Porter stemmer
stemmer2=SnowballStemmer("english")  # Snowball stemmer configured for English
stemmer3=LancasterStemmer()  # Lancaster stemmer

In [34]:


def stem_words(text, stemmer=None):
    """Stem every token of ``text`` and join the stems with single spaces.

    Parameters
    ----------
    text : str
        Input string; it is tokenized with ``word_tokenize``.
    stemmer : object, optional
        Any object exposing ``stem(word)``. When omitted, the notebook's
        Porter stemmer (``stemmer1``) is used.

    Returns
    -------
    str
        The stemmed tokens joined by ``' '``.
    """
    if stemmer is None:
        # The notebook instantiates stemmer1/stemmer2/stemmer3 but never a bare
        # `stemmer`, so the previous body raised NameError on a fresh kernel.
        stemmer = stemmer1
    return ' '.join(stemmer.stem(word) for word in word_tokenize(text))

# Example usage:
stem_words("I love to eating cookie")

'i love to eat cooki'

In [None]:
df.head()

Unnamed: 0,text,class
0,shower may tell shower step step pleas weirdo ...,non-suicide
1,prevent suicid even start head toward constant...,suicide
2,suicid thoughtsi gone 1 day without think suic...,suicide
3,ignor checkin somethin check alt enough karma ...,non-suicide
4,busi man jk go reddit amp x200b,non-suicide


### Lemmatization

In [36]:
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download("omw-1.4")
#init the wordnet lemmatizer


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kullanıcı\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kullanıcı\AppData\Roaming\nltk_data...


'I love to eat cookie'

In [41]:
wnl = WordNetLemmatizer()

def lemmatize_words(text, pos='n'):
    """Lemmatize every token of ``text`` with the WordNet lemmatizer ``wnl``.

    Parameters
    ----------
    text : str
        Input string; it is tokenized with ``word_tokenize``.
    pos : str, optional
        Part-of-speech tag forwarded to ``wnl.lemmatize``. The default
        ``'n'`` (noun) reproduces the previous behavior, which leaves verb
        forms such as "programmed" unchanged; pass ``'v'`` to lemmatize
        tokens as verbs instead.

    Returns
    -------
    str
        The lemmatized tokens joined by ``' '``.
    """
    return ' '.join(wnl.lemmatize(word, pos) for word in word_tokenize(text))

# Example usage:
lemmatize_words("programmed programmers programming")

'programmed programmer programming'

## TF - IDF

In [42]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
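# TODO: define the model inputs before vectorizing, e.g.
#train_X = df['text']
#train_y = df['class']  # the label column in this dataset is 'class', not 'label'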



In [None]:
# `train_X` is not assigned in any visible cell (it only appears commented
# out), so this cell raised NameError on a fresh kernel. Use the preprocessed
# text column as the corpus.
# TODO: switch to the training split once a train/test split exists, so the
# vocabulary and IDF weights are learned from training data only.
train_X = df['text']

tf_idf = TfidfVectorizer()

# fit_transform learns the vocabulary and IDF weights and returns the TF-IDF
# matrix in one pass; the transform(train_X) call that used to follow only
# recomputed the same matrix.
X_train_tf = tf_idf.fit_transform(train_X)

### Other representations