# YZV 311E DATA MINING PROJECT
# <strong>Detection Of Sucidal Texts</strong>
## Zehra Demir
## Nurbanu Gök

### Importing the necessary libraries

In [2]:
import numpy as np
import pandas as pd

### Reading the data file

In [3]:
df = pd.read_csv("data/Suicide_Detection_50k.csv")

In [4]:
df.head()


Unnamed: 0,text,class
0,How do you shower? May you tell me how you sho...,non-suicide
1,How do I prevent suicide before it even starts...,suicide
2,Suicidal ThoughtsI haven't gone 1 day without ...,suicide
3,"Ignore, just checkin somethin' Just checking i...",non-suicide
4,i’m a busy man 😂😂😂😂😂😂😂😂 jk all i do is go on r...,non-suicide


In [5]:
df.shape

(50000, 2)

## Data Preprocessing

In [16]:
df.isna().sum()

text     0
class    0
dtype: int64

In [17]:
df.duplicated().sum()

0

In [18]:
df.dtypes

text     object
class    object
dtype: object

### Some work has been done when reducing the data set to be able to fit it in GitHub repository. This is why we do not have any null or duplicate values now. You can find the details in data_reducing.ipynb file.

### We can now proceed with the text preprocessing part.

## Text Preprocessing

### Remove URLs etc.

In [6]:
import re

In [7]:
def clean(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # Remove mentions/handles
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)
    return text
df['text'] = df['text'].apply(clean)

### Lowercasing the text data

In [8]:
df['text'] = df['text'].str.lower()
df.head()

Unnamed: 0,text,class
0,how do you shower may you tell me how you show...,non-suicide
1,how do i prevent suicide before it even starts...,suicide
2,suicidal thoughtsi haven t gone 1 day without ...,suicide
3,ignore just checkin somethin just checking if ...,non-suicide
4,i m a busy man jk all i do is go on reddit amp...,non-suicide


#

### Emoji and Emoticon Handling

In [9]:
pip install emoji

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import emoji

#### Converting the emojis

In [11]:


def convert_emojis_to_text(text):
    return emoji.demojize(text)

df['text'] = df['text'].apply(convert_emojis_to_text)


In [12]:
df.head(5)

Unnamed: 0,text,class
0,how do you shower may you tell me how you show...,non-suicide
1,how do i prevent suicide before it even starts...,suicide
2,suicidal thoughtsi haven t gone 1 day without ...,suicide
3,ignore just checkin somethin just checking if ...,non-suicide
4,i m a busy man jk all i do is go on reddit amp...,non-suicide


#### ... or removing the emojis

In [5]:
pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.7-py3-none-any.whl (235 kB)
     -------------------------------------- 235.5/235.5 kB 1.6 MB/s eta 0:00:00
Installing collected packages: unidecode
Successfully installed unidecode-1.3.7
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
pip install clean-text

In [7]:
from cleantext import clean

In [8]:
def clean_emojis(text):

    return clean(text, no_emoji=True)

df['text'] = df['text'].apply(clean_emojis)

In [9]:
df.head()

Unnamed: 0,text,class
0,how do you shower may you tell me how you show...,non-suicide
1,how do i prevent suicide before it even starts...,suicide
2,suicidal thoughtsi haven t gone 1 day without ...,suicide
3,ignore just checkin somethin just checking if ...,non-suicide
4,i m a busy man jk all i do is go on reddit amp...,non-suicide


### Remove Punctuations

In [13]:
import string

In [14]:
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

### Remove Stop Words

In [15]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kullanıcı\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = text.split()
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)

df['text'] = df['text'].apply(remove_stopwords)

In [20]:
df.head()

Unnamed: 0,text,class
0,shower may tell shower step step please weirdo...,non-suicide
1,prevent suicide even starts headed towards con...,suicide
2,suicidal thoughtsi gone 1 day without thinking...,suicide
3,ignore checkin somethin checking alt enough ka...,non-suicide
4,busy man jk go reddit amp x200b,non-suicide


### Stemming

In [24]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

stemmer=PorterStemmer()
stemmer2=SnowballStemmer("english")
stemmer3=LancasterStemmer()

In [40]:
nltk.download("punkt")
# Initialize Python porter stemmer
ps = PorterStemmer()
def stemming(text):
    text = [ps.stem(word) for word in text.split()]
    return ' '.join(text)

df['text'] = df['text'].apply(stemming)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kullanıcı\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [41]:
df.head()

Unnamed: 0,text,class
0,shower may tell shower step step pleas weirdo ...,non-suicide
1,prevent suicid even start head toward constant...,suicide
2,suicid thoughtsi gone 1 day without think suic...,suicide
3,ignor checkin somethin check alt enough karma ...,non-suicide
4,busi man jk go reddit amp x200b,non-suicide


In [28]:
def stem_words(text):
    word_tokens = word_tokenize(text)
    stems = [stemmer.stem(word) for word in word_tokens]
    return ' '.join(stems)

df['text'] = df['text'].apply(stem_words)

NameError: name 'text' is not defined

## TF - IDF

### Other representations