# YZV 311E DATA MINING PROJECT
# <strong>Detection of Suicidal Texts</strong>
## Zehra Demir
## Nurbanu Gök

### Importing the necessary libraries

In [9]:
import numpy as np
import pandas as pd

### Reading the data file

In [10]:
# Load the reduced data set; the path is relative to the notebook's working directory.
df = pd.read_csv("data/Suicide_Detection_50k.csv")

In [14]:
df.head()

Unnamed: 0,text,class
0,How do you shower? May you tell me how you sho...,non-suicide
1,How do I prevent suicide before it even starts...,suicide
2,Suicidal ThoughtsI haven't gone 1 day without ...,suicide
3,"Ignore, just checkin somethin' Just checking i...",non-suicide
4,i’m a busy man 😂😂😂😂😂😂😂😂 jk all i do is go on r...,non-suicide


In [15]:
df.shape

(50000, 2)

## Data Preprocessing

In [16]:
df.isna().sum()

text     0
class    0
dtype: int64

In [17]:
df.duplicated().sum()

0

In [18]:
df.dtypes

text     object
class    object
dtype: object

### Some cleaning was already done while reducing the data set so that it fits in the GitHub repository. This is why there are no null or duplicate values now. You can find the details in the data_reducing.ipynb file.

### We can now proceed with the text preprocessing part.

## Text Preprocessing

### Remove URLs etc.

In [12]:
import re

In [18]:
 def clean(text: str) -> str:
     """Strip URLs, @-mentions and non-alphanumeric characters from *text*.

     Steps, in order:

     1. Remove every token that starts with ``http`` (up to the next
        whitespace).
     2. Remove ``@handle`` mentions. The handle is matched with ``\\w+`` so
        that underscores are part of it; matching only letters and digits
        left the tail of handles such as ``@john_doe`` in the text.
     3. Replace each remaining run of characters outside ``A-Z``, ``a-z`` and
        ``0-9`` with one space. This also removes punctuation, apostrophes
        and emoji, so any emoji handling has to run before this function.

     Leading and trailing spaces produced by step 3 are not trimmed.

     Args:
         text: Raw post text.

     Returns:
         The cleaned text.
     """
     text = re.sub(r'http\S+', '', text)  # Remove URLs
     text = re.sub(r'@\w+', '', text)  # Remove mentions/handles, underscores included
     text = re.sub(r'[^A-Za-z0-9]+', ' ', text)  # Everything else becomes a space
     return text
# Run the regex clean-up over every post in the text column.
df['text'] = df['text'].map(clean)

### Lowercasing the text data

In [15]:
# Convert every post to lowercase, then preview the first rows.
df['text'] = df['text'].str.lower()
df.head()

Unnamed: 0,text,class
0,how do you shower? may you tell me how you sho...,non-suicide
1,how do i prevent suicide before it even starts...,suicide
2,suicidal thoughtsi haven't gone 1 day without ...,suicide
3,"ignore, just checkin somethin' just checking i...",non-suicide
4,i’m a busy man 😂😂😂😂😂😂😂😂 jk all i do is go on r...,non-suicide


#

### Emoji and Emoticon Handling

In [5]:
pip install emoji

Collecting emoji
  Downloading emoji-2.8.0-py2.py3-none-any.whl (358 kB)
     -------------------------------------- 358.9/358.9 kB 2.5 MB/s eta 0:00:00
Installing collected packages: emoji
Successfully installed emoji-2.8.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
import emoji

#### Converting the emojis

In [19]:


def convert_emojis_to_text(text):
    """Return *text* with each emoji replaced by its textual name.

    Delegates to ``emoji.demojize`` with its default settings, so an emoji
    such as 😂 becomes ``:face_with_tears_of_joy:``.
    """
    demojized = emoji.demojize(text)
    return demojized


# Swap the emojis in every post for their textual names.
df['text'] = df['text'].map(convert_emojis_to_text)


In [11]:
df.head(5)

Unnamed: 0,text,class
0,How do you shower? May you tell me how you sho...,non-suicide
1,How do I prevent suicide before it even starts...,suicide
2,Suicidal ThoughtsI haven't gone 1 day without ...,suicide
3,"Ignore, just checkin somethin' Just checking i...",non-suicide
4,i’m a busy man :face_with_tears_of_joy::face_w...,non-suicide


#### ... or removing the emojis

In [5]:
pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.7-py3-none-any.whl (235 kB)
     -------------------------------------- 235.5/235.5 kB 1.6 MB/s eta 0:00:00
Installing collected packages: unidecode
Successfully installed unidecode-1.3.7
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
pip install clean-text

In [6]:
from cleantext import clean

In [None]:
def clean_emojis(text):
    """Return *text* with emojis removed by the clean-text package.

    The package is imported locally and called through its module name
    because this notebook binds the global name ``clean`` twice: once to its
    own regex helper and once to ``cleantext.clean``. Whichever cell ran last
    owns the global name, and the regex helper does not accept ``no_emoji``.

    NOTE(review): ``cleantext.clean`` also applies its other default
    normalisations (presumably lowercasing and ASCII transliteration) —
    confirm against the clean-text documentation.

    Args:
        text: Text that may contain emojis.

    Returns:
        The text as returned by ``cleantext.clean(text, no_emoji=True)``.
    """
    import cleantext

    return cleantext.clean(text, no_emoji=True)

#df.text.apply(clean_emojis)

### Remove Punctuations

In [7]:
import string

In [None]:
# Built once at definition time instead of on every call: a translation table
# that deletes each character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return *text* with all ``string.punctuation`` characters deleted.

    Only the ASCII punctuation in ``string.punctuation`` is removed;
    typographic characters such as the curly apostrophe are left in place.

    Args:
        text: The string to strip punctuation from.

    Returns:
        A new string without ASCII punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

### Remove Stop Words

In [None]:
import nltk
# Besides the stop-word lists, word_tokenize (used in the cells below) needs
# the Punkt tokenizer data. Older NLTK releases load it from 'punkt', newer
# ones from 'punkt_tab'; requesting an id the installed index does not know
# only reports an error message and does not abort the cell.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize




# English stop-word list, held as a set for fast membership checks
stop_words = set(stopwords.words('english'))

def remove_stopwords_and_tokenize(text):
    """Tokenize *text*, drop English stop words and return the rest as one string.

    Tokens are compared case-insensitively against ``stop_words``; the
    surviving tokens are joined back together with single spaces.
    """
    return ' '.join(
        word for word in word_tokenize(text) if word.lower() not in stop_words
    )

# Store the stop-word-free text in a new column, leaving 'text' untouched
df['cleaned_text'] = df['text'].map(remove_stopwords_and_tokenize)

# Show the original and the cleaned text side by side
print(df[['text', 'cleaned_text']])


In [None]:

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return the items of *tokens* that are not English stop words.

    Matching against ``stop_words`` is case-insensitive, and the order of the
    remaining tokens is preserved.
    """
    return list(filter(lambda tok: tok.lower() not in stop_words, tokens))

# No cell visible above creates a 'tokens' column, so reading df['tokens']
# directly raises KeyError. Build it from the text first when it is missing.
if 'tokens' not in df.columns:
    from nltk.tokenize import word_tokenize

    df['tokens'] = df['text'].apply(word_tokenize)

# Drop the English stop words from each token list.
df['tokens'] = df['tokens'].apply(remove_stopwords)


### Tokenization

In [8]:
from nltk.tokenize import word_tokenize

### Stemming

## TF-IDF

### Other representations