In [1]:
import pandas as pd
import numpy as np
import re
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw dataset from the working directory using pandas' Python parsing engine.
df = pd.read_csv(
    filepath_or_buffer='Suicide_Detection.csv',
    engine='python',
)


In [3]:
# One-hot encode the 'class' column and drop its first level; the remaining
# indicator column is the one later cells refer to as 'class_suicide'.
df = pd.get_dummies(
    data=df,
    columns=['class'],
    drop_first=True,
)

In [4]:
# Keep every column except the one at position 0.
# NOTE(review): presumably the first column is a saved row index from the CSV - confirm.
# This cell is not idempotent: re-running it drops one more column each time.
df = df.iloc[:,1:]

In [5]:
# Work on a subset only: keep the rows from position 220000 to the end.
# NOTE(review): presumably done to keep the dense bag-of-words matrix built
# later small enough for memory - confirm.
first_row = 220000
df = df.iloc[first_row:]


In [6]:
#Importamos librerias para trabajar sobre text mining
from nltk import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Feature matrix: every column except the target indicator
X = df.drop(columns=['class_suicide'])
X.head()

Unnamed: 0,text
220000,My inability to sleep... ...It is frightening....
220001,I just woke up at 2 pm hows your day going?
220002,I want more snap streaks so I can see cute boi...
220003,I read an interesting article on notesI starte...
220004,The horrors of the climbing unit in Gym class ...


In [8]:
# Target vector: the indicator column produced by the one-hot encoding step
y = df.loc[:, 'class_suicide']
y.head()

220000    0
220001    0
220002    0
220003    1
220004    0
Name: class_suicide, dtype: uint8

In [9]:
# Separate training and testing sets, stratifying by class so both splits keep
# the same class proportions as the full sample.
from sklearn.model_selection import train_test_split

# Fixed seed: without it the shuffle differs on every run, so the fitted model
# and every metric reported below would not be reproducible.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=RANDOM_STATE,
)

In [10]:
# Clean the train and test texts:
# - tokenize with a tokenizer that drops punctuation (including the angle brackets of HTML tags)
# - stem every token to obtain its lowercase root
# - remove stopwords

In [11]:
def clean_reddit(text, tokenizer, stemmer, stopwords):
    """Normalize one raw post into a space-separated string of stemmed tokens.

    The text is split with ``tokenizer.tokenize``, every token is passed
    through ``stemmer.stem``, and stems found in ``stopwords`` are discarded.

    Parameters
    ----------
    text : str or None
        Raw post. Missing values (``None`` or a float NaN, as pandas uses for
        empty cells) are treated as an empty document.
    tokenizer : object
        Anything exposing ``tokenize(text) -> iterable of str``.
    stemmer : object
        Anything exposing ``stem(word) -> str``.
    stopwords : collection of str
        Stems to drop. They are compared against the *stemmed* tokens, so the
        collection must already be stemmed with the same stemmer.

    Returns
    -------
    str
        The surviving stems joined by single spaces ("" when nothing survives).
    """
    # A missing value has no tokens; returning "" avoids a TypeError inside the tokenizer.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # A list/tuple would be scanned linearly for every token; hash it once per
    # call instead. Other containers are used as given.
    if isinstance(stopwords, (list, tuple)):
        stopwords = set(stopwords)

    # Stem first, filter second: the stopword collection holds stems.
    stems = (stemmer.stem(token) for token in tokenizer.tokenize(text))
    return " ".join(stem for stem in stems if stem not in stopwords)

In [12]:
# Fetch the NLTK 'stopwords' corpus used by the stopword-removal step
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Equipo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# Tokenizer that keeps only runs of word characters, so punctuation is dropped
tokenizer = RegexpTokenizer(r"\w+")

# English Snowball stemmer; the stopword list is stemmed with it as well so it
# can be compared against tokens that have already been stemmed
englishStemmer = SnowballStemmer("english")
stopwords_en = stopwords.words('english')
stopwords_en_stem = list(map(englishStemmer.stem, stopwords_en))

In [14]:
# Cleaned training documents, in the same order as the rows of X_train
clean_train = [
    clean_reddit(raw_post, tokenizer, englishStemmer, stopwords_en_stem)
    for raw_post in X_train['text']
]

In [15]:
# Cleaned test documents, in the same order as the rows of X_test
clean_test = [
    clean_reddit(raw_post, tokenizer, englishStemmer, stopwords_en_stem)
    for raw_post in X_test['text']
]

In [16]:
# Bag-of-words counts. The vocabulary is learned from the training documents
# only and then reused for the test documents, so both matrices share the same
# columns and nothing from the test set influences the features.
count_vectorizer = CountVectorizer()

# fit_transform learns the vocabulary and builds the training matrix in a
# single pass instead of analysing the training corpus twice (fit + transform).
X_train_sparse = count_vectorizer.fit_transform(clean_train)
X_test_sparse = count_vectorizer.transform(clean_test)

In [17]:
# Dense training frame with one column per vocabulary term.
# - get_feature_names() was removed in scikit-learn 1.2; prefer
#   get_feature_names_out() and fall back only on releases that predate it.
# - toarray() yields a plain ndarray instead of the np.matrix from todense().
# Note: this rebinds X_train from the raw-text frame to the count matrix.
X_train = pd.DataFrame(
    X_train_sparse.toarray(),
    columns=(
        count_vectorizer.get_feature_names_out()
        if hasattr(count_vectorizer, 'get_feature_names_out')
        else count_vectorizer.get_feature_names()
    ),
)

In [18]:
# Dense test frame with the same vocabulary columns as the training frame.
# - get_feature_names() was removed in scikit-learn 1.2; prefer
#   get_feature_names_out() and fall back only on releases that predate it.
# - toarray() yields a plain ndarray instead of the np.matrix from todense().
# Note: this rebinds X_test from the raw-text frame to the count matrix.
X_test = pd.DataFrame(
    X_test_sparse.toarray(),
    columns=(
        count_vectorizer.get_feature_names_out()
        if hasattr(count_vectorizer, 'get_feature_names_out')
        else count_vectorizer.get_feature_names()
    ),
)

In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

In [26]:
# Pipeline steps: a single decision-tree classifier.
# random_state is fixed because the tree's fitting procedure is randomized;
# without a seed the fitted tree can differ between runs.
pasos = [('decision_tree', DecisionTreeClassifier(random_state=42))]

In [27]:
# Wrap the step list in a scikit-learn Pipeline
pipe = Pipeline(steps=pasos)

In [29]:
# Fit the pipeline on the training counts and labels; the result of fit() is
# kept under the name the evaluation cells use
model = pipe.fit(X=X_train, y=y_train)

In [30]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [32]:
# Predict on the held-out set, then compute both metrics before printing them
predict_tree_cat = model.predict(X_test)
accuracy_tree = accuracy_score(y_test, predict_tree_cat)
conf_mat_tree = confusion_matrix(y_test, predict_tree_cat)

# Accuracy first, confusion matrix second
print(accuracy_tree)
print(conf_mat_tree)

0.8287512421331567
[[1256  258]
 [ 259 1246]]


In [33]:
# Per-class precision / recall / F1 on the held-out set
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=predict_tree_cat))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83      1514
           1       0.83      0.83      0.83      1505

    accuracy                           0.83      3019
   macro avg       0.83      0.83      0.83      3019
weighted avg       0.83      0.83      0.83      3019

