In [1]:
%pip install pandas-profiling
%pip install contractions
%pip install inflect

Collecting pandas-profiling
  Downloading pandas_profiling-3.2.0-py2.py3-none-any.whl (262 kB)
     |████████████████████████████████| 262 kB 29.2 MB/s            
[?25hCollecting seaborn>=0.10.1
  Downloading seaborn-0.11.2-py3-none-any.whl (292 kB)
     |████████████████████████████████| 292 kB 129.4 MB/s            
Collecting missingno>=0.4.2
  Downloading missingno-0.5.1-py3-none-any.whl (8.7 kB)
Collecting tangled-up-in-unicode==0.2.0
  Downloading tangled_up_in_unicode-0.2.0-py3-none-any.whl (4.7 MB)
     |████████████████████████████████| 4.7 MB 111.6 MB/s            
[?25hCollecting htmlmin>=0.1.12
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pydantic>=1.8.1
  Downloading pydantic-1.9.2-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.2 MB)
     |████████████████████████████████| 11.2 MB 39.9 MB/s            
Collecting visions[type_image_path]==0.7.4
  Downloading visions-0.7.4-py3-none-any.whl (102

In [2]:
# Import libraries for text processing
import pandas as pd
import numpy as np
import sys
from pandas_profiling import ProfileReport
import re, string, unicodedata, contractions, inflect

from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer, PorterStemmer
from nltk.tokenize import RegexpTokenizer

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report, confusion_matrix, plot_precision_recall_curve
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score, recall_score
from sklearn import metrics

import matplotlib.pyplot as plt

In [3]:
 # librería Natural Language Toolkit, usada para trabajar con textos 
import nltk
# Punkt permite separar un texto en frases.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
data = pd.read_csv('SuicidiosProyecto.csv', sep=',', encoding = 'utf-8')
data_t = data

In [5]:
# Preprocessing
def process_text(text):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # Lower case
    text = text.lower()
    # Remove tags
    text = re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    # Remove special characters and digits
    text = re.sub("(\\d|\\W)+"," ",text)
    # Remove hyperlinks
    text = re.sub(r'https?://[^\s\n\r]+', '', text)
    # only removing the hash # sign from the word
    text = re.sub(r'#', '', text)
    # Contractions
    text = contractions.fix(text)
    # Tokenize text
    tokenizer = RegexpTokenizer(r'\w+')
    texts_tokens = tokenizer.tokenize(text)

    texts_clean = []
    for word in texts_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            stem_word = stemmer.stem(word)  # stemming word
            texts_clean.append(stem_word)

    return texts_clean

In [6]:
data_t['processed_text'] = data_t['text'].apply(lambda x: process_text(x))

In [7]:
data_t.head()

Unnamed: 0.1,Unnamed: 0,text,class,processed_text
0,173271,i want to destroy myselffor once everything wa...,suicide,"[want, destroy, myselffor, everyth, start, fee..."
1,336321,I kinda got behind schedule with learning for ...,non-suicide,"[kind, got, behind, schedul, learn, next, week..."
2,256637,I'm just not sure anymoreFirst and foremost: I...,suicide,"[sure, anymorefirst, foremost, brazil, judg, s..."
3,303772,please give me a reason to liveThats too much ...,suicide,"[pleas, give, reason, livethat, much, reason, ..."
4,293747,27f struggling to find meaning moving forwardI...,suicide,"[f, struggl, find, mean, move, forwardi, admit..."


In [8]:
data_t['processed_text'] = data_t['processed_text'].apply(lambda x: ' '.join(map(str, x)))
data_t

Unnamed: 0.1,Unnamed: 0,text,class,processed_text
0,173271,i want to destroy myselffor once everything wa...,suicide,want destroy myselffor everyth start feel okay...
1,336321,I kinda got behind schedule with learning for ...,non-suicide,kind got behind schedul learn next week testwe...
2,256637,I'm just not sure anymoreFirst and foremost: I...,suicide,sure anymorefirst foremost brazil judg second ...
3,303772,please give me a reason to liveThats too much ...,suicide,pleas give reason livethat much reason live li...
4,293747,27f struggling to find meaning moving forwardI...,suicide,f struggl find mean move forwardi admit bit lo...
...,...,...,...,...
195695,248038,Drop some cool new cereal ideas Like what woul...,non-suicide,drop cool new cereal idea like would ideal cereal
195696,216516,Unpopular opinion but cats deserve love and re...,non-suicide,unpopular opinion cat deserv love respect much...
195697,199341,Hey guys :) How yall doin?,non-suicide,hey guy doin
195698,145373,uhm I covered my dog in a blanket because the ...,non-suicide,uhm cover dog blanket light wake woke ran wall


In [10]:
X, y = data_t['processed_text'], data_t['class']
y = (y == 'suicide').astype(int)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
y_train

21358     1
65465     1
164476    0
165082    1
75228     1
         ..
119879    0
103694    0
131932    0
146867    1
121958    0
Name: class, Length: 156560, dtype: int64

### TF-IDF Vectorizer

In [13]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,3))

X_tfidf_train= tfidf_vectorizer.fit_transform(X_train)
X_tfidf_test=tfidf_vectorizer.transform(X_test)

In [17]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

## LinearSVC (Support Vector Machines)

In [18]:
lsvc = LinearSVC(C=100, penalty='l1', max_iter=1000, dual=False)
lsvc.fit(X_tfidf_train, y_train)

# Select the best features that has high weight
fs = SelectFromModel(lsvc, prefit=True)
X_selection = fs.transform(X_tfidf_train)
X_test_selection = fs.transform(X_tfidf_test)

In [19]:
lsvc_tfidf = LinearSVC(C=1000, penalty='l1', max_iter=1000, dual=False)

lsvc_tfidf.fit(X_selection, y_train)
y_predict_tfidf = lsvc_tfidf.predict(X_test_selection)

linear_svm_tfidf_results = metrics.precision_recall_fscore_support(y_test, y_predict_tfidf)

### Evaluation of SVM

In [23]:
# Show confusion matrix
cm = confusion_matrix(y_test, y_predict_tfidf)
print(cm)

# Show precision and recall
print(classification_report(y_test, y_predict_tfidf))

# Show accuracy
print('Accuracy: ', tfidf_acc)

# Show f1 score
print('F1 score: ', linear_svm_tfidf_results[2])

[[20770  1299]
 [ 1609 15462]]
              precision    recall  f1-score   support

           0       0.93      0.94      0.93     22069
           1       0.92      0.91      0.91     17071

    accuracy                           0.93     39140
   macro avg       0.93      0.92      0.92     39140
weighted avg       0.93      0.93      0.93     39140

Accuracy:  0.9257026060296372
F1 score:  [0.93457523 0.91404587]


### Prediction function

In [26]:
# Make predict function
def predict(text):
    text = process_text(text)
    text = ' '.join(map(str, text))
    text = [text]
    text = tfidf_vectorizer.transform(text)
    text = fs.transform(text)
    return lsvc_tfidf.predict(text)

### Model Persistance

In [None]:
# Save model, vectorizer and feature selection
import pickle
pickle.dump(lsvc_tfidf, open('model_svm.pkl','wb'))
pickle.dump(tfidf_vectorizer, open('tfidf_vectorizer.pkl','wb'))
pickle.dump(fs, open('feature_selection.pkl','wb'))

In [None]:
# Load model, vectorizer and feature selection
import pickle
model_svm = pickle.load(open('model_svm.pkl','rb'))
tfidf_vectorizer = pickle.load(open('tfidf_vectorizer.pkl','rb'))
fs = pickle.load(open('feature_selection.pkl','rb'))


## Logistic Regression

In [42]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
lr.fit(X_tfidf_train, y_train)

F1 score:  0.9414406322974672


### Evaluation for LR

In [57]:
y_predict_lr = lr.predict(X_tfidf_test)
lr_results = metrics.precision_recall_fscore_support(y_test, y_predict_lr)

# Show confusion matrix
cm = confusion_matrix(y_test, y_predict_lr)
print(cm)

# Show precision and recall
print(classification_report(y_test, y_predict_lr))

# Show f1 score
print('F1 score: ', lr_results[2][0])


[[20964  1105]
 [ 1503 15568]]
              precision    recall  f1-score   support

           0       0.93      0.95      0.94     22069
           1       0.93      0.91      0.92     17071

    accuracy                           0.93     39140
   macro avg       0.93      0.93      0.93     39140
weighted avg       0.93      0.93      0.93     39140

F1 score:  0.9414406322974672


### Prediction Function

In [64]:
def predict_lr(text):
    text = process_text(text)
    text = ' '.join(map(str, text))
    text = [text]
    text = tfidf_vectorizer.transform(text)
    return lr.predict(text)

### Model Persistance

In [80]:
import pickle
pickle.dump(lr, open('model_lr.pkl','wb'))

# Naive Bayes

In [73]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_tfidf_train, y_train)

y_predict_nb = nb.predict(X_tfidf_test)
nb_results = metrics.precision_recall_fscore_support(y_test, y_predict_nb)

### Evaluation for NB

In [82]:
y_predict_nb = nb.predict(X_tfidf_test)
lr_results = metrics.precision_recall_fscore_support(y_test, y_predict_nb)

# Show confusion matrix
cm = confusion_matrix(y_test, y_predict_nb)
print(cm)

# Show precision and recall
print(classification_report(y_test, y_predict_nb))

# Show f1 score
print('F1 score: ', lr_results[2][0])

[[19748  2321]
 [ 1172 15899]]
              precision    recall  f1-score   support

           0       0.94      0.89      0.92     22069
           1       0.87      0.93      0.90     17071

    accuracy                           0.91     39140
   macro avg       0.91      0.91      0.91     39140
weighted avg       0.91      0.91      0.91     39140

F1 score:  0.9187466561213333


### Prediction Function

In [78]:
def predict_nb(text):
    text = process_text(text)
    text = ' '.join(map(str, text))
    text = [text]
    text = tfidf_vectorizer.transform(text)
    return nb.predict(text)

### Model Persistance

In [81]:
import pickle
pickle.dump(nb, open('model_nb.pkl','wb'))

In [76]:
# Use random forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500, max_depth=20, random_state=0)
rf.fit(X_tfidf_train, y_train)

y_predict_rf = rf.predict(X_tfidf_test)
rf_results = metrics.precision_recall_fscore_support(y_test, y_predict_rf)

In [77]:
rf_results

(array([0.83547035, 0.89405064]),
 array([0.93003761, 0.76322418]),
 array([0.88022129, 0.82347364]),
 array([22069, 17071]))