### Suicidal Text Model/Warning System

In [226]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import spacy
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import text
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the raw corpus, then discard the 'Unnamed: 0' column
# (presumably an index that was written out with the CSV — confirm).
corpus = pd.read_csv('../data/corpus.csv')
corpus = corpus.drop(columns=['Unnamed: 0'])

In [65]:
# Load the spaCy pipeline named 'en_trf_distilbertbaseuncased_lg'.
# NOTE(review): `nlp` is not referenced by any cell visible in this part of the
# notebook — confirm it is still needed before paying the load cost.
nlp = spacy.load('en_trf_distilbertbaseuncased_lg')

In [231]:
# Keep only the rows whose subreddit label is exactly 'depression'.
is_depression = corpus['subreddit'].eq('depression')
depression = corpus.loc[is_depression]

In [232]:
# Display the depression-only subset (pandas elides the middle rows of a long frame).
depression

Unnamed: 0,full_text,subreddit,class,neg,neu,pos,comp
30000,lifes unfair for us lonely ones when i was 12-...,depression,1,0.232,0.586,0.182,-0.9576
30001,disconnected does anyone else feel like what h...,depression,1,0.064,0.788,0.147,0.4606
30002,i can’t ever focus and it’s getting worse. i’m...,depression,1,0.175,0.825,0.000,-0.8930
30003,"need a word of courage today, please help me :...",depression,1,0.151,0.616,0.233,0.6425
30004,"i finally got my anxiety under control, but no...",depression,1,0.272,0.617,0.112,-0.9661
...,...,...,...,...,...,...,...
59995,need to be creative i used to draw and paint a...,depression,1,0.045,0.741,0.213,0.8750
59996,too late to turn my life around i got denied c...,depression,1,0.122,0.815,0.063,-0.8086
59997,does else get nighttime depression and anxiety...,depression,1,0.310,0.690,0.000,-0.6597
59998,i feel really alone. today kinda sucked. i sta...,depression,1,0.208,0.671,0.121,-0.9787


In [145]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

In [210]:
# Extra noise words to drop on top of scikit-learn's built-in English list.
# 'gt' replaces the original '\n&gt;' entry: the vectorizer's default tokenizer
# keeps only runs of word characters, so an HTML-escaped '>' reaches the
# vocabulary as the token 'gt' and the literal '\n&gt;' could never match.
# The duplicated 'don' entry has also been removed.
my_stop = ['no_text', 'gt', 'https', 'com', 'www', 'don', 'just', 'like',
           'know', 'really', 'things', 'im']
stop_words = text.ENGLISH_STOP_WORDS.union(my_stop)
# Pass a list to the vectorizer: recent scikit-learn releases validate
# `stop_words` as 'english', a list, or None and reject a frozenset.
# `stop_words` itself stays a frozenset for any other cell that uses it.
tfidf = TfidfVectorizer(stop_words=sorted(stop_words), max_features=200)

In [212]:
# Build ONE pseudo-document from a reproducible sample of 100 depression posts.
# - The text lives in the 'full_text' column; a DataFrame has no `.str`
#   accessor, so the column must be selected before concatenating.
# - sep=' ' keeps the last word of one post from fusing with the first word
#   of the next one.
# - random_state pins the sample so a re-run reproduces the same vocabulary.
sample_text = depression['full_text'].sample(100, random_state=42).str.cat(sep=' ')

# Fitting on a single document makes every IDF term identical, so the weights
# below are effectively L2-normalised term frequencies for the sample.
dtm_corp = tfidf.fit_transform([sample_text])

# `get_feature_names` was removed in newer scikit-learn in favour of
# `get_feature_names_out`; support both so the cell runs on either version.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
dtm_df = pd.DataFrame(dtm_corp.toarray(), columns=feature_names)

# 60 highest-weighted terms, kept as a one-column DataFrame (column label 0).
corp_top_60 = pd.DataFrame(dtm_df.sum().nlargest(60))

In [213]:
# Show the highest-weighted terms found in the sampled depression text.
corp_top_60

Unnamed: 0,0
feel,0.434117
want,0.263716
depression,0.18663
time,0.18663
help,0.178516
friends,0.174458
people,0.15823
life,0.154172
think,0.129829
day,0.121715


In [214]:
# Inspect the fitted document-term matrix (one row, because a single concatenated document was fitted).
dtm_corp

<1x200 sparse matrix of type '<class 'numpy.float64'>'
	with 200 stored elements in Compressed Sparse Row format>

In [215]:
# Example user message to score; wrapped in a list because it is passed to
# `tfidf.transform`, which takes an iterable of documents.
query_text = 'I am just extremely depressed and sad'
query = [query_text]

In [216]:
# Project the query onto the vocabulary already fitted on the depression sample (transform only, no re-fit).
query_vect = tfidf.transform(query)

In [217]:
# Inspect the sparse query vector; the stored-element count shows how many query terms hit the vocabulary.
query_vect

<1x200 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [218]:
# Cosine similarity between the single corpus-level TF-IDF row and the query row (a 1x1 result).
cos_matrix = cosine_similarity(dtm_corp,query_vect)

In [219]:
# Display the similarity score.
cos_matrix

array([[0.10327871]])

In [175]:
# Two example scores used by the weighted-average scratch calculation.
# NOTE(review): what `l` and `s` stand for is not shown here — confirm and
# consider more descriptive names.
l, s = 0.8, 0.9

In [177]:
# Weighted blend: 60% of `l` plus 40% of `s`.
# NOTE(review): the 0.6 / 0.4 weights are unexplained magic numbers — confirm their origin.
0.6*l + 0.4*s

0.8400000000000001

In [230]:
# Select column 0 (the summed TF-IDF weight per term) as a Series.
corp_top_60[0]

feel          0.434117
want          0.263716
depression    0.186630
time          0.186630
help          0.178516
friends       0.174458
people        0.158230
life          0.154172
think         0.129829
day           0.121715
going         0.121715
talk          0.121715
bad           0.117658
need          0.117658
school        0.117658
happy         0.113601
anymore       0.105486
years         0.105486
family        0.101429
work          0.097372
depressed     0.093315
good          0.093315
ve            0.093315
did           0.089258
ll            0.089258
right         0.089258
say           0.089258
try           0.089258
friend        0.085201
job           0.085201
lot           0.081143
thoughts      0.081143
way           0.081143
worse         0.081143
better        0.077086
getting       0.077086
got           0.077086
today         0.077086
didn          0.073029
feeling       0.073029
new           0.073029
point         0.073029
shit          0.073029
best       