In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import numpy as np
from readability import Readability
import spacy
from tqdm.notebook import tqdm
tqdm.pandas(desc='progress-bar')

import multiprocessing
cores = multiprocessing.cpu_count()
from ipywidgets import FloatProgress


In [3]:
# Load the two raw CSV exports. The paths are relative, so both files must sit
# in the notebook's working directory.
depression = pd.read_csv('depression_raw.csv')
suicide = pd.read_csv('suicide_raw.csv')

# Depression Data Exploration

In [4]:
# Preview the first rows to see which columns the raw export contains.
depression.head()

Unnamed: 0,username,text,posts,url,date_posted
0,ayoubalharchal,\n\n\nDo you ever feel like everyone is watchi...,8,https://www.takethislife.com/depression/do-you...,"07-19-22, 02:47 PM"
1,ayoubalharchal,\n\n\nDo you compare yourself to other people ...,8,https://www.takethislife.com/depression/do-you...,"07-14-22, 05:15 PM"
2,ayoubalharchal,\n\n\nDo you find yourself apologizing all the...,8,https://www.takethislife.com/depression/do-you...,"07-14-22, 08:34 AM"
3,ayoubalharchal,"\n\n\nIF SO, READING & UNDERSTANDING THIS WILL...",8,https://www.takethislife.com/depression/do-you...,"07-12-22, 05:00 PM"
4,ayoubalharchal,\n\n\nAren’t you a little bit tired of people ...,8,https://www.takethislife.com/depression/aren-t...,"07-12-22, 07:56 AM"


In [5]:
# Check the inferred column types; date_posted comes in as a plain string (object).
depression.dtypes

username       object
text           object
posts           int64
url            object
date_posted    object
dtype: object

In [6]:
# describe() summarises one row per *post*, so users with many posts contribute
# their `posts` value many times and the statistics are skewed towards them.
# The per-user mean and median are therefore calculated manually further down.
depression.describe()

Unnamed: 0,posts
count,19328.0
mean,1230.330608
std,2691.801545
min,0.0
25%,8.0
50%,87.0
75%,939.0
max,18210.0


In [7]:
# Collapse to one row per user by taking the largest `posts` value seen for that
# user, then order users from least to most active.
# NOTE: groupby leaves out rows whose username is NaN (default dropna=True).
max_posts_by_user = depression.groupby('username')['posts'].max()
user_post_count = max_posts_by_user.to_frame().sort_values('posts')
user_post_count

Unnamed: 0_level_0,posts
username,Unnamed: 1_level_1
JustBreathe,0
mr201,1
UK20101982,1
HelpMeNowPlease,1
HelpMe22,1
...,...
irishred,11944
Blue Girl,13622
Road Ratt,13622
Aries,18210


In [8]:
# Per-user statistics: every user counts exactly once here, unlike describe() on
# the per-post frame.
posts_per_user = user_post_count['posts']
mean, median = np.mean(posts_per_user), np.median(posts_per_user)
print(f"The average number of posts per user is {mean} and the median is {median}.")

The average number of posts per user is 105.58227323008849 and the median is 4.0.


In [9]:
# Count missing values per column.
# The one "missing" username is not really missing: that user is literally
# called "nan", which was presumably interpreted as NaN when the CSV was read.
depression.isna().sum()

username       1
text           0
posts          0
url            0
date_posted    0
dtype: int64

In [10]:
# Normalise usernames: lower-case and replace spaces with underscores. str() also
# turns the single NaN username into the literal string 'nan'.
depression['username'] = [str(name).lower().replace(' ', '_') for name in depression['username']]
# Make sure the post counter is numeric (to_numeric raises on unparseable values).
depression['posts'] = pd.to_numeric(depression['posts'])
# Timestamps look like '07-19-22, 02:47 PM', i.e. a 12-hour clock with AM/PM.
# strptime only honours %p when the hour is parsed with %I; with %H the AM/PM
# marker is silently ignored. Only the calendar date is kept.
depression['date_posted'] = [
    datetime.datetime.strptime(stamp, '%m-%d-%y, %I:%M %p').date()
    for stamp in depression['date_posted']
]

In [11]:
# (rows, columns) before any rows are filtered out.
depression.shape

(19328, 5)

In [12]:
# Keep only posts whose text is at least 30 characters long.
has_min_length = depression['text'].map(lambda body: len(str(body)) >= 30)
# Keep only rows whose user shows a post count of one or more (drops the 0 rows).
has_posts = depression['posts'] >= 1
# A user who posted the same text twice gets two different urls, so a whole-row
# dedupe would not catch it; compare on (text, username) instead.
depression = (
    depression[has_min_length & has_posts]
    .drop_duplicates(subset=['text', 'username'])
)

In [13]:
# (rows, columns) after short posts, zero-post users and duplicate posts were removed.
depression.shape

(19155, 5)

In [14]:
# Shallow, text-only features for each post:
#   post_length - number of characters in the text
#   word_count  - number of pieces after splitting on a single space (newlines do
#                 not split, and consecutive spaces produce empty pieces)
depression['post_length'] = depression['text'].map(len)
depression['word_count'] = depression['text'].map(lambda body: len(body.split(' ')))

In [15]:
def readability_score(text):
    """Return the Flesch score of ``text``, or ``None`` if it cannot be scored.

    Parameters
    ----------
    text : str
        Raw post text handed to ``Readability``.

    Returns
    -------
    float or None
        The ``score`` attribute of the Flesch result. ``None`` when the library
        raises while scoring (presumably e.g. when the text is too short for it —
        confirm against the library's documentation).

    Notes
    -----
    Only ``Exception`` subclasses are swallowed. ``KeyboardInterrupt`` and
    ``SystemExit`` propagate, so a long-running ``.apply`` over many posts can
    still be interrupted from the notebook.
    """
    try:
        return Readability(text).flesch().score
    except Exception:
        # Best effort: a post that cannot be scored simply gets no score.
        return None

In [16]:
# Flesch readability score as a feature: a higher score means the text is easier
# to read. Posts that could not be scored end up without a value.
depression['readability_score'] = depression['text'].apply(readability_score)

In [39]:
# Load spaCy's 'en_core_web_sm' pipeline once and keep it at module level for reuse.
nlp = spacy.load('en_core_web_sm')

def get_pos_tags(text):
    """Run ``text`` through the module-level ``nlp`` pipeline and tag each token.

    Returns a list with one ``(token.pos_, token.tag_)`` tuple per token, in
    document order.
    """
    pairs = []
    for token in nlp(text):
        pairs.append((token.pos_, token.tag_))
    return pairs

In [43]:
# Number of posts that remain to be tagged.
len(depression.text)

19155

In [51]:
# Renumber the rows 0..n-1: the filtering above removed rows and left gaps in the
# index, so afterwards a row's label and its position agree again.
depression = depression.reset_index(drop=True)

In [52]:
# POS-tag every post. Rows are walked in order rather than looked up by index
# label, so the result lines up with the frame's rows whatever its index is, and
# the progress total is taken from the data instead of being hard-coded.
texts = depression['text']
total = len(texts)
tags = []
for i, post in enumerate(texts):
    # '\r' keeps rewriting the same output line: a lightweight progress counter
    print(f'{i}/{total}', end='\r')
    tags.append(get_pos_tags(post))

19154/19155

In [53]:
# Store each post's list of (pos, tag) tuples as a column; `tags` is in row order.
depression['pos_tags'] = tags

In [55]:
# depression.to_csv('depression_features.csv',index=None)

In [82]:
# Collapse every (pos, tag) tuple into one 'POS_TAG' string, giving one list of
# such strings per post, in row order.
all_joined = [
    ['_'.join(pair) for pair in post_tags]
    for post_tags in depression['pos_tags']
]


In [88]:
# Store the per-post lists of 'POS_TAG' strings as a column.
depression['joined_tags'] = all_joined

In [105]:
from collections import Counter

# For each post, map every distinct 'POS_TAG' string to the percentage of the
# post's tokens that carry it, rounded to two decimals.
tag_dicts = []
for post_tokens in depression['joined_tags']:
    n_tokens = len(post_tokens)
    tag_dicts.append({
        tag: round(hits / n_tokens * 100.0, 2)
        for tag, hits in Counter(post_tokens).items()
    })

In [107]:
# Store each post's {tag: percentage} dict as a column.
depression['tag_ratios'] = tag_dicts

In [108]:
# Display the frame with all features built so far.
depression

Unnamed: 0,username,text,posts,url,date_posted,post_length,word_count,readability_score,pos_tags,joined_tags,tag_ratios
0,ayoubalharchal,\n\n\nDo you ever feel like everyone is watchi...,8,https://www.takethislife.com/depression/do-you...,2022-07-19,2324,400,82.616667,"[(SPACE, _SP), (AUX, VBP), (PRON, PRP), (ADV, ...","[SPACE__SP, AUX_VBP, PRON_PRP, ADV_RB, VERB_VB...","{'SPACE__SP': 4.76, 'AUX_VBP': 2.75, 'PRON_PRP..."
1,ayoubalharchal,\n\n\nDo you compare yourself to other people ...,8,https://www.takethislife.com/depression/do-you...,2022-07-14,2767,427,64.490101,"[(SPACE, _SP), (AUX, VBP), (PRON, PRP), (VERB,...","[SPACE__SP, AUX_VBP, PRON_PRP, VERB_VB, PRON_P...","{'SPACE__SP': 5.91, 'AUX_VBP': 1.39, 'PRON_PRP..."
2,ayoubalharchal,\n\n\nDo you find yourself apologizing all the...,8,https://www.takethislife.com/depression/do-you...,2022-07-14,2302,379,77.653752,"[(SPACE, _SP), (AUX, VBP), (PRON, PRP), (VERB,...","[SPACE__SP, AUX_VBP, PRON_PRP, VERB_VB, PRON_P...","{'SPACE__SP': 5.11, 'AUX_VBP': 2.46, 'PRON_PRP..."
3,ayoubalharchal,"\n\n\nIF SO, READING & UNDERSTANDING THIS WILL...",8,https://www.takethislife.com/depression/do-you...,2022-07-12,2277,377,74.071391,"[(SPACE, _SP), (NOUN, NN), (PROPN, NNP), (PUNC...","[SPACE__SP, NOUN_NN, PROPN_NNP, PUNCT_,, PROPN...","{'SPACE__SP': 5.59, 'NOUN_NN': 10.8, 'PROPN_NN..."
4,ayoubalharchal,\n\n\nAren’t you a little bit tired of people ...,8,https://www.takethislife.com/depression/aren-t...,2022-07-12,2721,467,79.586657,"[(SPACE, _SP), (AUX, VBP), (PART, RB), (PRON, ...","[SPACE__SP, AUX_VBP, PART_RB, PRON_PRP, DET_DT...","{'SPACE__SP': 4.57, 'AUX_VBP': 1.47, 'PART_RB'..."
...,...,...,...,...,...,...,...,...,...,...,...
19150,sweetblood123,\n\n\nWhy live if we just die in the end-when ...,118,https://www.takethislife.com/depression/why-li...,2006-06-23,276,52,,"[(SPACE, _SP), (SCONJ, WRB), (VERB, VBP), (SCO...","[SPACE__SP, SCONJ_WRB, VERB_VBP, SCONJ_IN, PRO...","{'SPACE__SP': 3.12, 'SCONJ_WRB': 4.69, 'VERB_V..."
19151,imissulotz,\n\n\n:( this is my first post ever on here bu...,1,https://www.takethislife.com/depression/being-...,2006-06-21,1441,314,-215.103734,"[(SPACE, _SP), (PUNCT, NFP), (PRON, DT), (AUX,...","[SPACE__SP, PUNCT_NFP, PRON_DT, AUX_VBZ, PRON_...","{'SPACE__SP': 0.91, 'PUNCT_NFP': 0.3, 'PRON_DT..."
19152,sadeyes4eva,"\n\n\nHi, new here, \n Just started seein...",153,https://www.takethislife.com/depression/how-do...,2006-06-21,999,219,92.603370,"[(SPACE, _SP), (PROPN, NNP), (PUNCT, ,), (ADJ,...","[SPACE__SP, PROPN_NNP, PUNCT_,, ADJ_JJ, ADV_RB...","{'SPACE__SP': 7.23, 'PROPN_NNP': 0.8, 'PUNCT_,..."
19153,,\n\n\nanother day. my sleep is so messed up....,13,https://www.takethislife.com/depression/every-...,2006-06-23,511,114,86.509308,"[(SPACE, _SP), (DET, DT), (NOUN, NN), (PUNCT, ...","[SPACE__SP, DET_DT, NOUN_NN, PUNCT_., SPACE__S...","{'SPACE__SP': 9.7, 'DET_DT': 4.48, 'NOUN_NN': ..."


In [110]:
# Expand the per-post {tag: percentage} dicts into one column per tag; a tag that
# does not occur in a post comes out as NaN. The frame is built with a single
# constructor call instead of `.apply(pd.Series)`, which creates one Series per
# row and is very slow on a frame of this size.
tag_ratio_columns = pd.DataFrame(depression['tag_ratios'].tolist(), index=depression.index)
depression = pd.concat([depression.drop(['tag_ratios'], axis=1), tag_ratio_columns], axis=1)

In [118]:
# Write the feature table to disk without the row index (index=None is falsy).
# NOTE(review): pos_tags / joined_tags hold Python lists, which presumably end up
# as their string form in the CSV — confirm before reading this file back.
depression.to_csv('depression_features_pos.csv',index=None)

In [119]:
# Preview the frame after the tag ratios were expanded into separate columns.
depression.head()

Unnamed: 0,username,text,posts,url,date_posted,post_length,word_count,readability_score,pos_tags,joined_tags,...,AUX_ADD,AUX_:,AUX_-LRB-,VERB_NNS,AUX_HYPH,AUX_PRP,ADV_CC,AUX_.,AUX_XX,AUX_WRB
0,ayoubalharchal,\n\n\nDo you ever feel like everyone is watchi...,8,https://www.takethislife.com/depression/do-you...,2022-07-19,2324,400,82.616667,"[(SPACE, _SP), (AUX, VBP), (PRON, PRP), (ADV, ...","[SPACE__SP, AUX_VBP, PRON_PRP, ADV_RB, VERB_VB...",...,,,,,,,,,,
1,ayoubalharchal,\n\n\nDo you compare yourself to other people ...,8,https://www.takethislife.com/depression/do-you...,2022-07-14,2767,427,64.490101,"[(SPACE, _SP), (AUX, VBP), (PRON, PRP), (VERB,...","[SPACE__SP, AUX_VBP, PRON_PRP, VERB_VB, PRON_P...",...,,,,,,,,,,
2,ayoubalharchal,\n\n\nDo you find yourself apologizing all the...,8,https://www.takethislife.com/depression/do-you...,2022-07-14,2302,379,77.653752,"[(SPACE, _SP), (AUX, VBP), (PRON, PRP), (VERB,...","[SPACE__SP, AUX_VBP, PRON_PRP, VERB_VB, PRON_P...",...,,,,,,,,,,
3,ayoubalharchal,"\n\n\nIF SO, READING & UNDERSTANDING THIS WILL...",8,https://www.takethislife.com/depression/do-you...,2022-07-12,2277,377,74.071391,"[(SPACE, _SP), (NOUN, NN), (PROPN, NNP), (PUNC...","[SPACE__SP, NOUN_NN, PROPN_NNP, PUNCT_,, PROPN...",...,,,,,,,,,,
4,ayoubalharchal,\n\n\nAren’t you a little bit tired of people ...,8,https://www.takethislife.com/depression/aren-t...,2022-07-12,2721,467,79.586657,"[(SPACE, _SP), (AUX, VBP), (PART, RB), (PRON, ...","[SPACE__SP, AUX_VBP, PART_RB, PRON_PRP, DET_DT...",...,,,,,,,,,,


In [123]:
# List every column name, including the per-tag ratio columns.
depression.columns

Index(['username', 'text', 'posts', 'url', 'date_posted', 'post_length',
       'word_count', 'readability_score', 'pos_tags', 'joined_tags',
       'SPACE__SP', 'AUX_VBP', 'PRON_PRP', 'ADV_RB', 'VERB_VB', 'SCONJ_IN',
       'PRON_NN', 'AUX_VBZ', 'VERB_VBG', 'PUNCT_.', 'CCONJ_CC', 'VERB_VBP',
       'PART_TO', 'PRON_WP', 'AUX_MD', 'AUX_VB', 'ADP_IN', 'PRON_PRP$',
       'NOUN_NN', 'ADJ_JJ', 'PROPN_NNP', 'NOUN_NNS', 'ADP_RP', 'PUNCT_,',
       'VERB_VBN', 'PUNCT_``', 'NUM_CD', 'PART_RB', 'DET_DT', 'SCONJ_WRB',
       'VERB_VBZ', 'AUX_VBD', 'PRON_EX', 'VERB_VBD', 'PUNCT_:', 'PUNCT_''',
       'PUNCT_HYPH', 'ADJ_JJS', 'PRON_WDT', 'PRON_DT', 'PUNCT_-LRB-',
       'PUNCT_-RRB-', 'PART_POS', 'X_LS', 'ADJ_JJR', 'ADV_RBR', 'PUNCT_NFP',
       'INTJ_UH', 'DET_PDT', 'AUX_VBG', 'ADV_RBS', 'X_XX', 'X_ADD', 'SYM_SYM',
       'AUX_VBN', 'DET_WDT', 'AUX_RB', 'X_FW', 'SYM_$', 'DET_WP$',
       'PROPN_NNPS', 'AUX_UH', 'AUX_IN', 'AUX_POS', 'ADJ_AFX', 'AUX_NN',
       'AUX_NNP', 'VERB_NNP', 'AUX_NFP', 'A