## Data Cleaning and Corpus Creation

In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
import re
from nltk.tokenize import RegexpTokenizer, word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
pd.set_option('display.max_columns', None)

In [3]:
basetext = pd.read_csv('../data/basetext.csv').drop(columns='Unnamed: 0')
depression = pd.read_csv('../data/depression_30k.csv').drop(columns='Unnamed: 0')
anxiety = pd.read_csv('../data/anxiety_20k.csv').drop(columns='Unnamed: 0')

In [4]:
corpus = pd.concat([basetext,depression,anxiety])

In [5]:
corpus['subreddit'].unique()

array(['CasualConversation', 'happy', 'depression', 'Anxiety'],
      dtype=object)

In [6]:
corpus['class'] = corpus['subreddit'].map({'CasualConversation':0, 'happy':0, 'depression':1, 'Anxiety':2})

In [7]:
corpus.drop(columns=['author','created_utc'],inplace=True)
corpus = corpus.replace('[removed]', ' ').replace('[deleted]',' ')
corpus = corpus.fillna(' ')

In [8]:
corpus['full_text'] = corpus['title'] + " " + corpus['selftext']

In [9]:
corpus['full_text'] = corpus['full_text'].map(lambda x:x.lower())
corpus['full_text'] = corpus['full_text'].map(lambda x:x.replace("'ve",' have'))

In [10]:
corpus = corpus[['full_text','subreddit','class']]

In [11]:
my_tokenizer = RegexpTokenizer("[a-zA-Z0-9\'?]+")

In [12]:
my_stop = ['\n&gt;' ,'https', 'com', 'www', 'don','didn']
stop_words = text.ENGLISH_STOP_WORDS.union(my_stop)

cvect = CountVectorizer(stop_words=stop_words,max_features=100)

In [13]:
dtm_corp = cvect.fit_transform(corpus['full_text'])
dtm_corp = pd.DataFrame(dtm_corp.toarray(),columns=cvect.get_feature_names())

corp_top_60 = pd.DataFrame(dtm_corp.sum().nlargest(60))



### Splitting and Looking at Corpus by Class

In [14]:
dep = corpus[corpus['subreddit']=='depression']

In [15]:
dtm_dep = cvect.fit_transform(dep['full_text'])
dtm_dep = pd.DataFrame(dtm_dep.toarray(),columns=cvect.get_feature_names())

dep_top_60 = pd.DataFrame(dtm_dep.sum().nlargest(60))

In [16]:
anx = corpus[corpus['subreddit']=='Anxiety']

In [17]:
dtm_anx = cvect.fit_transform(anx['full_text'])
dtm_anx = pd.DataFrame(dtm_anx.toarray(),columns=cvect.get_feature_names())

anx_top_60 = pd.DataFrame(dtm_anx.sum().nlargest(60))

In [18]:
base = corpus[(corpus['subreddit']!='depression') & (corpus['subreddit']!='Anxiety')]

In [19]:
dtm_base = cvect.fit_transform(base['full_text'])
dtm_base = pd.DataFrame(dtm_base.toarray(),columns=cvect.get_feature_names())

base_top_60 = pd.DataFrame(dtm_base.sum().nlargest(60))

In [20]:
dep_top_60.T

Unnamed: 0,just,like,feel,want,know,life,time,people,really,depression,think,friends,day,going,things,help,years,ve,anymore,work,good,feeling,better,make,hate,depressed,im,way,got,school,need,family,talk,bad,happy,year,ll,die,fucking,love,doing,right,lost,job,person,getting,try,say,hard,live,started,kill,point,felt,thing,did,shit,care,parents,end
0,44974,33827,30456,23418,21267,20115,14767,14409,14381,12097,10366,10224,9601,9599,9510,9386,8579,8328,7480,7426,7280,7207,7174,7145,6623,6589,6559,6528,6474,6395,6153,6132,5989,5936,5890,5719,5689,5586,5551,5476,5192,5176,5137,5115,5011,4963,4958,4929,4874,4870,4813,4771,4638,4633,4621,4603,4572,4538,4534,4520


In [21]:
anx_top_60.T

Unnamed: 0,anxiety,just,like,feel,know,really,time,want,people,going,ve,think,help,day,anxious,feeling,life,things,panic,work,bad,having,got,need,started,years,make,getting,way,attack,job,does,good,lot,felt,friends,ll,scared,right,year,days,sleep,did,school,doing,thought,thoughts,stop,new,thing,better,heart,makes,thinking,im,went,attacks,talk,said,long
0,28971,23557,23236,18313,12890,11213,10439,8236,7771,7705,7380,7073,7021,6671,6465,6295,6214,6201,6105,6094,5622,5213,4741,4628,4580,4376,4351,4124,3955,3945,3909,3901,3847,3815,3717,3683,3628,3554,3519,3460,3444,3414,3325,3321,3281,3264,3260,3236,3128,3128,3122,3109,3105,3030,3021,3001,2986,2967,2965,2952


In [22]:
base_top_60.T

Unnamed: 0,just,like,time,happy,really,know,feel,people,life,day,got,today,years,good,love,want,year,think,ve,friends,going,work,things,make,finally,job,new,did,little,lot,school,way,friend,best,started,right,wanted,ago,said,months,home,long,feeling,thought,say,better,getting,went,doing,felt,family,person,help,ll,having,reddit,thing,days,makes,talk
0,14769,10498,7631,6849,6763,6124,6097,5883,5747,5652,5556,4872,4798,4323,4195,4192,3928,3742,3582,3566,3494,3371,3350,3320,3148,2946,2932,2544,2534,2531,2520,2510,2488,2459,2420,2400,2344,2310,2259,2247,2182,2181,2084,2076,2072,2069,2058,2006,2002,1998,1943,1930,1914,1903,1893,1887,1868,1843,1836,1775


In [23]:
corpus

Unnamed: 0,full_text,subreddit,class
0,why is it that the person who beats themself u...,CasualConversation,0
1,dealing with sadness hi i’m will and i’ve been...,CasualConversation,0
2,"my life has never been better, and i feel as t...",CasualConversation,0
3,it‘s my cake day!!!! :o i love reddit and will...,CasualConversation,0
4,can i have weed dealer i colorado about 15 min...,CasualConversation,0
...,...,...,...
19995,eye discomfort and heaviness when particularly...,Anxiety,2
19996,"cbd gummies for anxiety hi, i recently bought ...",Anxiety,2
19997,dae have to open their eyes multiple times whi...,Anxiety,2
19998,"pandemic ruined my life, my work and dreams co...",Anxiety,2


In [24]:
analyzer = SentimentIntensityAnalyzer()

In [25]:
def sentiment(row):
    analyzer = SentimentIntensityAnalyzer()
    res = analyzer.polarity_scores(row)
    return pd.Series([res['neg'], res['neu'], res['pos'], res['compound']])

In [26]:
corpus[['neg','neu','pos','comp']] = corpus['full_text'].apply(sentiment)

In [27]:
corpus

Unnamed: 0,full_text,subreddit,class,neg,neu,pos,comp
0,why is it that the person who beats themself u...,CasualConversation,0,0.087,0.830,0.083,-0.0258
1,dealing with sadness hi i’m will and i’ve been...,CasualConversation,0,0.164,0.726,0.110,-0.8376
2,"my life has never been better, and i feel as t...",CasualConversation,0,0.033,0.863,0.104,0.9637
3,it‘s my cake day!!!! :o i love reddit and will...,CasualConversation,0,0.032,0.617,0.351,0.9429
4,can i have weed dealer i colorado about 15 min...,CasualConversation,0,0.000,1.000,0.000,0.0000
...,...,...,...,...,...,...,...
19995,eye discomfort and heaviness when particularly...,Anxiety,2,0.177,0.773,0.050,-0.7116
19996,"cbd gummies for anxiety hi, i recently bought ...",Anxiety,2,0.179,0.722,0.100,-0.3182
19997,dae have to open their eyes multiple times whi...,Anxiety,2,0.093,0.840,0.067,-0.6848
19998,"pandemic ruined my life, my work and dreams co...",Anxiety,2,0.047,0.793,0.159,0.9421


In [34]:
corpus.groupby('subreddit')['neg'].agg(np.mean)

subreddit
Anxiety               0.151344
CasualConversation    0.079332
depression            0.168420
happy                 0.044273
Name: neg, dtype: float64

In [28]:
corpus.to_csv('../data/corpus.csv')