In [166]:
import numpy as np
import pandas as pd
import nltk
for dependency in ("brown", "names", "wordnet", "averaged_perceptron_tagger", "universal_tagset", "stopwords"):
    nltk.download(dependency)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
 # https://github.com/EFord36/normalise
import normalise
import contractions
import re
import string
import random

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\yagne\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\yagne\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yagne\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\yagne\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\yagne\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yagne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to

In [167]:
data = pd.read_csv("data/all_dep_data.csv")

In [168]:
data.sample(n=5)

Unnamed: 0,author,year-month,is_self,subreddit,subreddit_subscribers,title,selftext,post_length,depressed
13025,ryfi1022,2020-05,True,college,483493,Advice on not feeling bad about your college e...,"I am a graduating senior, so my time will be o...",215,1
32015,KonaBoda,2020-08,True,outwardgame,24806,Is it worth investing in magic as a pure fight...,"Wanna do Speedster, Hunter and Monk, pure mart...",77,0
6141,WormiestBurrito,2019-08,True,lfg,77130,[Offline][D&amp;D 5e][Sacramento] Experienced ...,I am trying to start up a new 5e campaign. The...,64,1
30314,spool_threader,2020-05,True,LifeProTips,19461327,LPT: If it makes you sad when animals die in m...,"It also covers a lot of other common triggers,...",28,0
16678,kimbooboo,2020-11,True,polyamory,205347,Just broke up,Me and my husband’s girlfriend just broke up w...,80,1


In [180]:
args = {
    "lowercase": True,
    "stopwords": False,
    "normaliser": False,
    "remove_break_lines": True,
    "expand_contractions": True,
    "remove_punctuation": True,
    "remove_redditchars": True,
    "remove_urls": True,
    "lemmatize": False,
    "stem": False
}
translate_table = dict((ord(char), None) for char in string.punctuation + '‘’')   
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


def preprocess(text, args=args, out=""):
    clean = text
    
    # Lowercase
    if args["lowercase"]:
        clean = clean.lower()

    # Remove URLs
    if args["remove_urls"]:
        clean = re.sub(r'https?:\/\/\S+', '', clean)

    clean = clean.split()

    # Contraction Expansion
    if args["expand_contractions"]:
        clean = [contractions.fix(w) for w in clean]

    # Remove reddit specific characters
    if args["remove_redditchars"]:
        clean = [re.sub(r'\/r\/|\/u\/',"",w) for w in clean]

    # Remove punctuation
    if args["remove_punctuation"]:
        clean = [w.replace("-"," ") for w in clean]
        clean = [w.translate(translate_table) for w in clean]

    # Stopwords
    if args["stopwords"]:
        clean = [w for w in clean if w not in stopwords.words("english")]

    # Use normaliser package
    if args["normaliser"]:
        clean = normalise.normalise(clean, variety="AmE", verbose=False)

    # Lemmatizer
    if args["lemmatize"]:
        clean = [lemmatizer.lemmatize(w) for w in clean]

    if args["stem"]:
        clean = [stemmer.stem(w) for w in clean]

    # Final clean to remove any empty strings
    clean = " ".join(clean).split()
    clean = [w for w in clean if w != ""]

    # Output
    if out=="tokens":
        return clean
    else:
        clean = " ".join(clean)
        return clean




In [178]:
# i = random.randint(0,10000)
i = 138
text = data.iloc[i,6]
text

'**Disclaimer**: I\'m not talking about people without a choice (i.e. many indigenous people) here. This is in reference to most people in the developed world, who have access to vegan options.\n\n1. Nonhuman animals are sentient. They feel pain, have mental lives, and possess an interest in living. There is growing evidence about animal sentience and cognition.\n2. We don\'t have any nutritional need for animal products. This is backed up by numerous studies. In fact, there\'s evidence veganism is even good for your health. [https://www.ncbi.nlm.nih.gov/pubmed/19562864](https://www.ncbi.nlm.nih.gov/pubmed/19562864)\n\nThe reason people eat animal products is taste or convenience. In other words, we have sentient beings killed for our brief taste pleasure. This is morally unjustifiable. If you have a choice between A) a cow flesh burger and B) a veggie burger, and you choose A), you\'re choosing to have them killed for 5 minutes of pleasure, not necessity.\n\nCommon objections include 

In [179]:
preprocess(text,args, out="")

'disclaimer i am not talking about people without a choice ie many indigenous people here this is in reference to most people in the developed world who have access to vegan options 1 nonhuman animals are sentient they feel pain have mental lives and possess an interest in living there is growing evidence about animal sentience and cognition 2 we do not have any nutritional need for animal products this is backed up by numerous studies in fact there is evidence veganism is even good for your health the reason people eat animal products is taste or convenience in other words we have sentient beings killed for our brief taste pleasure this is morally unjustifiable if you have a choice between a a cow flesh burger and b a veggie burger and you choose a you are choosing to have them killed for 5 minutes of pleasure not necessity common objections include appeals to nature it is natural it is the life cycle it is the way things are this is a common fallacy in ethics people also like to appe

In [181]:
data["full_text"] = data.apply(lambda x: x["title"] + " " + x["selftext"],axis=1)

In [182]:
args = {
    "lowercase": True,
    "stopwords": False,
    "normaliser": False,
    "remove_break_lines": True,
    "expand_contractions": True,
    "remove_punctuation": True,
    "remove_redditchars": True,
    "remove_urls": True,
    "lemmatize": False,
    "stem": False
}
data["clean_text"] = list(map(preprocess, data["full_text"]))

In [183]:
data

Unnamed: 0,author,year-month,is_self,subreddit,subreddit_subscribers,title,selftext,post_length,depressed,full_text,clean_text
0,--liveitup,2019-01,True,hacking,647687,Hulu account hacked. Genuinely interested in h...,"Secure wifi, only use Hulu through a Roku stic...",183,1,Hulu account hacked. Genuinely interested in h...,hulu account hacked genuinely interested in ho...
1,-RL-Lokei,2019-01,True,starcitizen,148021,Yela jumptown/ druglab location,"Hi all,\n\nBefore you refer me onto videos of ...",64,1,"Yela jumptown/ druglab location Hi all,\n\nBef...",yela jumptown druglab location hi all before y...
2,-aegon-,2019-01,True,Judaism,29423,Advice for a poor convert-to-be?,How can I tell my religious Christian family t...,44,1,Advice for a poor convert-to-be? How can I tel...,advice for a poor convert to be how can i tell...
3,0haltja16,2019-01,True,confession,1149851,I cheated on a 5 year old’s AR tests,In grade school the older kids were forced to ...,115,1,I cheated on a 5 year old’s AR tests In grade ...,i cheated on a 5 year olds ar tests in grade s...
4,101kbye,2019-01,True,AskWomenOver30,31874,Has anyone taken a sabbatical and what was rea...,I’m not burnt out at work but I want a 3 month...,174,1,Has anyone taken a sabbatical and what was rea...,has anyone taken a sabbatical and what was rea...
...,...,...,...,...,...,...,...,...,...,...,...
34790,teymourbeydoun,2020-12,True,Inception,7370,Am I missing Behind-the-Scenes footage on Blu-...,"Hello, I’ve just bought the 4K UHD Christopher...",69,0,Am I missing Behind-the-Scenes footage on Blu-...,am i missing behind the scenes footage on blu ...
34791,HoopsFella,2020-12,True,NBAtradeideas,936,Surprise Harden Destinations,Curious where people think Harden could end up...,123,0,Surprise Harden Destinations Curious where peo...,surprise harden destinations curious where peo...
34792,friedshrimpt,2020-12,True,Pratt,490,Is Pratt requiring ACT scores?,My score is very low (23). My gpa is a weighte...,31,0,Is Pratt requiring ACT scores? My score is ver...,is pratt requiring act scores my score is very...
34793,zorbsthegreat,2020-12,True,MRKH,288,Tracking Hormonal Cycles and mood shifts with ...,"I have been diagnoses since 17 and Im 23 now, ...",194,0,Tracking Hormonal Cycles and mood shifts with ...,tracking hormonal cycles and mood shifts with ...
