In [None]:
#Dependencies and Reddit Api Wrapper
import os

import pandas as pd
import praw
from tqdm import tqdm

In [6]:
# Create a Reddit account, then create an OAuth2 token as described at:
# https://github.com/reddit-archive/reddit/wiki/OAuth2
# Make sure to choose the "Script app:" option.
#
# The credentials are read from environment variables so that no secret is
# stored in the notebook itself. A missing variable raises KeyError naming it.
reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     password=os.environ['REDDIT_PASSWORD'],
                     user_agent=os.environ['REDDIT_USER_AGENT'],
                     username=os.environ['REDDIT_USERNAME'])

# Getting Data

## Getting d_users

In [None]:
# Get the most recent 1000 posts in r/depression and their users.
# Rows are gathered in a plain list first and turned into a DataFrame once,
# so `posts` only ever refers to the finished DataFrame.
rows = []
ml_subreddit = reddit.subreddit('depression')
for post in tqdm(ml_subreddit.new(limit=1000)):
    rows.append([post.title, post.score, post.id, post.subreddit, post.url,
                 post.num_comments, post.selftext, post.created, post.author])
posts = pd.DataFrame(rows, columns=['title', 'score', 'id', 'subreddit', 'url',
                                    'num_comments', 'body', 'created', 'user'])
# Show a short preview instead of printing the whole frame
posts.head()

In [None]:
# Get the comment karma for each of the d_users
# (this usually takes about 10 minutes to run)
d_users = posts.user
d_karma = []
for user in tqdm(d_users):
    try:
        d_karma.append(reddit.redditor(user).comment_karma)
    except Exception:
        # Best effort: any user whose karma cannot be fetched is recorded
        # with 0 karma. `Exception` (rather than a bare `except`) is caught
        # so that interrupting the kernel still stops this long loop.
        d_karma.append(0)
        

In [None]:
# Keep only the users whose comment karma is greater than 100
user_df = pd.DataFrame({'Users': d_users, 'Karma': d_karma})
has_enough_karma = user_df["Karma"] > 100
users_filtered = user_df[has_enough_karma]
users_filtered

In [None]:
#ML Dependencies
import re, string
import nltk
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup
import unicodedata
import spacy
import contractions
from contraction_list import CONTRACTION_MAP

# Download the NLTK resources this notebook asks for, one after the other
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)
def replace_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``.

    Thin wrapper kept so the rest of the notebook does not call the
    ``contractions`` package directly.
    """
    expanded = contractions.fix(text)
    return expanded


In [None]:
# Load the table of depression-related subreddits; posts made in them are
# removed when each user's submissions are collected.
remove_list = pd.read_csv(filepath_or_buffer="Depression_related.csv")

In [None]:

# Reduce the loaded table to a plain list taken from its `Subreddits` column.
# The isinstance guard keeps this cell re-runnable: once `remove_list` is
# already a list, running the cell again is a no-op instead of raising
# AttributeError on `.Subreddits`.
if isinstance(remove_list, pd.DataFrame):
    remove_list = remove_list.Subreddits.tolist()

In [None]:
# Getting all posts (that we can) for each d_user.
# Result: d_posts holds one string per user, made of that user's submissions.
d_posts = []
for user in tqdm(users_filtered["Users"]):
    entries = []
    for submission in reddit.redditor(str(user)).submissions.new(limit=1000):
        # Posts made in the depression-related subreddits are left out
        if submission.subreddit in remove_list:
            continue
        try:
            entry = " ".join([submission.title, submission.selftext])
            entries.append(replace_contractions(entry))
        except Exception:
            # Best effort: a submission that cannot be processed is skipped.
            # `Exception` is used so a kernel interrupt is not swallowed.
            continue
    # Join with a space so the last word of one post is not fused with the
    # first word of the next one.
    d_posts.append(" ".join(entries))

In [None]:
# Wrap the collected texts in a single-column DataFrame for inspection
d_frame = pd.DataFrame(data=d_posts)

In [None]:
# Display d_frame (built from d_posts) as this cell's output
d_frame

## Getting c_users (Control group)

In [None]:
# Get the most recent 1000 posts in r/AskReddit and their users.
# The limit matches the 1000 posts requested for r/depression, so both groups
# start from the same sample size.
# NOTE: this rebinds `posts`, replacing the r/depression frame built earlier.
rows = []
ml_subreddit = reddit.subreddit('AskReddit')
for post in tqdm(ml_subreddit.new(limit=1000)):
    rows.append([post.title, post.score, post.id, post.subreddit, post.url,
                 post.num_comments, post.selftext, post.created, post.author])
posts = pd.DataFrame(rows, columns=['title', 'score', 'id', 'subreddit', 'url',
                                    'num_comments', 'body', 'created', 'user'])


In [4]:
# Get the comment karma for each of the c_users
# (this usually takes about 10 minutes to run)
c_users = posts.user
c_karma = []
for user in tqdm(c_users):
    try:
        c_karma.append(reddit.redditor(user).comment_karma)
    except Exception:
        # Best effort: any user whose karma cannot be fetched is recorded
        # with 0 karma. `Exception` (rather than a bare `except`) is caught
        # so that interrupting the kernel still stops this long loop.
        c_karma.append(0)
        

NameError: name 'posts' is not defined

In [None]:
# Keep only the control users whose comment karma is greater than 100
c_user_df = pd.DataFrame({'Users': c_users, 'Karma': c_karma})
c_has_enough_karma = c_user_df["Karma"] > 100
c_users_filtered = c_user_df[c_has_enough_karma]
c_users_filtered

In [None]:
# Getting all posts (that we can) for each c_user.
# Result: c_posts holds one string per user, made of that user's submissions.
c_posts = []
# Iterate over the control users (c_users_filtered), not the r/depression
# users, so the two groups do not end up with the same text.
for user in tqdm(c_users_filtered["Users"]):
    entries = []
    for submission in reddit.redditor(str(user)).submissions.new(limit=1000):
        # Posts made in the depression-related subreddits are left out
        if submission.subreddit in remove_list:
            continue
        try:
            entry = " ".join([submission.title, submission.selftext])
            entries.append(replace_contractions(entry))
        except Exception:
            # Best effort: a submission that cannot be processed is skipped.
            # `Exception` is used so a kernel interrupt is not swallowed.
            continue
    # Join with a space so the last word of one post is not fused with the
    # first word of the next one.
    c_posts.append(" ".join(entries))

# Cleaning the posts

In [None]:
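#Text-cleaning helpers: remove non-ASCII characters and punctuation, lowercase everything, change numbers to words,
#drop stop words (a, and, the, etc.), and reduce each word to its stem or lemma (so that words like run and running become one).
#NOTE(review): the original note also listed removing "html stuff", but none of the helpers in this cell strips HTML.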
def remove_non_ascii(words):
    """Strip the non-ASCII characters out of every token in ``words``.

    Each token is NFKD-normalised, encoded to ASCII with unencodable
    characters ignored, and decoded back to ``str``. The returned list has
    the same length and order as the input; a token made only of non-ASCII
    characters becomes an empty string.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Return a new list holding the lowercase form of every token in ``words``."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Delete punctuation from each token and drop tokens that end up empty.

    Every character that is neither a word character nor whitespace is
    removed. Tokens reduced to the empty string are left out, so the result
    may be shorter than ``words``.
    """
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped_tokens if token != '']

def replace_numbers(words):
    """Spell out every all-digit token in ``words`` as text.

    Tokens for which ``str.isdigit()`` is true are converted with
    ``inflect``'s ``number_to_words``; all other tokens are passed through
    unchanged. Returns a new list of the same length and order.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list with the tokens that are not in NLTK's English stop-word
        list, in their original order.
    """
    # Fetch the stop-word list once, not once per token, and keep it in a
    # set so each membership test is a hash lookup.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Return the Lancaster stem of every token in ``words``, in order."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Return the WordNet lemma of every token in ``words``, treating each as a verb."""
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def normalize(words):
    """Run the list of tokens through the cleaning steps, in a fixed order.

    Order: remove_non_ascii, to_lowercase, remove_punctuation,
    replace_numbers, remove_stopwords. Returns the resulting list.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words


In [None]:
def _clean_post(text):
    """Tokenize one user's text, normalize and stem the tokens, re-join them.

    The helpers `normalize` and `stem_words` operate on lists of word tokens,
    so the text has to be split into words first; passing the whole text as a
    single "word" would leave stop words and numbers untouched.
    Normalization runs before stemming so stop words are matched in their
    unstemmed form.
    """
    # NOTE(review): word_tokenize relies on NLTK tokenizer data; recent NLTK
    # releases may need the 'punkt_tab' resource - confirm for the installed version.
    tokens = word_tokenize(text)
    tokens = normalize(tokens)
    tokens = stem_words(tokens)
    return " ".join(tokens)


# d_posts / c_posts are left untouched so this cell can be re-run safely.
# Texts that are empty after cleaning are dropped.
d_normalized_posts = [cleaned for cleaned in map(_clean_post, d_posts) if cleaned]
c_normalized_posts = [cleaned for cleaned in map(_clean_post, c_posts) if cleaned]

Depressed = pd.DataFrame({'Text': d_normalized_posts, 'Category': "Depressed"})
Control = pd.DataFrame({'Text': c_normalized_posts, 'Category': "Control"})

# Stack both groups, shuffle the rows (fixed seed so the order is
# reproducible across runs) and renumber the index.
Full = pd.concat([Depressed, Control])
Full = Full.sample(frac=1, random_state=42)
Full = Full.reset_index(drop=True)

In [None]:
# Persist the shuffled, labelled dataset to disk
Full.to_csv(path_or_buf="reddit_data_1.csv")

In [None]:
# Display the final combined DataFrame as this cell's output
Full