# Connect to the Reddit API with PRAW

In [None]:
import praw

# Settings for the PRAW client. Every value is an empty placeholder in this
# file; presumably they are filled in before the cell is run.
_credentials = {
    "client_id": "",
    "client_secret": "",
    "user_agent": "",
    "username": "",
    "password": "",
}
reddit = praw.Reddit(**_credentials)

# Show whether the client ended up in read-only mode.
print(reddit.read_only)



# Use the Pushshift API to pull the author list (this step can be skipped, since the saved list is provided)

In [None]:
from pushshift_py import PushshiftAPI
import datetime as dt

api = PushshiftAPI()

# Search window as integer epoch seconds. The datetimes are naive, so
# .timestamp() interprets them in the local time zone of the machine.
start_time, end_time = (
    int(dt.datetime(year, month, day).timestamp())
    for year, month, day in ((2019, 1, 1), (2020, 12, 31))
)

# Query parameters: submissions of the 'suicidewatch' subreddit inside the
# window, sorted ascending by creation time, restricted to the listed fields.
_query = {
    "after": start_time,
    "before": end_time,
    "subreddit": 'suicidewatch',
    "sort": 'asc',
    "sort_type": 'created_utc',
    "filter": ['author', 'id', 'title', 'selftext', 'subreddit'],
}

# Materialise the whole result stream into a list.
results2 = list(api.search_submissions(**_query))

In [None]:
import pandas as pd
# Tabulate the Pushshift results and drop every row that has a missing value.
d2 = pd.DataFrame(results2).dropna()

# Expand the per-row `d_` entries into a frame of their own.
df2 = pd.DataFrame(d2['d_'].tolist())

# Labels are assigned purely by position.
# NOTE(review): this presumes the expanded frame has exactly these six columns
# in this order - confirm against the Pushshift response.
_submission_columns = [
    "submission_name",
    "submission_time",
    "submission_id",
    "submission_body",
    "subreddit",
    "submission_title",
]
df2.columns = _submission_columns

# Distinct values of the author column.
Authors_unique = df2['submission_name'].unique()

# Load the author list and the 3 already-sampled lists (to be excluded)

In [None]:
import pandas as pd
# Full author pool.
author = pd.read_csv("Datasets/Authors_unique.csv")

# The three previously sampled author batches.
_sampled_paths = (
    "Datasets/Authors_0610_5000.csv",
    "Datasets/Authors_0619_5000.csv",
    "Datasets/Authors_0621_5000.csv",
)
author_train1, author_train2, author_train3 = map(pd.read_csv, _sampled_paths)

# Give the first batch the column labels of the second one before stacking
# all three batches into a single frame.
author_train1.columns = author_train2.columns
author_train = pd.concat([author_train1, author_train2, author_train3])

# Keep only the authors whose name (column "0") does not occur in the
# "Name" column of the sampled batches.
_already_sampled = author["0"].isin(author_train["Name"])
author = author[~_already_sampled]


# Define the pipeline for selecting posts and comments

In [None]:
def extract_post_and_comment(author):
    """Collect one redditor's comments, recent posts and the replies to them.

    The function is best-effort: a failure while talking to Reddit is
    reported on stdout and whatever was gathered up to that point is
    returned.

    Args:
        author: Reddit username to look up. ``None`` yields three empty
            lists without contacting Reddit.

    Returns:
        A tuple ``(post, comment_of_poster, comment_to_post)`` of row lists:

        * ``post`` rows are ``[author, subreddit, selftext, id, title,
          created_utc]`` for every submission of the user created on or
          after 2019-01-01. It is only filled when at least one of those
          submissions has a subreddit that compares equal to
          ``'SuicideWatch'``.
        * ``comment_of_poster`` rows are ``[author, subreddit, body,
          subreddit_id, link_title, created_utc]`` for every comment the
          user wrote.
        * ``comment_to_post`` rows are ``[submission id, comment author,
          body, created_utc]`` for every comment under the collected posts
          whose author's submission listing can still be requested.
    """
    post = []
    comment_of_poster = []
    comment_to_post = []

    # Nothing to look up; return before the progress prints, which
    # previously raised TypeError for None.
    if author is None:
        return post, comment_of_poster, comment_to_post

    print(f"In {author}")

    # Epoch seconds of 2019-01-01 (naive datetime, i.e. local time), computed
    # once instead of once per submission.
    cutoff = int(dt.datetime(2019, 1, 1).timestamp())

    try:
        # Fetch the full listing first so that a failing request leaves the
        # comment list empty rather than partially filled.
        cmt_ls = list(reddit.redditor(author).comments.new(limit=None))
        for cmt in cmt_ls:
            comment_of_poster.append([author, cmt.subreddit, cmt.body,
                                      cmt.subreddit_id, cmt.link_title,
                                      cmt.created_utc])
    except Exception as e:
        # Best-effort: report the problem and carry on with the posts.
        print(f"Could not collect comments of {author}: {e!r}")

    try:
        sub_ls = list(reddit.redditor(author).submissions.new(limit=None))
        recent = [sub for sub in sub_ls if sub.created_utc >= cutoff]

        # Only users with at least one recent SuicideWatch post are kept.
        if any(sub.subreddit == 'SuicideWatch' for sub in recent):
            for submission in recent:
                post.append([submission.author, submission.subreddit,
                             submission.selftext, submission.id,
                             submission.title, submission.created_utc])

                # Resolve every "load more comments" placeholder so that
                # the flattened list holds the whole comment forest.
                submission.comments.replace_more(limit=None)
                for com in submission.comments.list():
                    if com.author is None:
                        # No author object attached to the comment; the
                        # lookup below could not succeed.
                        continue
                    try:
                        # Probe that the commenter's submission listing can
                        # be requested. One item is enough for the probe, so
                        # the full history is no longer downloaded.
                        next(iter(com.author.submissions.new(limit=1)), None)
                        comment_to_post.append([submission.id, com.author,
                                                com.body, com.created_utc])
                    except Exception as e:
                        print(f"Skipping a comment under {submission.id}: {e!r}")
    except Exception as e:
        print(f"Could not collect posts of {author}: {e!r}")

    print(f"Out {author}")
    return post, comment_of_poster, comment_to_post




# Create a multiprocessing executor and save the results

In [None]:
from concurrent import futures
import timeit
import datetime as dt 
import numpy as np

# Run the extraction for every remaining author in a process pool and gather
# the three kinds of rows into one DataFrame each.
start_time = timeit.default_timer()

# Per-author frames are collected in plain lists and concatenated once at the
# end. DataFrame.append was removed in pandas 2.0 and copied the accumulated
# frame on every call.
_post_frames = []
_comment_poster_frames = []
_comment_post_frames = []

# NOTE(review): the pool size of 1000 processes is kept from the original.
# ProcessPoolExecutor rejects max_workers above 61 on Windows - confirm the
# value suits the target machine.
with futures.ProcessPoolExecutor(1000) as pool:
    for post, comment_poster, comment_post in pool.map(extract_post_and_comment, author['0']):
        # Empty row lists are skipped so they add no empty frames.
        if post:
            _post_frames.append(pd.DataFrame(np.array(post)))
        if comment_poster:
            _comment_poster_frames.append(pd.DataFrame(np.array(comment_poster)))
        if comment_post:
            _comment_post_frames.append(pd.DataFrame(np.array(comment_post)))

# pd.concat raises on an empty list, so fall back to an empty frame, which is
# what the accumulators started out as before.
posts = pd.concat(_post_frames) if _post_frames else pd.DataFrame({})
comments_poster = pd.concat(_comment_poster_frames) if _comment_poster_frames else pd.DataFrame({})
comments_post = pd.concat(_comment_post_frames) if _comment_post_frames else pd.DataFrame({})

# Wall-clock seconds spent on the whole extraction.
elapsed = timeit.default_timer() - start_time



In [None]:
# Write each collected table to its own CSV file.
for _frame, _csv_path in (
    (posts, "post_rest.csv"),
    (comments_poster, "comment_of_poster_rest.csv"),
    (comments_post, "comment_to_post_rest.csv"),
):
    _frame.to_csv(_csv_path)
