In [39]:
from datetime import datetime
import pandas as pd
import praw
import json
import pyarrow as pa
import pyarrow.parquet as pq

from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv('.env')


def process_reddit_data(submission_id="isfwqm", data_folder='./notebooks/data/'):
    """Download all comments of one Reddit submission into per-day Parquet files.

    The full comment tree of ``submission_id`` is expanded and flattened, one
    row is built per comment (``id``, ``timestamp``, ``author``, ``body``,
    ``title``), a ``dt`` column (``YYYY-MM-DD``, derived from ``timestamp``)
    is added, and one file ``<submission_id>_<dt>.parquet`` is written to
    ``data_folder`` for every distinct day.

    Parameters
    ----------
    submission_id : str, default "isfwqm"
        ID of the submission whose comments are downloaded.
    data_folder : str, default './notebooks/data/'
        Directory that receives the Parquet files; created if missing.

    Returns
    -------
    None
        The function is used for its side effects (network calls, files).
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    # Credentials are read from the environment (populated by load_dotenv).
    # NOTE(review): user_agent is read from REDDIT_GRANT_TYPE, which looks like
    # it may be the wrong variable for a user-agent string - confirm against .env.
    reddit = praw.Reddit(
        client_id=os.getenv("CLIENT_ID"),
        client_secret=os.getenv("SECRET_KEY"),
        user_agent=os.getenv("REDDIT_GRANT_TYPE"),
        username=os.getenv("REDDIT_USERNAME"),
        password=os.getenv("REDDIT_PASSWORD")
    )

    submission = reddit.submission(submission_id)
    # limit=None resolves every "load more comments" placeholder in the tree.
    submission.comments.replace_more(limit=None)
    submission_title = submission.title

    # Collect plain records and build the frame once; enlarging a DataFrame
    # cell by cell with .loc re-allocates on every new row.
    records = [
        {
            'id': comment.id,
            # Naive UTC datetime, same value the deprecated
            # datetime.utcfromtimestamp() produced.
            'timestamp': datetime.fromtimestamp(
                comment.created_utc, tz=timezone.utc
            ).replace(tzinfo=None),
            # str() so a missing author is stored as text rather than an object.
            'author': str(comment.author),
            'body': comment.body,
            'title': submission_title,
        }
        for comment in submission.comments.list()
    ]

    # dtype=object keeps the columns as plain Python objects, as the
    # cell-by-cell construction did, so the Arrow conversion is unchanged.
    dfComment = pd.DataFrame(
        records,
        columns=['id', 'timestamp', 'author', 'body', 'title'],
        dtype=object,
    )
    dfComment['dt'] = pd.to_datetime(dfComment['timestamp']).dt.strftime('%Y-%m-%d')

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(data_folder, exist_ok=True)

    # One file per calendar day; groupby iterates the days in sorted order.
    for dt, dfComment_daily in dfComment.groupby('dt'):
        file_name = f'{submission_id}_{dt}.parquet'
        arrow_table = pa.Table.from_pandas(dfComment_daily)
        pq.write_table(arrow_table, os.path.join(data_folder, file_name))

    return None

# In a notebook kernel __name__ is "__main__", so this guard passes and the
# download runs every time this cell is executed.
if __name__ == "__main__":
    process_reddit_data()

In [27]:
# Load one of the per-day Parquet files back to inspect what was written.
# NOTE(review): this reads from 'data/' relative to the kernel's working
# directory, while the writer cell above targets './notebooks/data/' - confirm
# the working directory (or share one path constant) before relying on this.
df1 = pd.read_parquet('data/isfwqm_2020-09-14.parquet')
df1

Unnamed: 0,id,timestamp,author,body,title,dt
0,g57nf4z,2020-09-14 08:01:59,mechacorgi19,"""beats"" to study to indeed",[Meme Monday] Singaporean Lo-Fi Girl,2020-09-14
1,g57l9l9,2020-09-14 07:22:18,Koalakaisg,Ahh yes nothing beats studying to the ASMR sou...,[Meme Monday] Singaporean Lo-Fi Girl,2020-09-14
2,g57lst3,2020-09-14 07:31:50,pokoook,What's going on?\n\n⚪ It's annoying or not in...,[Meme Monday] Singaporean Lo-Fi Girl,2020-09-14
3,g57k79n,2020-09-14 07:03:53,SpermWhale,i like the wedding Hello Kitty on top of the s...,[Meme Monday] Singaporean Lo-Fi Girl,2020-09-14
4,g57lg72,2020-09-14 07:25:32,,[deleted],[Meme Monday] Singaporean Lo-Fi Girl,2020-09-14
...,...,...,...,...,...,...
305,g582rf4,2020-09-14 12:30:26,,[deleted],[Meme Monday] Singaporean Lo-Fi Girl,2020-09-14
306,g581pg0,2020-09-14 12:16:54,TryinaD,I’m talking about people with adhd who rly nee...,[Meme Monday] Singaporean Lo-Fi Girl,2020-09-14
307,g59wcbc,2020-09-14 19:47:33,various_beans,I've lived in korea. The parents were signing ...,[Meme Monday] Singaporean Lo-Fi Girl,2020-09-14
308,g58zuzp,2020-09-14 16:45:05,HazyNightz,Gahment: so it's only that set of kids ah... T...,[Meme Monday] Singaporean Lo-Fi Girl,2020-09-14


In [28]:
def retrieve_list_of_submission_id(subreddit="Singapore", limit=1000,
                                   output_path='1000_submission_ids.csv'):
    """Collect the IDs of the newest submissions of a subreddit and save them.

    Parameters
    ----------
    subreddit : str, default "Singapore"
        Name of the subreddit whose ``new`` listing is read.
    limit : int or None, default 1000
        Passed through to the listing's ``limit`` argument.
    output_path : str, default '1000_submission_ids.csv'
        CSV file that receives the IDs (single unnamed column plus the
        default integer index, as written by ``DataFrame.to_csv``).

    Returns
    -------
    list of str
        Submission IDs in the order the listing yielded them.
    """
    # NOTE(review): user_agent is read from REDDIT_GRANT_TYPE, which looks like
    # it may be the wrong variable for a user-agent string - confirm against .env.
    reddit = praw.Reddit(
        client_id=os.getenv("CLIENT_ID"),
        client_secret=os.getenv("SECRET_KEY"),
        user_agent=os.getenv("REDDIT_GRANT_TYPE"),
        username=os.getenv("REDDIT_USERNAME"),
        password=os.getenv("REDDIT_PASSWORD")
    )

    submissions = []

    try:
        for submission in reddit.subreddit(subreddit).new(limit=limit):
            submissions.append(submission.id)
    finally:
        # Write the CSV once instead of rewriting the whole file on every
        # iteration. Doing it in `finally` still leaves the IDs gathered so
        # far on disk if the listing fails part-way; nothing is written when
        # no ID was collected.
        if submissions:
            pd.DataFrame(submissions).to_csv(output_path)

    return submissions

# Bare call as the cell's last expression: the returned list of IDs is shown
# as the cell output, and the CSV is written as a side effect.
retrieve_list_of_submission_id()

['1b4cvlw',
 '1b4cuvd',
 '1b4br3c',
 '1b4biez',
 '1b4b960',
 '1b46x6s',
 '1b46kfs',
 '1b45mta',
 '1b3zx0o',
 '1b3yt5c',
 '1b3y28h',
 '1b3w78v',
 '1b3ueff',
 '1b3tujp',
 '1b3stu9',
 '1b3soe3',
 '1b3s89b',
 '1b3s0ab',
 '1b3qr5p',
 '1b3qows',
 '1b3qcy0',
 '1b3pyq2',
 '1b3pw27',
 '1b3pvlz',
 '1b3pu5p',
 '1b3pt0y',
 '1b3nc03',
 '1b3mr5u',
 '1b3mech',
 '1b3kgc5',
 '1b3k2bj',
 '1b3jylx',
 '1b3jd2m',
 '1b3jahx',
 '1b3ina3',
 '1b3hqk5',
 '1b3hpmy',
 '1b3h902',
 '1b3gqhc',
 '1b3eu8a',
 '1b3cr1c',
 '1b35ec6',
 '1b347zr',
 '1b33a5y',
 '1b30qao',
 '1b2zxyz',
 '1b2y8rj',
 '1b2xllq',
 '1b2xksu',
 '1b2xfwm',
 '1b2xdf5',
 '1b2x1h0',
 '1b2wqe2',
 '1b2wddp',
 '1b2w6eh',
 '1b2vijf',
 '1b2uy1l',
 '1b2uovy',
 '1b2ub8c',
 '1b2s9ud',
 '1b2rnfd',
 '1b2rh86',
 '1b2q5v2',
 '1b2pdfc',
 '1b2opz2',
 '1b2o9me',
 '1b2m4tj',
 '1b2lfqt',
 '1b2itsr',
 '1b2igl5',
 '1b28j7y',
 '1b27mw5',
 '1b27hnb',
 '1b25nvp',
 '1b25lnl',
 '1b25987',
 '1b228j3',
 '1b24ro1',
 '1b24cup',
 '1b24bai',
 '1b245vo',
 '1b23koj',
 '1b22vjd',
 '1b

In [54]:
def get_submission_ids(file, limit=5):
    """Read submission IDs and titles from a CSV file.

    Parameters
    ----------
    file : str or path-like
        CSV file with ``submission_id`` and ``submission_title`` columns.
    limit : int or None, default 5
        Number of leading rows to return; ``None`` returns every row.

    Returns
    -------
    tuple of (list, list)
        ``(submission_ids, submission_titles)``, both truncated to ``limit``.

    Raises
    ------
    KeyError
        If either expected column is missing from the file.
    """
    # Read the ID column as text so an ID made only of digits (or with leading
    # zeros) is not silently turned into a number.
    df = pd.read_csv(file, dtype={'submission_id': str})
    submission_ids = df['submission_id'].tolist()
    submission_titles = df['submission_title'].tolist()

    # Slicing with None keeps the whole list.
    return submission_ids[:limit], submission_titles[:limit]
    

def process_reddit_data(file):
    """Download the comments of several submissions into per-day Parquet files.

    The submissions to process (IDs and titles) come from
    ``get_submission_ids(file)``. For each one, the full comment tree is
    expanded and flattened, one row is built per comment (``id``,
    ``timestamp``, ``author``, ``body``, ``title``), a ``dt`` column
    (``YYYY-MM-DD``) is derived from ``timestamp``, and one file
    ``<submission_id>_<dt>.parquet`` is written per distinct day under
    ``./proj_radical_sparks/data/<submission_id>/``.

    Each submission gets its own DataFrame, so a submission's files contain
    only that submission's comments. (Previously a single frame accumulated
    across the loop, so every later submission's files also held the comments
    of all earlier ones.) A submission with no comments produces an empty
    folder and no files.

    Note: this definition replaces the zero-argument ``process_reddit_data``
    defined in an earlier cell once this cell has run.

    Parameters
    ----------
    file : str or path-like
        CSV file handed to ``get_submission_ids``.

    Returns
    -------
    None
        The function is used for its side effects (network calls, files).
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    # NOTE(review): user_agent is read from REDDIT_GRANT_TYPE, which looks like
    # it may be the wrong variable for a user-agent string - confirm against .env.
    reddit = praw.Reddit(
        client_id=os.getenv("CLIENT_ID"),
        client_secret=os.getenv("SECRET_KEY"),
        user_agent=os.getenv("REDDIT_GRANT_TYPE"),
        username=os.getenv("REDDIT_USERNAME"),
        password=os.getenv("REDDIT_PASSWORD")
    )

    comment_columns = ['id', 'timestamp', 'author', 'body', 'title']
    submission_ids_list, submission_titles = get_submission_ids(file)

    for submission_id, submission_title in zip(submission_ids_list, submission_titles):
        submission = reddit.submission(submission_id)
        # limit=None resolves every "load more comments" placeholder.
        submission.comments.replace_more(limit=None)

        # Fresh records per submission; building the frame once also avoids
        # enlarging a DataFrame cell by cell with .loc.
        records = [
            {
                'id': comment.id,
                # Naive UTC datetime, same value the deprecated
                # datetime.utcfromtimestamp() produced.
                'timestamp': datetime.fromtimestamp(
                    comment.created_utc, tz=timezone.utc
                ).replace(tzinfo=None),
                # str() so a missing author is stored as text.
                'author': str(comment.author),
                'body': comment.body,
                # Title comes from the CSV, not from the fetched submission.
                'title': submission_title,
            }
            for comment in submission.comments.list()
        ]

        # dtype=object keeps the columns as plain Python objects, as the
        # cell-by-cell construction did, so the Arrow conversion is unchanged.
        dfComment = pd.DataFrame(records, columns=comment_columns, dtype=object)
        dfComment['dt'] = pd.to_datetime(dfComment['timestamp']).dt.strftime('%Y-%m-%d')

        data_folder = f'./proj_radical_sparks/data/{submission_id}/'
        print(data_folder)

        # exist_ok avoids the check-then-create race of exists + makedirs.
        os.makedirs(data_folder, exist_ok=True)

        # One file per calendar day; groupby iterates the days in sorted order.
        for dt, dfComment_daily in dfComment.groupby('dt'):
            file_name = f'{submission_id}_{dt}.parquet'
            arrow_table = pa.Table.from_pandas(dfComment_daily)
            file_path = os.path.join(data_folder, file_name)
            pq.write_table(arrow_table, file_path)

    return None

In [60]:
# NOTE(review): absolute, machine-specific path - this cell only runs on the
# author's machine. Consider a path relative to the project (or a configurable
# data directory) so the notebook is reproducible elsewhere.
process_reddit_data('/Users/johnnytay/Library/CloudStorage/OneDrive-Personal/My NUS Mtech EBAC course/Semester 3/Practice Module/bead_pyspark/proj_radical_sparks/new_100_submission.csv')

./proj_radical_sparks/data/1b4ff0t/
./proj_radical_sparks/data/1b4eszt/
./proj_radical_sparks/data/1b4cvlw/
./proj_radical_sparks/data/1b4cuvd/
./proj_radical_sparks/data/1b4br3c/
