In [None]:
import pandas as pd
import praw
from praw.models import MoreComments
from dotenv import load_dotenv
import os
import configparser
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import date, timedelta, datetime
from prawcore.exceptions import RequestException
import time
import warnings
warnings.filterwarnings("ignore")

# Read the Reddit API credentials from the local .env file into the process
# environment, then keep each one in a module-level name.
env_path = '.env'
load_dotenv(dotenv_path=env_path)

env_client_id = os.environ.get("CLIENT_ID")
env_client_secret = os.environ.get("SECRET_KEY")
# NOTE(review): the user agent is read from REDDIT_GRANT_TYPE -- confirm that
# this variable really holds a user-agent string.
env_user_agent = os.environ.get("REDDIT_GRANT_TYPE")
env_username = os.environ.get("REDDIT_USERNAME")
env_password = os.environ.get("REDDIT_PASSWORD")

# Shared client used by the comment-fetching code further down the notebook.
reddit = praw.Reddit(
    client_id=env_client_id,
    client_secret=env_client_secret,
    user_agent=env_user_agent,
    username=env_username,
    password=env_password,
)

In [None]:
def retrieve_list_of_submission_id(subreddit_name_list, limit=500, reddit_client=None):
    """Collect the IDs of the newest submissions of each given subreddit.

    Args:
        subreddit_name_list: Iterable of subreddit names. A single name passed
            as a plain string is treated as a one-element list instead of
            being iterated character by character.
        limit: Maximum number of newest submissions requested per subreddit.
            Defaults to 500, the value that used to be hard-coded.
        reddit_client: Object exposing ``subreddit(name).new(limit=...)``,
            e.g. an existing ``praw.Reddit`` instance. When omitted, a new
            client is built from the CLIENT_ID, SECRET_KEY, REDDIT_GRANT_TYPE,
            REDDIT_USERNAME and REDDIT_PASSWORD environment variables.

    Returns:
        list: Submission IDs in listing order, one subreddit after another.
    """
    if isinstance(subreddit_name_list, str):
        subreddit_name_list = [subreddit_name_list]

    if reddit_client is None:
        reddit_client = praw.Reddit(
            client_id=os.getenv("CLIENT_ID"),
            client_secret=os.getenv("SECRET_KEY"),
            user_agent=os.getenv("REDDIT_GRANT_TYPE"),
            username=os.getenv("REDDIT_USERNAME"),
            password=os.getenv("REDDIT_PASSWORD")
        )

    return [
        submission.id
        for subreddit_name in subreddit_name_list
        for submission in reddit_client.subreddit(subreddit_name).new(limit=limit)
    ]

In [None]:
import pandas as pd
from datetime import datetime
import praw
import time

def fetch_comments_from_submission(submission_id, start_date, end_date,
                                   max_attempts=3, retry_wait=120):
    """Fetch all comments of one submission and keep those inside a date window.

    Uses the module-level ``reddit`` client. Rows are accumulated in a plain
    list and turned into a DataFrame once; ``DataFrame.append`` no longer
    exists in pandas 2.x, and calling it here made the broad ``except`` below
    swallow the resulting AttributeError and return an empty frame.

    Args:
        submission_id: ID of the submission whose comment tree is read.
        start_date: Inclusive lower bound compared against ``created_utc``.
        end_date: Inclusive upper bound compared against ``created_utc``.
        max_attempts: Total number of tries when the request raises
            ``RequestException`` (values below 1 are treated as 1).
        retry_wait: Seconds to sleep between two attempts.

    Returns:
        pandas.DataFrame with columns author, body, created_utc (converted to
        datetime from epoch seconds), id, submission, subreddit, subreddit_id,
        restricted to rows with start_date <= created_utc <= end_date. The
        frame is empty when nothing could be fetched.
    """
    columns = [
        'author', 'body', 'created_utc', 'id',
        'submission', 'subreddit', 'subreddit_id'
    ]
    max_attempts = max(1, max_attempts)
    records = []

    for attempt in range(1, max_attempts + 1):
        # Start every attempt from scratch so a retry cannot duplicate rows
        # gathered before the previous attempt failed.
        records = []
        try:
            # Fetch submission from Reddit
            submission = reddit.submission(id=submission_id)

            # Replace "More Comments" in the submission's comments tree
            submission.comments.replace_more(limit=None)

            # These values are the same for every comment of the submission.
            subreddit_name = submission.subreddit.display_name
            subreddit_id = submission.subreddit_id

            for comment in submission.comments.list():
                records.append({
                    'author': comment.author.name if comment.author else '[deleted]',
                    'body': comment.body,
                    'created_utc': comment.created_utc,
                    'id': comment.id,
                    'submission': submission.id,
                    'subreddit': subreddit_name,
                    'subreddit_id': subreddit_id
                })
            break

        except RequestException as e:
            if attempt == max_attempts:
                print(f"Giving up on submission {submission_id} after {max_attempts} attempts: {e}")
                break
            # Back off, then actually retry instead of dropping the submission.
            print(f"Waiting for {retry_wait} seconds due to Reddit API error: {e}")
            time.sleep(retry_wait)

        except Exception as e:
            # Best effort: report the problem and keep whatever was collected.
            print(f"An unknown error occurred: {e}")
            break

    dfComment = pd.DataFrame(records, columns=columns)
    dfComment['created_utc'] = pd.to_datetime(dfComment['created_utc'], unit='s')
    dfComment = dfComment[(dfComment['created_utc'] >= start_date) & (dfComment['created_utc'] <= end_date)]
    return dfComment

In [None]:
# Run configuration: subreddits to scan and the comment date window.
reddit_list = ['datascience', 'MachineLearning', 'technology', 'artificial', 'cybersecurity']
start_date, end_date = '2023-9-15', '2024-9-16'

# The output file name is built from the raw date strings, so it has to be
# computed before the bounds are converted to pandas timestamps.
file_name = "{}_{}.csv".format(start_date, end_date)
start_date, end_date = (pd.to_datetime(bound) for bound in (start_date, end_date))

In [None]:
# Gather the comments of every collected submission into one DataFrame and
# write the de-duplicated result to `file_name`.
comment_columns = [
    'author', 'body', 'created_utc', 'id',
    'submission', 'subreddit', 'subreddit_id'
]
start_time = time.time()

# Collect the per-submission frames and concatenate them once at the end:
# calling pd.concat inside the loop re-copies every accumulated row on each
# iteration. Empty frames are skipped so they cannot influence column dtypes.
comment_frames = []
for submission_id in retrieve_list_of_submission_id(reddit_list):
    comments_df = fetch_comments_from_submission(submission_id, start_date, end_date)
    if not comments_df.empty:
        comment_frames.append(comments_df)

if comment_frames:
    final_df = pd.concat(comment_frames, ignore_index=True)
else:
    # Nothing was fetched: keep an empty frame with the expected columns so
    # the CSV still gets its header row.
    final_df = pd.DataFrame(columns=comment_columns)

final_df.drop_duplicates().to_csv(file_name, index=False)
end_time = time.time()

print('Time taken: ', end_time - start_time)
# The length reported here is counted before duplicates are dropped for the CSV.
print('Length of final DataFrame: ', len(final_df))