# Part 1: Scrape comments and posts from Reddit

### Import libraries and set up the Reddit app from client interfaces

In [None]:
#import libraries and sensitive information
import praw
import pandas as pd
from datetime import datetime
from reddit_auth0 import client_id, client_secret, username, password

In [None]:
# Set up the authenticated Reddit client.
# user_agent must be a descriptive string identifying this script; the
# previous value `True` was a boolean, which gives Reddit no usable
# identification of the client.
# NOTE(review): adjust the app name/version below to match the registered app.
reddit = praw.Reddit(
    user_agent=f"script:reddit-keyword-scraper:v1.0 (by u/{username})",
    client_id=client_id,
    client_secret=client_secret,
    username=username,
    password=password,
)

### Get {number of posts} for each {keyword}. In this notebook: 4 keywords, 50 posts each

In [None]:
def fetch_reddit_posts(keyword, limit):
    """Search r/all for ``keyword`` and collect the URLs of the matches.

    The search is sorted by relevance and capped at ``limit`` results.

    Returns a pair of parallel lists ``(urls, keywords)``: the URL of each
    matching post, and ``keyword`` repeated once per URL so the two lists
    stay aligned index-for-index.
    """
    matches = reddit.subreddit("all").search(keyword, sort="relevance", limit=limit)

    found_urls = []
    for submission in matches:
        found_urls.append(submission.url)

    # One keyword entry per URL found
    matching_keywords = [keyword for _ in found_urls]

    return found_urls, matching_keywords

# Running totals across every search performed in this session
urls = []
keywords = []

# Keep prompting for keywords until the user types 'exit'
while True:
    keyword = input("\nEnter a keyword to search Reddit (or type 'exit' to stop): ").strip()

    if keyword.lower() == "exit":
        # Final summary, then leave the loop
        print("\nExiting program. Here are all the collected URLs and keywords:\n")
        print("URLs:", urls)
        print("Keywords:", keywords)
        break

    requested = input("Enter the number of top posts to fetch: ").strip()
    try:
        num_posts = int(requested)
    except ValueError:
        # Not an integer: ask for the keyword again
        print("Please enter a valid number.")
        continue

    # Run the search and fold its results into the running totals
    batch_urls, batch_keywords = fetch_reddit_posts(keyword, num_posts)
    urls += batch_urls
    keywords += batch_keywords

    # Show everything gathered so far
    print("\nCurrent Collected Data:")
    print("URLs:", urls)
    print("Keywords:", keywords)

### Actual number of posts saved

In [None]:
# Number of keyword entries collected (one entry is stored per fetched URL)
len(keywords)

In [None]:
# Number of URLs collected; should equal the number of keyword entries
len(urls)

### Delete duplicated links and media posts (images and videos). In this case, we only use text-only posts

In [None]:
# Keep the first occurrence of each URL hosted on www.reddit.com, together
# with the keyword that produced it. A dict preserves insertion order, so
# the surviving URLs stay in their original order.
first_keyword_by_url = {}

for url, keyword in zip(urls, keywords):
    if not url.startswith("https://www.reddit.com/"):
        # Not a www.reddit.com link: excluded from the text-only set
        continue
    if url not in first_keyword_by_url:
        first_keyword_by_url[url] = keyword  # Keep the corresponding keyword

seen = set(first_keyword_by_url)
unique_urls = list(first_keyword_by_url)
unique_keywords = list(first_keyword_by_url.values())

# Replace the original lists with the filtered, de-duplicated versions
urls = unique_urls
keywords = unique_keywords

# Report what survived
print("Unique URLs after filtering and removing duplicates:")
print(urls)
print("Length of URLs:", len(urls))

print("\nKeywords aligned with unique URLs:")
print(keywords)
print("Length of Keywords:", len(keywords))

### Extract the posts

In [None]:
# Accumulator for one dict per extracted post; filled by the extraction loop
post_log = []

In [None]:
# Fetch the metadata of every collected submission, one row per post.
# Iterate over *all* URLs: the previous `while i < len(urls) - 1` bound
# stopped one short and silently skipped the last post.
for i, url in enumerate(urls):
    post = reddit.submission(url=url)
    post_log.append({
        "author": post.author,
        # NOTE(review): fromtimestamp() without a tz gives a naive local-time
        # value even though the source field is a UTC timestamp - confirm
        # this is what downstream analysis expects.
        "time_posted": datetime.fromtimestamp(post.created_utc),
        "title": post.title,
        "post": post.selftext,
        "up-votes": post.score,
        "subreddit": post.subreddit,
        # NOTE(review): len(post.comments) presumably counts only the
        # top-level entries of the comment forest - confirm against PRAW docs.
        "number_of_comments": len(post.comments),
        "keyword": keywords[i],  # keywords is aligned index-for-index with urls
        "post_number": i + 1,  # 1-based id shared with the comment log
    })
    print(f"post {i} extracted ")

In [None]:
# Turn the list of post dicts into a DataFrame (one row per post) and display it
df = pd.DataFrame(post_log)
df

In [None]:
# Total of the per-post comment counts recorded in the post log
sum(df['number_of_comments'])

### Extract the comments and replies

In [None]:
# Accumulator for one dict per extracted comment or reply
comment_datalog = []

In [None]:
# Number of posts whose comments are about to be fetched
print(len(urls))

In [None]:
def _comment_record(item, *, is_reply, keyword, post_number):
    """Build one comment-log row from a comment or reply object.

    ``item`` must expose ``author``, ``created_utc``, ``body`` and ``score``.
    ``is_reply`` selects which of the mutually exclusive ``comment`` /
    ``reply`` flags is set to 1.
    """
    return {
        "author": item.author,
        "time_posted": datetime.fromtimestamp(item.created_utc),
        "post": item.body,
        "up-votes": item.score,
        "comment": 0 if is_reply else 1,
        "reply": 1 if is_reply else 0,
        "keyword": keyword,
        "post_number": post_number,
    }


# Fetch the top-level comments and their direct replies for every post.
# Iterate over *all* URLs: the previous `while i < len(urls) - 1` bound
# stopped one short and silently skipped the last post.
for i, url in enumerate(urls):
    post = reddit.submission(url=url)
    # Expand the collapsed comment placeholders before iterating
    post.comments.replace_more(limit=None)
    for comment in post.comments:
        comment_datalog.append(
            _comment_record(
                comment, is_reply=False, keyword=keywords[i], post_number=i + 1
            )
        )
        # NOTE(review): only direct replies are visited here; replies nested
        # deeper are presumably not collected - confirm this is intended.
        for reply in comment.replies:
            comment_datalog.append(
                _comment_record(
                    reply, is_reply=True, keyword=keywords[i], post_number=i + 1
                )
            )
    print(f"post {i} done.")

In [None]:
# Show the rate-limit information exposed by the Reddit client
# (presumably remaining/used request quota - confirm against PRAW docs)
print(reddit.auth.limits)

In [None]:
# Turn the list of comment/reply dicts into a DataFrame, report its
# (rows, columns) shape, and display it
df2 = pd.DataFrame(comment_datalog)
print(df2.shape)
df2

In [None]:
# Drop rows whose comment text repeats an earlier row, keeping the first.
# The text lives in the 'post' column (the key used when the rows were
# built); the previous subset='text' named a column that does not exist
# and raised KeyError.
df2 = df2.drop_duplicates(subset='post', keep='first')
print(df2.shape)

In [None]:
# Distinct post numbers that still have at least one row in the comment log
index = df2['post_number'].unique()

In [None]:
# Keep only the posts whose post_number appears in the comment log
df = df[df['post_number'].isin(index)]

### Export into csv files

In [None]:
# Write the post log and the comment log to CSV, without the DataFrame index
df.to_csv('reddit_post_log.csv', index=False)
df2.to_csv('reddit_comment_log.csv', index=False)

## combine comments and posts together

In [None]:
import pandas as pd

In [None]:
# Reload the exported post log; note this rebinds post_log from the earlier
# list of dicts to a DataFrame
post_log = pd.read_csv("reddit_post_log.csv")
print(post_log.shape)
post_log.head()

In [None]:
# Reload the exported comment/reply log as a DataFrame
comment_log = pd.read_csv("reddit_comment_log.csv")
print(comment_log.shape)
comment_log.head()

In [None]:
# Where a post has no body text (missing value in 'post'), fall back to its title
# NOTE(review): presumably empty self-texts come back from the CSV as NaN - confirm
post_log['post'] = post_log['post'].fillna(post_log['title'])
post_log.head()

In [None]:
# Normalise legacy column names so the comment log lines up with the post log.
# Names that are not present in the frame are ignored by rename().
legacy_to_current = {
    'text': 'post',
    'Up-votes': 'up-votes',
    'Author': 'author',
}
comment_log.rename(columns=legacy_to_current, inplace=True)
comment_log.head()

In [None]:
# Stack posts and comments into one table, restricted to the columns the two
# logs share (in the post log's column order).
common_columns = post_log.columns.intersection(comment_log.columns)
post_log_common = post_log.loc[:, common_columns]
comment_log_common = comment_log.loc[:, common_columns]
general_log = pd.concat(
    objs=[post_log_common, comment_log_common],
    ignore_index=True,  # renumber rows 0..n-1 across both sources
)
print(general_log.shape)
general_log

In [None]:
# Write the combined post + comment table to CSV, without the DataFrame index
general_log.to_csv('reddit_post_comment_log.csv', index=False)