In [18]:
import os
from collections import defaultdict, deque

import networkx as nx
import praw

### Collecting Data
- users ~ nodes
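- userA replies to userB's post: directed edge between the two users (direction convention still open; the code below adds the edge from the post's author, userB, to the replier, userA).
- user's attribute: subreddit they are most active in
    - "most active" ~ "largest number of recent comments"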

Initial Node Identification: You begin by selecting a popular Reddit post and using its author as the starting node in your network.

First-Level Reply Collection: For this initial post, you collect all first-level replies, establishing direct connections (edges) between the original poster and the users who replied.

User Community Determination: For each user (node) in the network, you determine their most active subreddit by analyzing their recent comments and identifying the subreddit where they are most frequently active.

Network Expansion: Utilizing a breadth-first search (BFS) method, you systematically expand the network. For each new user added, you repeat the process of collecting first-level replies and identifying their most active subreddit.

_Alternative:_

I start by identifying the top 100 most upvoted users on the platform. 

My approach is to employ a depth-first search (DFS) method for each of these top users, analyzing their comments in detail. I track all their interactions, focusing on both users who have replied to them and those they have replied to. As I delve into the comment chains, every new user I encounter triggers the same DFS process, leading to a systematic expansion of the network. 

To manage the scope of this expansion, I stop the process after 1000 iterations or as soon as I detect a cycle in these interactions, whichever comes first. This method is designed to create a detailed map of user interactions and relationships on Reddit, centered around its most influential members.

- Caveat: the top upvoted users may not be representative of the broader Reddit community. Their interactions could be skewed towards certain topics or subreddits.

In [19]:
# Initialize the Reddit connection with API credentials
def initialize_reddit():
    """Create a PRAW ``Reddit`` client using credentials from the environment.

    Environment variables read:
        REDDIT_CLIENT_ID      (required) OAuth client id of the Reddit app.
        REDDIT_CLIENT_SECRET  (required) OAuth client secret of the Reddit app.
        REDDIT_USER_AGENT     (optional) defaults to 'network analysis'.
        REDDIT_USERNAME       (optional) defaults to 'stuffingmybrain'.

    Returns:
        The object returned by ``praw.Reddit(...)``.

    Raises:
        RuntimeError: if a required variable is unset or empty.
    """
    # Secrets must never live in the notebook itself: notebooks get shared and
    # committed, outputs included. Any secret that was ever committed here
    # should be rotated.
    required = ("REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET")
    missing = [var for var in required if not os.environ.get(var)]
    if missing:
        raise RuntimeError(
            "Missing Reddit API credentials; set environment variable(s): "
            + ", ".join(missing)
        )

    return praw.Reddit(client_id=os.environ["REDDIT_CLIENT_ID"],
                       client_secret=os.environ["REDDIT_CLIENT_SECRET"],
                       user_agent=os.environ.get("REDDIT_USER_AGENT", "network analysis"),
                       username=os.environ.get("REDDIT_USERNAME", "stuffingmybrain"))

In [20]:
# Fetch the first-level replies to a Reddit submission
def fetch_first_level_replies(submission):
    """Return the comments whose parent is `submission` itself.

    Args:
        submission: object exposing ``fullname`` and a ``comments`` forest
            with ``replace_more`` and ``list`` (a PRAW submission in this
            notebook).

    Returns:
        A list of the matching comment objects, in the order ``list()``
        yields them. If anything raises along the way, the error is printed
        and whatever was collected so far (possibly nothing) is returned.
    """
    top_level = []
    try:
        # limit=0: per the PRAW docs this discards the "load more comments"
        # placeholders instead of fetching what they hide.
        submission.comments.replace_more(limit=0)
        for candidate in submission.comments.list():
            # list() flattens the whole tree; skip replies to other comments.
            if candidate.parent_id != submission.fullname:
                continue
            top_level.append(candidate)
    except Exception as err:
        print(f"Error fetching comments: {err}")
    return top_level

In [25]:
# Compute the most active subreddit for a given Reddit user
def compute_most_active_subreddit(reddit, username, limit=1000):
    """Return the subreddit in which `username` has commented most often.

    Args:
        reddit: client exposing ``redditor(name).comments.new(limit=...)``
            (a PRAW ``Reddit`` instance in this notebook).
        username: name of the account to inspect.
        limit: maximum number of newest comments to request. Defaults to
            1000, the value that was previously hard-coded.

    Returns:
        The ``display_name`` of the subreddit with the highest comment count
        among the fetched comments (on a tie, the one encountered first), or
        ``None`` when no comment could be counted.

    Note:
        Fetch errors are printed, not raised; the result is then based on
        whatever comments were counted before the failure.
    """
    subreddit_activity = defaultdict(int)
    try:
        # Iterating through recent comments of the user and counting subreddit activity
        for comment in reddit.redditor(username).comments.new(limit=limit):
            subreddit_activity[comment.subreddit.display_name] += 1
    except Exception as e:
        print(f"Error fetching user comments: {e}")
    # default=None covers the "no comments / everything failed" case.
    return max(subreddit_activity, key=subreddit_activity.get, default=None)

In [26]:
# Build a client and look up the most active subreddit for one account;
# as the last expression of the cell, the returned value is displayed.
reddit = initialize_reddit()
compute_most_active_subreddit(reddit, "stuffingmybrain")

'berkeley'

### Constructing Network

In [17]:
# Expand the network using BFS starting from a root submission
def bfs_network_expansion(reddit, root_submission):
    """Build a directed reply graph around `root_submission`.

    Nodes are usernames. Every user whose most active subreddit can be
    determined gets it as the ``subreddit`` node attribute, including the
    submission's author. An edge ``author -> replier`` is added for each
    first-level reply to the submission.

    Deleted or unreadable accounts are skipped entirely. They are not mapped
    to a placeholder name, because a placeholder such as "Deleted" would be
    looked up as if it were a real account and would merge all deleted users
    into one node.

    Args:
        reddit: client passed through to ``compute_most_active_subreddit``.
        root_submission: the submission to start from.

    Returns:
        nx.DiGraph: the constructed network.

    Note:
        Only ``root_submission`` is ever processed; nothing is appended to
        the queue yet, so no further levels are expanded.
    """
    visited = set()  # usernames whose subreddit lookup has already been done
    queue = deque([root_submission])
    G = nx.DiGraph()

    def _add_user(username):
        # One (slow, rate-limited) lookup per user, however often they appear.
        if username in visited:
            return
        visited.add(username)
        most_active_subreddit = compute_most_active_subreddit(reddit, username)
        if most_active_subreddit:
            G.add_node(username, subreddit=most_active_subreddit)

    while queue:
        current_submission = queue.popleft()
        author_name = _author_name(current_submission)
        if author_name is not None:
            _add_user(author_name)

        for reply in fetch_first_level_replies(current_submission):
            replier_name = _author_name(reply)
            if replier_name is None:
                continue
            _add_user(replier_name)
            # Added regardless of `visited`, so a user who was already seen
            # elsewhere still gets connected; re-adding an existing edge to a
            # DiGraph does not create a duplicate.
            if author_name is not None:
                G.add_edge(author_name, replier_name)
            # Add additional logic here if I wish to queue further levels

    return G


def _author_name(thing):
    """Return the username of `thing`'s author, or None if missing or unreadable."""
    try:
        author = thing.author
        return author.name if author else None
    except Exception:
        return None

In [None]:
# Build the reply network for a single submission.
reddit = initialize_reddit()
# The URL below is a placeholder and must be edited before this cell is run.
submission_url = 'https://www.reddit.com/r/...'  # Replace with the actual URL of the desired post
submission = reddit.submission(url=submission_url)
# `network` is the directed graph returned by bfs_network_expansion.
network = bfs_network_expansion(reddit, submission)