In [1]:
import os
import warnings
from typing import List, Tuple

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm

tqdm.pandas()
warnings.filterwarnings("ignore")

In [2]:
def clean_text(text_series: pd.Series) -> pd.Series:
    """
    Strip hyperlinks and the ``&gt;`` HTML entity from a Series of strings.

    Each match is replaced by the empty string. The result is a new Series;
    the Series passed in is left untouched.
    """
    # 'http://' or 'https://' followed by a run of letters, digits, a set of
    # punctuation characters, or %XX percent-escapes.
    url_pattern = (
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    without_urls = text_series.str.replace(url_pattern, '', regex=True)

    # Drop the escaped '>' entity
    return without_urls.str.replace('&gt;', '')

def add_datetime_columns(df: pd.DataFrame, time_column: str = 'created_utc') -> pd.DataFrame:
    """
    Adds 'Y' (year) and 'YM' (year-month) columns to the DataFrame based on a timestamp column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``time_column``. It is modified in place: the two
        new columns are assigned directly on this object.
    time_column : str
        Name of the column holding timestamps expressed in seconds
        (parsed with ``unit='s'``).

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, now carrying 'Y' (calendar year) and
        'YM' (string formatted as 'YYYY-MM').
    """
    # Parse the timestamps once and derive both columns from the same result
    timestamps = pd.to_datetime(df[time_column], unit='s')

    df['Y'] = timestamps.dt.year
    df['YM'] = timestamps.dt.strftime('%Y-%m')

    return df

def process_subreddit(data_root: str, subreddit_name: str, min_num_comments: int = 3, min_score: int = -1, years: List[int] = [2016], chunk_size: int = 10**6) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Processes submissions and comments for a given subreddit.

    Reads ``<data_root>/<subreddit_name>_submissions.csv`` and
    ``<data_root>/<subreddit_name>_comments.csv``, filters the submissions,
    and keeps only the comments whose ``link_id`` refers to a kept submission.

    Parameters
    ----------
    data_root : str
        Directory containing the two CSV files.
    subreddit_name : str
        File-name prefix of the CSVs; also written to the 'sub' column of
        both returned frames.
    min_num_comments : int
        Submissions with ``num_comments`` below this value are dropped.
    min_score : int
        Submissions must have ``score`` strictly greater than this value.
    years : iterable of int
        Calendar years (derived from ``created_utc``) a submission must fall
        in. The default list is only read, never mutated.
    chunk_size : int
        Number of comment rows read per chunk.

    Returns
    -------
    (submissions, comments) : Tuple[pd.DataFrame, pd.DataFrame]
        The filtered submissions and the matching comments. ``comments`` is
        an empty DataFrame without columns when no chunk was kept.
    """
    csv_prefix = os.path.join(data_root, subreddit_name)

    # Read submissions CSV file and derive the 'Y' / 'YM' columns
    submissions = pd.read_csv(f"{csv_prefix}_submissions.csv")
    submissions = add_datetime_columns(submissions, 'created_utc')

    # Filter submissions by specified years
    submissions = submissions[submissions['Y'].isin(years)]
    print(f"{subreddit_name}: # submissions {len(submissions)}")

    # Filter submissions based on minimum score and number of comments.
    # .copy() makes the result an independent frame so the column assignment
    # below does not write into a slice of the frame read from disk.
    passes_thresholds = (
        (submissions['score'] > min_score)
        & (submissions['num_comments'] >= min_num_comments)
    )
    submissions = submissions[passes_thresholds].copy()

    # Add subreddit name to DataFrame
    submissions['sub'] = subreddit_name

    print(f"{subreddit_name}: # submissions after cleaning {len(submissions)}")
    print('Submissions processing done.')

    print('Starting processing comments')

    # The set of kept submission IDs does not change while reading comments,
    # so compute it once instead of once per chunk.
    submission_ids = submissions['id'].unique()

    # Processed chunks are collected here and concatenated once at the end;
    # concatenating inside the loop would re-copy all earlier rows every time.
    kept_chunks = []

    # The context manager closes the CSV reader even when the loop exits early.
    with pd.read_csv(f"{csv_prefix}_comments.csv", chunksize=chunk_size) as comments_chunks:
        for i, chunk in enumerate(comments_chunks):
            # Strip the 't3_' marker so link IDs compare equal to submission IDs
            chunk['link_id'] = chunk['link_id'].str.replace('t3_', '', regex=False)

            # Keep only comments linked to the filtered submissions
            chunk = chunk[chunk['link_id'].isin(submission_ids)].copy()
            print(f"Chunk {i}: {len(chunk)} comments after filtering")

            # Remove the three-character prefix from 'parent_id'
            chunk['parent_id'] = chunk['parent_id'].str[3:]

            # Add 'Y' and 'YM' columns to comments DataFrame
            chunk = add_datetime_columns(chunk, 'created_utc')

            # Stop reading once every kept comment in a chunk is newer than the
            # last requested year. For an empty chunk the minimum is NaN, the
            # comparison is False, and reading continues.
            # NOTE(review): this presumably relies on the comments file being
            # ordered by time - confirm against the data export.
            if chunk['Y'].min() > max(years):
                break

            # Clean the 'body' text in comments
            chunk['body'] = clean_text(chunk['body'])

            # Add subreddit name to DataFrame
            chunk['sub'] = subreddit_name

            kept_chunks.append(chunk)

    if kept_chunks:
        comments = pd.concat(kept_chunks, ignore_index=True)
    else:
        comments = pd.DataFrame()

    print(f"{subreddit_name}: # comments {len(comments)}")

    # Every chunk was already stripped of 't3_' and restricted to the kept
    # submissions inside the loop, so no second pass is needed here; both
    # counts are still printed to keep the log output unchanged.
    print(f"{subreddit_name}: # comments after cleaning {len(comments)}")
    print('Finished processing comments')

    return submissions, comments

# Load the 2016 submissions and comments of the "business" subreddit
submissions, comments = process_subreddit(
    "data",
    "business",
    min_num_comments=0,
    min_score=-1,
    years=range(2016, 2017),
)

# Keep only submissions whose 'sub' value also occurs in the comments frame
submissions = submissions.loc[submissions['sub'].isin(comments['sub'].unique())]

print('Overall Submissions:', len(submissions))
print('Overall Comments:', len(comments))


business: # submissions 265670
business: # submissions after cleaning 265670
Submissions processing done.
Starting processing comments
Chunk 0: 7913 comments after filtering
Chunk 1: 103515 comments after filtering
business: # comments 111428
business: # comments after cleaning 111428
Finished processing comments
Overall Submissions: 265670
Overall Comments: 111428


In [3]:
# Preview the first five rows of the filtered submissions
submissions.head(n=5)

Unnamed: 0,Index,id,author,author_flair_text,created_utc,num_comments,score,domain,title,Y,YM,sub
797080,797080,3yyy1u,ElizabethNarula,,1451608233,0,1,prdaily.com,Starbucks cookies and more.,2016,2016-01,business
797081,797081,3yyzmg,awesomer121,,1451609122,5,0,itunes.apple.com,A 14 year old launches an application that wil...,2016,2016-01,business
797082,797082,3yz1ap,basementguys,,1451610053,0,1,adpost.com,INSULATION PRODUCTS,2016,2016-01,business
797083,797083,3yz45l,ElizabethNarula,,1451611853,0,1,marketingland.com,Top 10 Video Creators in November: The Ellen S...,2016,2016-01,business
797084,797084,3yz8sl,donnagain,,1451614722,0,1,youtube.com,Day 29 It's Time To Burry Your Ego,2016,2016-01,business


In [4]:
# Tunable parameters of the author graph
MAX_AUTHORS = 1000          # only the first MAX_AUTHORS rows of the crosstab become nodes
MIN_SHARED_COMMENTS = 2     # minimum co-occurrence value for two authors to be linked

# Author x submission count matrix: entry (a, s) is the number of comments
# author `a` wrote under submission `s`
frequencies = pd.crosstab(comments['author'], comments['link_id']).to_numpy(dtype=float)

# Entry (i, j) of the product is the sum over submissions of
# count_i(s) * count_j(s). Only the first MAX_AUTHORS authors are kept, so the
# rows are restricted *before* multiplying: the values are identical to
# slicing the full authors x authors product, without materialising it.
author_block = frequencies[:MAX_AUTHORS]
adjacency_matrix = author_block @ author_block.T
np.fill_diagonal(adjacency_matrix, 0)  # no self-loops

# Connect authors whose co-occurrence value reaches the threshold
adjacency_matrix = adjacency_matrix >= MIN_SHARED_COMMENTS

# Write the resulting graph to a GEXF file
graph = nx.from_numpy_array(adjacency_matrix)
nx.write_gexf(graph, 'data/business_users.gexf')

In [None]:
# Plot the degree distribution of the author graph and save it as a PNG
degrees = np.array([degree for _, degree in graph.degree()])

fig, ax = plt.subplots()
ax.hist(degrees, bins=25, color='skyblue', edgecolor='black')
ax.set_xlabel('Degree')
ax.set_ylabel('Frequency')
ax.set_title('Degree Distribution')
fig.savefig('data/degree_distribution.png')

In [None]:
# Show the first 100 processed comments
comments.head(n=100)

In [None]:
sbm = "3z2f4k"  # ID of the submission whose comment tree is drawn

# Toggle between the depth-based shell layout and a force-directed layout
USE_SHELL_LAYOUT = True

# Comments that belong to the selected submission
data = comments[comments['link_id'] == sbm]
fig, ax = plt.subplots(figsize=(10, 10))

# Filter necessary columns
filtered_comments = data[['id', 'parent_id', 'body']]
comment_ids = set(filtered_comments['id'])

# Create a directed graph
G = nx.DiGraph()

# Define a root node for visualization and add nodes and edges
root_node = 'Root'
G.add_node(root_node, body="Root Post")
for comment_id, parent_id in zip(filtered_comments['id'], filtered_comments['parent_id']):
    G.add_node(comment_id, body=comment_id)  # Use full ID for display

    # Replies to the submission itself hang off the root. So do comments whose
    # parent is not present in the data: linking them to an unknown parent
    # would create a node without a 'body' that is unreachable from the root,
    # which breaks the depth lookup and the labelling below.
    is_top_level = isinstance(parent_id, str) and parent_id.startswith(sbm)
    if is_top_level or parent_id not in comment_ids:
        parent_id = root_node
    G.add_edge(parent_id, comment_id)

# Compute depth for each node for shell assignment
depth = nx.single_source_shortest_path_length(G, root_node)

# Create shell layout based on depths: one shell per distance from the root
max_depth = max(depth.values())
shells = [[] for _ in range(max_depth + 1)]
for node, d in depth.items():
    shells[d].append(node)

# Colour nodes by depth scaled to [0, 1]; when the graph holds only the root
# (max_depth == 0) divide by 1 instead of 0.
depth_scale = max(max_depth, 1)
node_color = [depth[node] / depth_scale for node in G.nodes()]

if USE_SHELL_LAYOUT:
    # Draw the graph using the shell layout
    pos = nx.shell_layout(G, shells)
else:
    # Adjust k and iterations for better layout; fixed seed keeps it repeatable
    pos = nx.spring_layout(G, k=0.1, iterations=50, seed=42)

# Draw onto the axes created above rather than the implicit current axes
nx.draw_networkx_nodes(G, pos, node_color=node_color, node_size=100, cmap=plt.cm.viridis, alpha=0.8, ax=ax)
nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5, ax=ax)
nx.draw_networkx_labels(G, pos, labels={n: G.nodes[n]['body'] for n in G.nodes}, font_size=8, ax=ax)

ax.set_title("Comment Tree Visualization")
ax.axis('off')  # Hide axes

plt.show()
