In [1]:
import os
import warnings
from typing import List, Tuple

import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm

# Register tqdm's pandas integration (presumably for `progress_apply`;
# NOTE(review): no progress_* call appears in the visible cells — confirm it is still needed).
tqdm.pandas()
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment and deprecation warnings.
warnings.filterwarnings("ignore")

In [2]:
def clean_text(text_series: pd.Series) -> pd.Series:
    """
    Strip http(s) URLs and the '&gt;' HTML entity from a Series of text.

    Args:
        text_series: Series whose values are accessed through the `.str` accessor.

    Returns:
        A new Series (same index) in which every URL match and every literal
        '&gt;' has been replaced by the empty string. The input is not modified.
    """
    # Scheme followed by one or more URL characters or %XX escapes
    url_pattern = (
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    without_urls = text_series.str.replace(url_pattern, '', regex=True)

    # '&gt;' is the HTML-escaped form of '>'; it is the only entity removed here
    without_entities = without_urls.str.replace('&gt;', '')

    return without_entities

def add_datetime_columns(df: pd.DataFrame, time_column: str = 'created_utc') -> pd.DataFrame:
    """
    Adds 'Y' (year) and 'YM' (year-month) columns to the DataFrame based on a timestamp column.

    Args:
        df: DataFrame containing `time_column`. It is modified in place.
        time_column: Name of the column holding timestamps, parsed as seconds
            since the Unix epoch (`unit='s'`).

    Returns:
        The same DataFrame object, with two columns added or overwritten:
        'Y' (integer year) and 'YM' (string in 'YYYY-MM' format).
    """
    # Parse the timestamps once and derive both columns from the same result
    timestamps = pd.to_datetime(df[time_column], unit='s')

    df['Y'] = timestamps.dt.year
    df['YM'] = timestamps.dt.strftime('%Y-%m')

    return df

def process_subreddit(
    data_root: str,
    subreddit_name: str,
    min_num_comments: int = 3,
    min_score: int = -1,
    years: List[int] = [2016],
    chunk_size: int = 10**6,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Processes submissions and comments for a given subreddit.

    Reads `<data_root>/<subreddit_name>_submissions.csv` and
    `<data_root>/<subreddit_name>_comments.csv`, keeps the submissions created
    in `years` that pass the score / comment-count thresholds, and keeps the
    comments whose `link_id` refers to one of those submissions.

    Args:
        data_root: Directory containing the two CSV files.
        subreddit_name: File-name prefix; also written to the 'sub' column.
        min_num_comments: Keep submissions with `num_comments >= min_num_comments`.
        min_score: Keep submissions with `score > min_score` (strict).
        years: Years (derived from `created_utc`) of submissions to keep. Any
            iterable of ints works (list, range, ...). It is only read, never mutated.
        chunk_size: Number of comment rows read per chunk.

    Returns:
        `(submissions, comments)`. Both carry the added 'Y', 'YM' and 'sub'
        columns. In `comments`, the 't3_' marker is removed from 'link_id',
        the first three characters are dropped from 'parent_id', and 'body'
        is passed through `clean_text`. If no comment chunk is retained,
        `comments` is an empty DataFrame that still has the expected columns
        (or no columns at all when the reader yields no chunk).

    Raises:
        ValueError: If `years` is empty.
    """
    # Materialize once: `years` is used both for isin() and for max()
    years = list(years)
    if not years:
        raise ValueError("years must contain at least one year")
    last_year = max(years)

    file_prefix = os.path.join(data_root, subreddit_name)

    # ---- Submissions -------------------------------------------------------
    submissions = pd.read_csv(f"{file_prefix}_submissions.csv")
    submissions = add_datetime_columns(submissions, 'created_utc')

    # Filter submissions by specified years
    submissions = submissions[submissions['Y'].isin(years)]
    print(f"{subreddit_name}: # submissions {len(submissions)}")

    # Filter on minimum score and number of comments; copy so the column
    # assignment below writes to an independent frame, not a slice
    passes_thresholds = (
        (submissions['score'] > min_score)
        & (submissions['num_comments'] >= min_num_comments)
    )
    submissions = submissions[passes_thresholds].copy()
    submissions['sub'] = subreddit_name

    print(f"{subreddit_name}: # submissions after cleaning {len(submissions)}")
    print('Submissions processing done.')

    # ---- Comments ----------------------------------------------------------
    print('Starting processing comments')

    # Loop-invariant: the ids every chunk is filtered against
    submission_ids = submissions['id'].unique()

    kept_chunks = []
    empty_template = None  # zero-row frame with the final columns, for the "nothing kept" case

    # The context manager closes the file even when the loop stops early
    with pd.read_csv(f"{file_prefix}_comments.csv", chunksize=chunk_size) as comments_chunks:
        for i, chunk in enumerate(comments_chunks):
            # Strip the 't3_' marker so link ids are comparable with submission ids
            chunk['link_id'] = chunk['link_id'].str.replace('t3_', '', regex=False)

            # Keep only comments linked to the filtered submissions
            chunk = chunk[chunk['link_id'].isin(submission_ids)].copy()
            print(f"Chunk {i}: {len(chunk)} comments after filtering")

            # Remove the 3-character prefix from 'parent_id'
            chunk['parent_id'] = chunk['parent_id'].str[3:]

            chunk = add_datetime_columns(chunk, 'created_utc')
            chunk['sub'] = subreddit_name

            if empty_template is None:
                empty_template = chunk.iloc[0:0]

            # Stop once every remaining comment in this chunk is newer than the
            # requested years. NOTE(review): this assumes the comments file is
            # ordered by time, so later chunks cannot contain older rows — confirm.
            if chunk['Y'].min() > last_year:
                break

            chunk['body'] = clean_text(chunk['body'])
            kept_chunks.append(chunk)

    # Concatenate once instead of growing a DataFrame inside the loop
    if kept_chunks:
        comments = pd.concat(kept_chunks, ignore_index=True)
    elif empty_template is not None:
        comments = empty_template.reset_index(drop=True)
    else:
        comments = pd.DataFrame()

    print(f"{subreddit_name}: # comments {len(comments)}")

    # Every retained chunk was already normalized and filtered against
    # `submission_ids`, so no second pass is needed; the second log line is
    # kept so the printed output is unchanged.
    print(f"{subreddit_name}: # comments after cleaning {len(comments)}")
    print('Finished processing comments')

    return submissions, comments

submissions, comments = process_subreddit(
    data_root="data",
    subreddit_name="business",
    min_num_comments=1,
    min_score=-1,
    years=range(2016, 2017),
)

# Keep only submissions whose subreddit also appears among the retained comments
subreddits_with_comments = comments['sub'].unique()
has_comments = submissions['sub'].isin(subreddits_with_comments)
submissions = submissions[has_comments]

print('Overall Submissions:', len(submissions))
print('Overall Comments:', len(comments))

business: # submissions 265670
business: # submissions after cleaning 34342
Submissions processing done.
Starting processing comments
Chunk 0: 7889 comments after filtering
Chunk 1: 102534 comments after filtering
business: # comments 110423
business: # comments after cleaning 110423
Finished processing comments
Overall Submissions: 34342
Overall Comments: 110423


In [3]:
# Get the credibility information for each domain
# (the first CSV column, 'domain', becomes the index; the file's own header row is replaced by `names`)
domain_credibility = pd.read_csv("data/domain_credibility.csv", index_col=0, header=0, names=['domain', 'bias', 'credibility'])

# Make this cell safe to re-run: if a previous execution already attached
# 'bias' / 'credibility', a second merge would emit suffixed '_x' / '_y'
# columns and the dropna below would fail with a KeyError.
submissions = submissions.drop(columns=['bias', 'credibility'], errors='ignore')

# Merge credibility information with submissions on domain
# NOTE(review): assumes each domain appears once in domain_credibility;
# duplicates would duplicate submissions — confirm.
submissions = submissions.merge(domain_credibility, left_on='domain', right_on='domain', how='left')

# Drop submissions with missing credibility information
submissions = submissions.dropna(subset=['bias', 'credibility'])

# Remove submissions from [deleted] authors
submissions = submissions[submissions['author'] != '[deleted]']

# Calculate the average credibility rating for each author (one row per author)
author_credibility = submissions.groupby('author', as_index=False)['credibility'].mean()

# Remove comments from authors with no credibility information
comments = comments[comments['author'].isin(author_credibility["author"])]

In [4]:
# TODO: Include links to the original submissions in the comments DataFrame
# posts = pd.concat([submissions[["author", "id"]].rename(columns={"id": "link_id"}), comments[["author", "link_id"]]])

# Tunable thresholds
MIN_SHARED_THREADS = 1       # link two authors when they commented in at least this many common threads
CREDIBILITY_THRESHOLD = 0.5  # mean credibility strictly above this is labelled 1, otherwise 0

# Author x thread table: how many comments each author left under each submission
frequencies_df = pd.crosstab(comments["author"], comments['link_id'])

# Per-author credibility in the same order as the crosstab rows. Looked up by
# index so the wide author x thread table is not copied through a merge.
credibilities = (
    author_credibility.set_index("author")["credibility"]
    .reindex(frequencies_df.index)
    .to_list()
)

frequencies = np.array(frequencies_df, dtype=float)

# Binary presence, so entry (i, j) of the product is the number of threads in
# which both authors commented (raw counts would weight by comment volume).
presence = (frequencies > 0).astype(float)
adjacency_matrix = presence @ presence.T
np.fill_diagonal(adjacency_matrix, 0)  # no self-loops

# Connect authors with at least MIN_SHARED_THREADS shared threads
adjacency_matrix = (adjacency_matrix >= MIN_SHARED_THREADS).astype(int)

# Create a graph from the adjacency matrix; node i corresponds to row i of frequencies_df
graph = nx.from_numpy_array(adjacency_matrix)

# Add betweenness centrality, clustering coefficient, degree, and credibility as node attributes
betweenness = {node: float(value) for node, value in nx.betweenness_centrality(graph).items()}
nx.set_node_attributes(graph, betweenness, 'betweenness')

clustering = {node: float(value) for node, value in nx.clustering(graph).items()}
nx.set_node_attributes(graph, clustering, 'clustering')

degree = dict(nx.degree(graph))
nx.set_node_attributes(graph, degree, 'degree')

# Author names are deliberately not exported at the moment; uncomment to label nodes.
# author_names = {i: author for i, author in enumerate(frequencies_df.index)}
# nx.set_node_attributes(graph, author_names, 'author')

# Binary credibility label per node (a missing score compares False and becomes 0)
credibility = {node: int(score > CREDIBILITY_THRESHOLD) for node, score in enumerate(credibilities)}
nx.set_node_attributes(graph, credibility, 'credibility')

nx.write_gexf(graph, 'data/business_users.gexf')