In [1]:
import os
import warnings
from typing import List, Tuple

import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm

# Register tqdm's pandas integration; no cell visible in this notebook calls it yet.
tqdm.pandas()
# Silence every warning for the whole session.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning --
# consider a narrower filter so real problems stay visible.
warnings.filterwarnings("ignore")

In [2]:
def clean_text(text_series: pd.Series) -> pd.Series:
    """
    Strip http(s) URLs and the '&gt;' HTML entity from every string in a Series.

    Parameters
    ----------
    text_series : pd.Series
        Series of strings to clean.

    Returns
    -------
    pd.Series
        A new Series in which URLs and '&gt;' have been replaced by the empty
        string; all other text is left as it was.
    """
    # A URL is 'http://' or 'https://' followed by one or more allowed characters
    # or percent-encoded bytes
    url_pattern = (
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    without_urls = text_series.str.replace(url_pattern, '', regex=True)

    # '&gt;' is the only HTML entity handled here
    return without_urls.str.replace('&gt;', '')

def add_datetime_columns(df: pd.DataFrame, time_column: str = 'created_utc') -> pd.DataFrame:
    """
    Adds 'Y' (year) and 'YM' (year-month) columns to the DataFrame based on a timestamp column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the timestamps. It is modified in place: the 'Y' and 'YM'
        columns are added to (or overwritten on) this very object.
    time_column : str
        Name of the column holding timestamps expressed in seconds (they are
        parsed with ``unit='s'``).

    Returns
    -------
    pd.DataFrame
        The same `df` object, returned so callers can reassign or chain.
    """
    # Parse the timestamps once and derive both columns from the parsed values
    timestamps = pd.to_datetime(df[time_column], unit='s')

    df['Y'] = timestamps.dt.year

    # Year-month in 'YYYY-MM' format
    df['YM'] = timestamps.dt.strftime('%Y-%m')

    return df

def process_subreddit(data_root: str, subreddit_name: str, min_num_comments: int = 3, min_score: int = -1, years: List[int] = [2016], chunk_size: int = 10**6) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Processes submissions and comments for a given subreddit.

    Reads ``<data_root>/<subreddit_name>_submissions.csv`` and
    ``<data_root>/<subreddit_name>_comments.csv``, keeps the submissions that
    pass the year / score / comment-count filters, and then keeps the comments
    whose ``link_id`` points at one of those submissions.

    Parameters
    ----------
    data_root : str
        Directory containing the two CSV files.
    subreddit_name : str
        File-name prefix of the CSV files; also written to the 'sub' column.
    min_num_comments : int
        Submissions need ``num_comments >= min_num_comments``.
    min_score : int
        Submissions need ``score > min_score`` (strictly greater).
    years : list-like of int
        Submission years to keep. The default list is never mutated.
    chunk_size : int
        Number of rows read from the comments CSV per chunk.

    Returns
    -------
    (submissions, comments) : Tuple[pd.DataFrame, pd.DataFrame]
        The filtered submissions and the comments linked to them. Both frames
        carry the extra 'Y', 'YM' and 'sub' columns.
    """
    # Read submissions CSV file
    submissions = pd.read_csv(os.path.join(data_root, f"{subreddit_name}_submissions.csv"))

    # Add 'Y' and 'YM' columns, then keep only the requested years
    submissions = add_datetime_columns(submissions, 'created_utc')
    submissions = submissions[submissions['Y'].isin(years)]
    print(f"{subreddit_name}: # submissions {len(submissions)}")

    # Filter on minimum score and number of comments; .copy() so the 'sub'
    # column below is written to an independent frame, not a slice
    passes_filters = (submissions['score'] > min_score) & (submissions['num_comments'] >= min_num_comments)
    submissions = submissions[passes_filters].copy()
    submissions['sub'] = subreddit_name

    print(f"{subreddit_name}: # submissions after cleaning {len(submissions)}")
    print('Submissions processing done.')

    print('Starting processing comments')

    # Loop invariants: the ids to match against and the last requested year
    kept_ids = submissions['id'].unique()
    last_year = max(years, default=None)  # None when `years` is empty

    comments_path = os.path.join(data_root, f"{subreddit_name}_comments.csv")
    kept_chunks = []
    schema_template = None  # zero-row frame with the output columns, used if nothing is kept

    # The context manager closes the file even when the loop exits early
    with pd.read_csv(comments_path, chunksize=chunk_size) as reader:
        for i, raw_chunk in enumerate(reader):
            chunk = _prepare_comment_chunk(raw_chunk, kept_ids, subreddit_name)
            print(f"Chunk {i}: {len(chunk)} comments after filtering")

            if schema_template is None:
                schema_template = chunk.iloc[:0]

            # Stop once every matching comment in the chunk is newer than the
            # requested years (an empty chunk has a NaN minimum and never stops
            # the loop).
            # NOTE(review): this presumably relies on the file being ordered by
            # time -- confirm. The chunk that triggers the exit is discarded,
            # while late comments that share a chunk with earlier ones are kept,
            # so the result can depend on `chunk_size`.
            if last_year is not None and chunk['Y'].min() > last_year:
                break

            kept_chunks.append(chunk)

    # Concatenate once instead of growing a DataFrame inside the loop
    if kept_chunks:
        comments = pd.concat(kept_chunks, ignore_index=True)
    elif schema_template is not None:
        # Loop stopped on the very first chunk: return an empty, correctly shaped frame
        comments = schema_template.reset_index(drop=True)
    else:
        comments = pd.DataFrame()

    print(f"{subreddit_name}: # comments {len(comments)}")

    # Every chunk was already restricted to the kept submissions, so no second
    # filtering pass is needed; the message is kept for output compatibility.
    print(f"{subreddit_name}: # comments after cleaning {len(comments)}")
    print('Finished processing comments')

    return submissions, comments


def _prepare_comment_chunk(chunk: pd.DataFrame, kept_ids, subreddit_name: str) -> pd.DataFrame:
    """Restrict one raw comments chunk to the kept submissions and normalise its columns."""
    # Strip the 't3_' prefix so link ids are comparable with submission ids
    link_ids = chunk['link_id'].str.replace(r'^t3_', '', regex=True)
    linked = link_ids.isin(kept_ids)

    # .copy() gives an independent frame, so the column writes below are not
    # chained assignments on a slice of the raw chunk
    chunk = chunk.loc[linked].copy()
    chunk['link_id'] = link_ids.loc[linked]

    # Drop the three-character type prefix from 'parent_id'
    chunk['parent_id'] = chunk['parent_id'].str[3:]

    chunk = add_datetime_columns(chunk, 'created_utc')
    chunk['body'] = clean_text(chunk['body'])
    chunk['sub'] = subreddit_name
    return chunk

# Load the "business" subreddit for 2016 only (range(2016, 2017) holds the single
# year 2016); min_num_comments=0 and min_score=-1 are the loosest thresholds used here.
submissions, comments = process_subreddit(
    "data",
    "business",
    min_num_comments=0,
    min_score=-1,
    years=range(2016, 2017),
)

# Keep only submissions whose subreddit label also occurs in the comments frame
submissions = submissions.loc[submissions['sub'].isin(comments['sub'].unique())]

print('Overall Submissions:', len(submissions))
print('Overall Comments:', len(comments))

business: # submissions 265670
business: # submissions after cleaning 265670
Submissions processing done.
Starting processing comments
Chunk 0: 7913 comments after filtering
Chunk 1: 103515 comments after filtering
business: # comments 111428
business: # comments after cleaning 111428
Finished processing comments
Overall Submissions: 265670
Overall Comments: 111428


In [3]:
# Preview the comments that are linked to the kept submissions
comments

Unnamed: 0,Index,id,author,created_utc,parent_id,link_id,body,Y,YM,sub
0,991732,cyhvfdu,jzwinck,1451609858,3yyzmg,3yyzmg,"Does this use QR codes or not? If not, why not?",2016,2016-01,business
1,991735,cyhvm1e,[deleted],1451610235,cyhvfdu,3yyzmg,[deleted],2016,2016-01,business
2,991752,cyhzora,bethinataverasqme,1451618961,3yzf42,3yzf42,Call US (863) 703-6697 ADT Touchpad/Large LCD ...,2016,2016-01,business
3,991763,cyi27cz,JoanWhite12,1451625091,3yznxh,3yznxh,Call US (850) 361-2832 ADT Area order with Loc...,2016,2016-01,business
4,991765,cyi29jt,MorPlan,1451625265,3yzo6d,3yzo6d,Looking for smart office supplies? Morplan off...,2016,2016-01,business
...,...,...,...,...,...,...,...,...,...,...
111423,1800878,j05qbw2,[deleted],1671000320,49vnnp,49vnnp,[removed],2022,2022-12,business
111424,1803398,j0tacyl,[deleted],1671430889,5g20we,5g20we,[removed],2022,2022-12,business
111425,1803414,j0ten7v,[deleted],1671434111,4hytd7,4hytd7,[removed],2022,2022-12,business
111426,1803416,j0tfhda,[deleted],1671434776,41tnv6,41tnv6,[removed],2022,2022-12,business


In [4]:
# Get the credibility information for each domain
domain_credibility = pd.read_csv("data/domain_credibility.csv", index_col=0, header=0, names=['domain', 'bias', 'credibility'])

# Merge credibility information with submissions on domain.
# Any 'bias'/'credibility' columns left over from an earlier run of this cell are
# dropped first: without that, re-running the cell would make the merge emit
# '_x'/'_y' suffixed columns and the dropna below would raise a KeyError.
submissions = (
    submissions
    .drop(columns=['bias', 'credibility'], errors='ignore')
    .merge(domain_credibility, left_on='domain', right_on='domain', how='left')
)

# Drop submissions with missing credibility information
submissions = submissions.dropna(subset=['bias', 'credibility'])

# Remove submissions from [deleted] authors
submissions = submissions[submissions['author'] != '[deleted]']

# Calculate the average credibility rating for each author
# (one row per author, taken over the submissions that remain)
author_credibility = submissions.groupby('author', as_index=False)['credibility'].mean()

# Remove comments from authors with no credibility information,
# i.e. authors without any remaining submission
comments = comments[comments['author'].isin(author_credibility["author"])]

In [None]:
# Show the id and author of each remaining submission
submissions[["id", "author"]]

Unnamed: 0,Index,id,author,author_flair_text,created_utc,num_comments,score,domain,title,Y,YM,sub,bias,credibility
107,797187,3z01ej,jurvand,,1451635461,2,10,businessinsider.com,The White House has delayed imposing new finan...,2016,2016-01,business,-0.3,0.8
259,797339,3z113a,CommercialSolarGuy,,1451665108,0,2,bloomberg.com,"Coal Glut, Renewables Make EU Power Cheapest i...",2016,2016-01,business,-0.3,0.6
269,797349,3z1bl8,3xpendableyouth,,1451670298,0,0,nytimes.com,Market Volatility Causes IPO Market to Stall O...,2016,2016-01,business,-0.3,0.8
270,797350,3z1bnf,3xpendableyouth,,1451670319,0,0,usatoday.com,5 things that doomed stocks in 2015,2016,2016-01,business,-0.3,0.8
272,797352,3z1bom,3xpendableyouth,,1451670337,0,0,duluthnewstribune.com,Cargill fires about 180 Somali workers after p...,2016,2016-01,business,0.0,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265639,1062719,5lappa,Brianlife,,1483206186,35,161,bbc.com,French workers get 'right to disconnect' from ...,2016,2016-12,business,-0.3,0.8
265647,1062727,5law9f,barkorut,,1483208456,0,43,csmonitor.com,2017 money resolutions: Small steps now for bi...,2016,2016-12,business,0.0,0.8
265649,1062729,5lawu9,vigorous,,1483208659,0,0,france24.com,Cuba reports banner tourism year in 2016,2016,2016-12,business,0.0,0.8
265663,1062743,5lbou4,stockgenius111,,1483218323,0,1,nytimes.com,"F. Ross Johnson, Symbol of ’80s Corporate Exce...",2016,2016-12,business,-0.3,0.8


In [None]:
# TODO: Include links to the original submissions in the comments DataFrame
# posts = pd.concat([submissions[["author", "id"]].rename(columns={"id": "link_id"}), comments[["author", "link_id"]]])

# Author x submission table: entry (a, s) is the number of comments that
# author a left on submission s
frequencies_df = pd.crosstab(index=comments["author"], columns=comments['link_id'])

# Mean credibility of every author, listed in the row order of the table
credibilities = (
    frequencies_df.merge(author_credibility, on="author", how='left')["credibility"]
    .to_list()
)

frequencies = frequencies_df.to_numpy(dtype=float)

# Entry (a, b) sums, over all submissions, the product of both authors' comment
# counts; it is positive exactly when a and b commented on a common submission.
adjacency_matrix = np.matmul(frequencies, frequencies.T)
np.fill_diagonal(adjacency_matrix, 0)  # no self-loops

# Boolean adjacency: connect two authors when their co-occurrence value is at least 1
adjacency_matrix = adjacency_matrix >= 1

# Build the graph; node i corresponds to row i of the author table
graph = nx.from_numpy_array(adjacency_matrix)

# Attach author names and credibility to the nodes, keyed by row position
author_names = dict(enumerate(frequencies_df.index))
nx.set_node_attributes(graph, author_names, 'author')

credibility = dict(enumerate(credibilities))
nx.set_node_attributes(graph, credibility, 'credibility')

# nx.write_gexf(graph, 'data/business_users.gexf')

In [22]:
# Stack submissions and comments into one (author, link_id) table; submission ids
# are renamed to 'link_id' so both parts share the same two columns
posts = pd.concat(
    [
        submissions[["author", "id"]].rename(columns={"id": "link_id"}),
        comments[["author", "link_id"]],
    ]
)

In [14]:
# Number of connected author pairs in the graph
graph.number_of_edges()

1853